# Deprecated

It turned out to be much more time-consuming to parse filenames (created by various people) and connect them to a single database of entries.
The new strategy is to keep metadata per folder in `sample_meta.csv` and to update or create this file whenever new samples are added to a folder.

In [28]:
import glob
import os
import csv
from pprint import pprint

In [29]:
import pathlib

In [30]:
# All paths are built relative to the parent of the current working directory;
# the original FCS files are searched for below `fcs_dir`.
root = pathlib.Path('..')
fcs_dir = root.joinpath('input_data', 'original')

In [31]:
# One record per .fcs file found (recursively) below fcs_dir.
# Biosource is inferred from the directory path only: a file whose parent path
# contains 'SCS' (gut) is labelled 'SCS', everything else is assumed to be PBMC.
sm = [
    {
        'path': fcs_path.relative_to(root),
        'filename': fcs_path.stem,
        'biosource': 'SCS' if 'SCS' in str(fcs_path.parent) else 'PBMC',
    }
    for fcs_path in fcs_dir.glob('**/*.fcs')
]

## For now, some fcs files without match are dropped

The flu samples are not yet in Asbjorn's sample list and are not critical to the current workflow.
Therefore, instead of spending time on adding them now, I will wait until Asbjorn has added them (or until I have done what I should for the paper).

An example of such a sample is `../input_data/original/Autoimmune control and Flu CD4/Autoimmune and control/BBS_1189 All CD4.fcs`.

In [32]:
import regex
# Category label used for phenotype samples.
auto_phenotype = 'AutoPhenotype'

# Status letter captured from a filename -> disease status label.
ced_status = {'U': 'untreated', 'T': 'GFD', 'H': 'control', 'C': 'Challenge'}

# Lower-cased sample-category token captured from a filename -> category label.
sample_category = {
    'pre': 'full',
    'pos': 'tetramer+',
    'neg': 'tetramer-',
    'all': 'full',
    'phenotype': auto_phenotype,
}

# Filename patterns keyed by pattern name. Insertion order matters: the
# patterns are tried in this order and the first one that matches wins, so
# the more specific 'ctrl_date' has to come before the general 'ctrl'.
pat = {
    'cdonor': regex.compile(
        r'^(?<CeD_status>.?)(?<donor>CD\d+).*(?<sample_cat>PRE|POS|NEG)',
        regex.IGNORECASE,
    ),
    'ctrl_date': regex.compile(r'^(?<date>\d+) (?<donor>BC11)'),
    'ctrl': regex.compile(r'^(?<donor>BC\d+)'),
    'flu': regex.compile(
        r'^(?<donor>[^ ]+) Flu.*(?<sample_cat>Phenotype|All)',
        regex.IGNORECASE,
    ),
}

# Sanity check: a known coeliac-donor filename must match the 'cdonor' pattern.
assert pat['cdonor'].match('UCD1423P.A Tetramer POS.fcs')

def find_match(patterns, filename):
    """Return the first pattern that matches ``filename``.

    Args:
        patterns: Mapping of pattern name to a compiled pattern (anything
            with a ``search`` method); tried in iteration order.
        filename: The string to search.

    Returns:
        A ``(pattern_name, match)`` tuple for the first pattern whose
        ``search`` does not return ``None``.

    Raises:
        LookupError: If no pattern matches; the filename is the argument.
    """
    for label, compiled in patterns.items():
        hit = compiled.search(filename)
        if hit is None:
            continue
        return (label, hit)
    raise LookupError(filename)
        
def generate_entry(p, m):
    """Build a metadata entry from a filename that matched a known pattern.

    Args:
        p: Name of the pattern that matched: 'cdonor', 'ctrl', 'ctrl_date'
            or 'flu'.
        m: Match object for that pattern; only ``m.groupdict()`` is used.

    Returns:
        A new dict. It starts from the defaults ``date='Not specified'`` and
        ``category=sample_category['pre']``; the fields parsed for the given
        pattern are then added and override the defaults.

    Raises:
        ValueError: If ``p`` is not one of the known pattern names
            (previously this surfaced as an UnboundLocalError).
        KeyError: If the captured status letter or sample category has no
            entry in ``ced_status`` / ``sample_category``.
    """
    def parse_cdonor(d):
        # The status lookup is case-sensitive and has no entry for an empty
        # capture, so an unexpected prefix raises KeyError.
        return dict(
            ced=ced_status[d['CeD_status']],
            donor=d['donor'],
            category=sample_category[d['sample_cat'].lower()],
        )

    def parse_flu(d):
        return dict(
            ced='Flu',
            donor=d['donor'],
            category=sample_category[d['sample_cat'].lower()],
        )

    def parse_ctrl(d):
        return dict(
            ced='control',
            donor=d['donor'],
        )

    def parse_date_ctrl(d):
        return dict(
            ced='control',
            donor=d['donor'],
            date=d['date'],
            extra_info='sample is PBMC date ctrl',
        )

    # Pattern name -> parser for that pattern's captured groups.
    parsers = {
        'cdonor': parse_cdonor,
        'ctrl': parse_ctrl,
        'ctrl_date': parse_date_ctrl,
        'flu': parse_flu,
    }

    entry = dict(
        date='Not specified',
        category=sample_category['pre'],
    )
    try:
        parse = parsers[p]
    except KeyError:
        raise ValueError(f'Unknown pattern name: {p!r}') from None
    entry.update(parse(m.groupdict()))
    return entry

def parse_filenames(xs):
    """Yield one metadata dict per input entry whose filename can be parsed.

    Each entry's 'filename' is first tried against the regex patterns in
    ``pat``. If none matches, a small table of hand-written entries is
    consulted; if the filename is not there either, a message is printed and
    the entry is skipped.

    Args:
        xs: Iterable of dicts that each have at least a 'filename' key.

    Yields:
        A dict holding the parsed (or hand-written) fields updated with all
        fields of the input entry, so values already present on the input
        entry win. Entries flagged as PBMC date controls get their
        'biosource' forced to 'PBMC'.
    """
    # (filename stem, ced, category, donor) for files no pattern handles.
    manual_rows = (
        ('CD1222 DM type1 All CD4', 'T1DB', 'full', 'CD1222'),
        ('CD1222 DM type1 CD Phenotype', 'T1DB', auto_phenotype, 'CD1222'),
        ('BBS_3 All CD4', 'SSC', 'full', 'BBS_3'),
        ('BBS_3 CD4 Phenotype', 'SSC', auto_phenotype, 'BBS_3'),
        ('BBS_1189 All CD4', 'SLE', 'full', 'BBS_1189'),
        ('Control_BCN11 All CD4', 'control', 'full', 'BC11'),
    )
    extra_matches = {
        stem: {
            'ced': ced,
            'category': category,
            'donor': donor,
            'date': 'not specified',
            'biosource': 'PBMC',
        }
        for stem, ced, category, donor in manual_rows
    }

    for entry in xs:
        stem = entry['filename']
        try:
            pattern_name, match = find_match(pat, stem)
        except LookupError:
            known = extra_matches.get(stem)
            if known is None:
                print('Unable to parse:', stem)
                continue
            # Copy so the shared table entry is never mutated.
            record = dict(known)
        else:
            record = generate_entry(pattern_name, match)

        record.update(entry)
        if record.get('extra_info', False) == 'sample is PBMC date ctrl':
            record['biosource'] = 'PBMC'
        yield record

Parsing the filenames should not produce any "Unable to parse" messages. If it does, check the affected entries.

In [33]:
# Materialise the generator so the parsed entries can be counted and reused.
parsed_filenames = [*parse_filenames(sm)]
#print(parsed_filenames[0])
# for x in [x['filename'] for x in parsed_filenames if 'CD1535' == x['donor']]:
#    print(x)

In [34]:
# Load the per-folder sample annotations as a list of row dicts.
# newline='' lets the csv module do its own newline handling, which is what
# its documentation requires for quoted fields containing line breaks.
with open('../input_data/original/sample_meta.csv', newline='') as f:
    sample_metas = list(csv.DictReader(f))

In [35]:
from copy import copy
class NoMatchError(LookupError):
    """Raised when no sample_meta row matches a parsed filename."""
class MultipleMatchesError(LookupError):
    """Raised when several sample_meta rows match and none can be singled out."""


def find_match(parsed_filename, sample_meta):
    """Return a copy of the single sample_meta row matching a parsed filename.

    A row is a candidate when the parsed 'donor' occurs as a substring of the
    row's 'sample' and the 'biosource' values are equal. If more than one
    candidate remains, the row's 'Disease Status' is compared with the parsed
    'ced' value and used as a tie-breaker when exactly one row agrees.

    NOTE(review): this rebinds the module-level name ``find_match`` that
    ``parse_filenames`` also calls with ``(patterns, filename)`` arguments;
    running ``parse_filenames`` after this definition would call this
    function instead. Consider renaming one of the two.

    Args:
        parsed_filename: Dict with 'donor', 'biosource', 'ced' and 'filename'.
        sample_meta: Iterable of row dicts with 'sample', 'biosource' and
            'Disease Status'.

    Returns:
        A shallow copy of the matching row.

    Raises:
        NoMatchError: If no row matches donor and biosource.
        MultipleMatchesError: If several rows match and the disease status
            does not single out exactly one.
    """
    donor_xs = [row for row in sample_meta
                if parsed_filename['donor'] in row['sample']]
    bio_xs = [row for row in donor_xs
              if parsed_filename['biosource'] == row['biosource']]

    if not bio_xs:
        print('No match for {donor} | {biosource} | {filename}'.format(**parsed_filename))
        print('DONOR_XS:')
        print(donor_xs)
        print('BIO_XS:')
        pprint(bio_xs)
        print()
        raise NoMatchError(f'No match for {parsed_filename}')

    candidates = bio_xs
    if len(candidates) > 1:
        same_status = [row for row in candidates
                       if parsed_filename['ced'] == row['Disease Status']]
        if len(same_status) != 1:
            print(f'Multiple matches for {parsed_filename}')
            pprint(candidates)
            print()
            raise MultipleMatchesError(f'Multiple matches for {parsed_filename}')
        candidates = same_status

    assert len(candidates) == 1
    return copy(candidates[0])

def forgivable_matching(pfs, sm):
    """Match parsed filenames to sample_meta rows, skipping the ones that fail.

    Args:
        pfs: Iterable of parsed-filename dicts with at least 'filename',
            'path', 'donor' and 'category', plus whatever ``find_match``
            reads.
        sm: The sample_meta rows passed through to ``find_match``.

    Returns:
        A list with one dict per successfully matched entry: the copy of the
        sample_meta row returned by ``find_match``, extended with 'filename',
        'path', 'donor' and 'sample_category' taken from the parsed filename.
        Entries with no match or with an ambiguous match are reported on
        stdout and left out.
    """
    xs = []
    for x in pfs:
        try:
            y = find_match(x, sm)
        # A tuple is required to catch both types: the previous
        # `NoMatchError or MultipleMatchesError` evaluated to NoMatchError
        # only, so an ambiguous match aborted the whole run.
        except (NoMatchError, MultipleMatchesError) as e:
            print('Error handling', e)
            print()
            continue
        y['filename'] = x['filename']
        y['path'] = x['path']
        y['donor'] = x['donor']
        y['sample_category'] = x['category']
        xs.append(y)
    return xs
        
    
# Attach file information to every sample_meta row that could be matched.
samples_with_filenames = forgivable_matching(pfs=parsed_filenames, sm=sample_metas)

In [36]:
# Spot check: show every matched entry for donor CD1535.
[row for row in samples_with_filenames if row['donor'] == 'CD1535']

[OrderedDict([('sample', 'CCD1535'),
              ('biosource', 'PBMC'),
              ('Disease', 'Ced'),
              ('Disease Status', 'Challenge'),
              ('Instrument', 'Helios Davis'),
              ('Experimet Date', ''),
              ('Note', '{“day”: 6}'),
              ('filename', 'CCD1535_day6_tetramer_POS'),
              ('path',
               PosixPath('input_data/original/PBMC/gluten_challenge_CD1535_CD4/CCD1535_day6_tetramer_POS.fcs')),
              ('donor', 'CD1535'),
              ('sample_category', 'tetramer+')]),
 OrderedDict([('sample', 'TCD1535'),
              ('biosource', 'PBMC'),
              ('Disease', 'Ced'),
              ('Disease Status', 'GFD'),
              ('Instrument', 'Helios Davis'),
              ('Experimet Date', ''),
              ('Note', ''),
              ('filename', 'TCD1535_tetramer_NEG'),
              ('path',
               PosixPath('input_data/original/PBMC/gluten_challenge_CD1535_CD4/TCD1535_tetramer_NEG.fcs')),
 

In [37]:
# Summary: number of parsed files, annotated samples, and successful matches.
print(
    f"We got {len(parsed_filenames)} fcs files and {len(sample_metas)} samples annotated in DB. "
    f"We have {len(samples_with_filenames)} matches."
)

We got 91 fcs files and 90 samples annotated in DB. We have 91 matches.


# Create new sample_db that includes file names

Possibly create copies with sane filenames, but don't bother if this works

In [38]:
# Write the matched samples to a new CSV; the columns are taken from the keys
# of the first entry.
# newline='' stops the csv writer's own line terminator from being translated
# again by the text layer (which yields blank lines between rows on Windows).
# NOTE: assumes at least one match; with an empty list the [0] lookup raises
# IndexError after the output file has already been truncated.
with open('../input_data/renamed/fcs_matched_samples.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, samples_with_filenames[0].keys())
    writer.writeheader()
    writer.writerows(samples_with_filenames)
    