In [1]:
import os
import re

import pandas as pd
import numpy as np

import mgitools.os_helpers as os_helpers

## metadata reformatting

In [90]:
metadata_df = pd.read_csv('../data/metadata/PV_CyTOF_metadata.csv', sep=',')

print(metadata_df.shape)
metadata_df.head()

(50, 13)


Unnamed: 0,EXP_DATE,CONDITION_LIST,AGE_YEARS,JAK2_BURDEN_PERCENTAGE,SEX,SAMPLE_TYPE,SAMPLE_NAME,CD16_FAIL,TNF_FAIL_OR_ABSENT,TPO_FAIL_OR_ABSENT,DUPLICATE_PATIENT_SAMPLETYPE,SAME_COLLECTION_SAMPLETYPE_PAIRS,TRANSFORMATION_PAIRS
0,160113,"basal, TNF",47,0.0,F,NBM,N23 BM,NO,NO,YES,,,
1,160113,"basal, TNF",43,39.9,M,PVPB,638517 PB,NO,NO,YES,,10.0,
2,160113,"basal, TNF",68,92.24,M,PVPB,673286 PB,NO,NO,YES,,,
3,160727,"basal, TPO, TNF, RUX, TPO+RUX",unknown,0.0,unknown,NPB,LRS1 NPB,NO,NO,NO,,,
4,160727,"basal, TPO, TNF, RUX, TPO+RUX",30,0.0,F,NBM,N24 BM,NO,NO,NO,,,


make sample ids where the id is in the format samplename_sampletype_expdate_condition

In [91]:
np.asarray(data)

array([[160113, 'basal, TNF', '47', ..., nan, nan, nan],
       [160113, 'basal, TNF', '47', ..., nan, nan, nan],
       [160113, 'basal, TNF', '43', ..., nan, 10.0, nan],
       ...,
       [190726, 'basal, TPO, TNF', '41', ..., nan, nan, nan],
       [190726, 'basal, TPO, TNF', '41', ..., nan, nan, nan],
       [190726, 'basal, TPO, TNF', '41', ..., nan, nan, nan]],
      dtype=object)

In [92]:
metadata_df['SAMPLE_NAME'] = [x.replace(' ', '-').replace('.', '-').replace('_', '-')
            for x in metadata_df['SAMPLE_NAME']]

data = []
sample_ids = []
for i, row in metadata_df.iterrows():
#     row.values
    conditions = row['CONDITION_LIST']
    for condition in re.split(r',|, *', conditions.strip()):
        condition = condition.strip()
        data.append(row.values)
        sample_name, sample_type, exp_date = (row['SAMPLE_NAME'], row['SAMPLE_TYPE'], row['EXP_DATE'])
        sample_ids.append(f'{sample_name}_{sample_type}_{exp_date}_{condition}'.replace(' ', '-'))
        
data = np.asarray(data)
data[data == 'unknown'] = np.nan

df = pd.DataFrame(data=np.asarray(data), index=sample_ids, columns=metadata_df.columns)
df.columns = [c if c != 'CONDITION_LIST' else 'CONDITION' for c in df.columns]
df['CONDITION'] = [x.split('_')[-1] for x in df.index]
df['EXP_DATE'] = [str(x) for x in df['EXP_DATE']]

# ## normalize sample names
# sample_name_to_possibles = {x.replace(' ', '-').replace('.', '-').replace('_', '-'):set() for x in df['SAMPLE_NAME']}
# for sample_name in df['SAMPLE_NAME']:
#     orig = sample_name
#     sample_name = sample_name.replace(' ', '-').replace('.', '-').replace('_', '-')
    
#     possibles = [orig]
#     possibles.append(sample_name.replace('-', ' '))
#     possibles.append(sample_name.replace('-', '_'))
    
#     ## also shuffle
#     pieces = sample_name.split('-')
#     possibles.append(' '.join(pieces[-1:] + pieces[:-1]))
#     possibles.append('-'.join(pieces[-1:] + pieces[:-1]))
#     possibles.append('_'.join(pieces[-1:] + pieces[:-1]))
    
    
#     sample_name_to_possibles[sample_name].update(possibles)

df.head()

Unnamed: 0,EXP_DATE,CONDITION,AGE_YEARS,JAK2_BURDEN_PERCENTAGE,SEX,SAMPLE_TYPE,SAMPLE_NAME,CD16_FAIL,TNF_FAIL_OR_ABSENT,TPO_FAIL_OR_ABSENT,DUPLICATE_PATIENT_SAMPLETYPE,SAME_COLLECTION_SAMPLETYPE_PAIRS,TRANSFORMATION_PAIRS
N23-BM_NBM_160113_basal,160113,basal,47,0.0,F,NBM,N23-BM,NO,NO,YES,,,
N23-BM_NBM_160113_TNF,160113,TNF,47,0.0,F,NBM,N23-BM,NO,NO,YES,,,
638517-PB_PVPB_160113_basal,160113,basal,43,39.9,M,PVPB,638517-PB,NO,NO,YES,,10.0,
638517-PB_PVPB_160113_TNF,160113,TNF,43,39.9,M,PVPB,638517-PB,NO,NO,YES,,10.0,
673286-PB_PVPB_160113_basal,160113,basal,68,92.24,M,PVPB,673286-PB,NO,NO,YES,,,


In [73]:
# sample_name_to_possibles

## map to cytof .fcs files

In [93]:
fdc_fps = sorted(os_helpers.listfiles('/Users/estorrs/Documents/steveoh/lineage_analysis/data/cytof/', regex='.fcs'))
len(fdc_fps), fdc_fps[:5]

(10,
 ['/Users/estorrs/Documents/steveoh/lineage_analysis/data/cytof/190621 JF Oh Human Myeloid N39-PB TNF.fcs',
  '/Users/estorrs/Documents/steveoh/lineage_analysis/data/cytof/190712 JF Oh Human Myeloid_N37-PB basal.fcs',
  '/Users/estorrs/Documents/steveoh/lineage_analysis/data/cytof/190712 JF Oh Human Myeloid_N38-PB basal.fcs',
  '/Users/estorrs/Documents/steveoh/lineage_analysis/data/cytof/190726 JF Oh Human Myeloid_336362-PB TPO.fcs',
  '/Users/estorrs/Documents/steveoh/lineage_analysis/data/cytof/190726 JF Oh Human Myeloid_646192-BM TNF.fcs'])

In [94]:
def get_fp_data(fcs_fp):
    """returns a tup with the following positions: exp_date, condition, sample_name, condition"""
    fp = fcs_fp.split('/')[-1]
    
    exp_date = re.sub(r'^([0-9]+).*$', r'\1', fp)
    condition = re.sub(r'^.*[ |_](.*)\.fcs', r'\1', fp)
    sample_name = re.sub(r'^.*Human Myeloid[ |_](.+) .*$', r'\1', fp)
    
    return exp_date, condition, sample_name

In [95]:
tup_to_filepath = {}
for fp in fdc_fps:
    exp_date, condition, sample_name = get_fp_data(fp)
    tup_to_filepath[(exp_date, condition, sample_name)] = fp
#     print()
#     print((exp_date, condition, sample_name))

## add fp column to metadata
filepaths = []
for i, row in df.iterrows():
#     print((row['EXP_DATE'], row['CONDITION'], row['SAMPLE_NAME']))
#     print(tup_to_filepath.get((row['EXP_DATE'], row['CONDITION'], row['SAMPLE_NAME']), np.nan))
    filepaths.append(tup_to_filepath.get((row['EXP_DATE'], row['CONDITION'], row['SAMPLE_NAME']), np.nan))

df['filepath'] = filepaths
        
df

Unnamed: 0,EXP_DATE,CONDITION,AGE_YEARS,JAK2_BURDEN_PERCENTAGE,SEX,SAMPLE_TYPE,SAMPLE_NAME,CD16_FAIL,TNF_FAIL_OR_ABSENT,TPO_FAIL_OR_ABSENT,DUPLICATE_PATIENT_SAMPLETYPE,SAME_COLLECTION_SAMPLETYPE_PAIRS,TRANSFORMATION_PAIRS,filepath
N23-BM_NBM_160113_basal,160113,basal,47,0,F,NBM,N23-BM,NO,NO,YES,,,,
N23-BM_NBM_160113_TNF,160113,TNF,47,0,F,NBM,N23-BM,NO,NO,YES,,,,
638517-PB_PVPB_160113_basal,160113,basal,43,39.9,M,PVPB,638517-PB,NO,NO,YES,,10,,
638517-PB_PVPB_160113_TNF,160113,TNF,43,39.9,M,PVPB,638517-PB,NO,NO,YES,,10,,
673286-PB_PVPB_160113_basal,160113,basal,68,92.24,M,PVPB,673286-PB,NO,NO,YES,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646192-BM_PVBM_190726_TPO,190726,TPO,52,42,M,PVBM,646192-BM,NO,NO,NO,,,,/Users/estorrs/Documents/steveoh/lineage_analy...
646192-BM_PVBM_190726_TNF,190726,TNF,52,42,M,PVBM,646192-BM,NO,NO,NO,,,,/Users/estorrs/Documents/steveoh/lineage_analy...
N35-BM_NBM_190726_basal,190726,basal,41,0,F,NBM,N35-BM,NO,NO,NO,,,,/Users/estorrs/Documents/steveoh/lineage_analy...
N35-BM_NBM_190726_TPO,190726,TPO,41,0,F,NBM,N35-BM,NO,NO,NO,,,,/Users/estorrs/Documents/steveoh/lineage_analy...


In [96]:
df.to_csv('../data/metadata/metadata_reformatted.tsv', sep='\t', index=True, header=True)