In [110]:
import pandas as pd

In [105]:
def get_annotate_map(annotate_path):
    """Build a synonym -> cell line lookup from the annotation database.

    The file at ``annotate_path`` is read as a tab-separated table. Its
    ``synonyms`` column holds ``;``-separated synonym lists and its
    ``cell line`` column holds the name each synonym is mapped to.

    Parameters
    ----------
    annotate_path : str or path-like
        Location of the tab-separated annotation database.

    Returns
    -------
    dict
        Maps every whitespace-stripped, non-empty synonym to the
        whitespace-stripped ``cell line`` value of its row. If a synonym
        appears in several rows, the last row wins.
    """
    database = pd.read_csv(annotate_path, sep='\t')
    # Empty fields are read as NaN (a float); such rows carry no usable
    # mapping and would otherwise break the string handling below.
    annotated = database[['cell line', 'synonyms']].dropna()
    annote_map = {}
    for cell_line, synonyms in zip(annotated['cell line'], annotated['synonyms']):
        canonical = str(cell_line).strip()
        for synonym in str(synonyms).split(';'):
            synonym = synonym.strip()
            # A trailing or doubled ';' produces an empty token; registering it
            # would make a blank cell resolve to an arbitrary cell line.
            if synonym:
                annote_map[synonym] = canonical
    return annote_map

def reannote_cell_sdrf(sdrf_path,annotate_path):
    """Replace the cell line names of an SDRF table using the annotation database.

    The lookup returned by ``get_annotate_map(annotate_path)`` is applied to
    the ``characteristics[cell line]`` column and, when present, to the
    ``factor value[cell line]`` column.

    Parameters
    ----------
    sdrf_path : str or path-like
        Tab-separated SDRF file to re-annotate.
    annotate_path : str or path-like
        Tab-separated annotation database passed to ``get_annotate_map``.

    Returns
    -------
    pandas.DataFrame
        The SDRF table with the cell line column(s) replaced by the mapped
        names. Nothing is written back to disk.

    Raises
    ------
    KeyError
        If the SDRF has no ``characteristics[cell line]`` column, or if a
        cell value is not a key of the annotation lookup.
    """
    annote_map = get_annotate_map(annotate_path)

    def check_cell_line(cell):
        # The lookup keys are whitespace-stripped, so strip the query as well;
        # non-string values (e.g. NaN) are looked up unchanged.
        key = cell.strip() if isinstance(cell, str) else cell
        if key not in annote_map:
            raise KeyError(f'{cell} not in annoted database')
        return annote_map[key]

    sdrf = pd.read_csv(sdrf_path, sep='\t')
    if 'characteristics[cell line]' not in sdrf.columns:
        # The original raised the undefined name `Error`, which surfaced as a
        # NameError instead of reporting the missing column.
        raise KeyError('characteristics[cell line] not in sdrf columns')
    sdrf.loc[:, 'characteristics[cell line]'] = sdrf['characteristics[cell line]'].apply(check_cell_line)
    if 'factor value[cell line]' in sdrf.columns:
        sdrf.loc[:, 'factor value[cell line]'] = sdrf['factor value[cell line]'].apply(check_cell_line)
    return sdrf

def get_unannotate_cell_lines(annotate_path,sdrf_path,column='characteristics[organism part]'):
    """Return the SDRF cell names that have no synonym in the annotation database.

    Parameters
    ----------
    annotate_path : str or path-like
        Tab-separated annotation database with a ``synonyms`` column of
        ``;``-separated names.
    sdrf_path : str or path-like
        Tab-separated SDRF file.
    column : str, optional
        SDRF column whose values are checked. Defaults to
        ``'characteristics[organism part]'``, the column this function has
        always read; pass e.g. ``'characteristics[cell line]'`` for SDRF
        files that store the cell line there.

    Returns
    -------
    set of str
        Whitespace-stripped values of ``column`` that do not occur among the
        whitespace-stripped synonyms.
    """
    database = pd.read_csv(annotate_path, sep='\t')
    # Empty synonym fields are read as NaN (a float) and cannot be split or
    # joined as strings, so drop them first. Empty tokens left by a trailing
    # or doubled ';' are not real synonyms and are skipped.
    synonyms_set = {
        synonym.strip()
        for entry in database['synonyms'].dropna().astype(str)
        for synonym in entry.split(';')
        if synonym.strip()
    }
    sdrf = pd.read_csv(sdrf_path, sep='\t')
    # Missing cells are NaN as well; they are not names that could be annotated.
    cell_set = {cell.strip() for cell in sdrf[column].dropna().astype(str).unique()}
    return cell_set - synonyms_set


In [None]:
# Re-annotate the cell line column(s) of the SDRF with the names from the annotation database.
sdrf_fixed = reannote_cell_sdrf(
    sdrf_path='PXD026581_TMT10.tsv',
    annotate_path='cl-annotations-db.tsv',
)

In [109]:
# Collect the SDRF cell names that have no synonym in the annotation database.
unannotate_cell_lines = get_unannotate_cell_lines(
    annotate_path='cl-annotations-db.tsv',
    sdrf_path='PXD026581_TMT10.tsv',
)

{'HCC1143',
 'HCC1395',
 'HCC1419',
 'HCC1428',
 'HCC1500',
 'HCC1937',
 'HCC1954',
 'HCC70',
 'Hs578T',
 'MCF10A',
 'MCF10Arep2',
 'Old Bridge',
 'PDX1258',
 'PDXHCI002',
 'hME1'}

Index(['source name', 'characteristics[organism]',
       'characteristics[organism part]', 'characteristics[age]',
       'characteristics[developmental stage]', 'characteristics[cell type]',
       'characteristics[sex]', 'characteristics[disease]',
       'characteristics[biological replicate]', 'characteristics[individual]',
       'characteristics[enrichment process]', 'Material Type', 'assay name',
       'technology type', 'comment[data file]', 'comment[file uri]',
       'comment[technical replicate]', 'comment[fraction identifier]',
       'comment[label]', 'comment[instrument]', 'comment[dissociation method]',
       'comment[collision energy]', 'comment[precursor mass tolerance]',
       'comment[fragment mass tolerance]', 'comment[modification parameters]',
       'comment[modification parameters].1', 'comment[cleavage agent details]',
       'comment[cleavage agent details].1', 'factor value[disease]'],
      dtype='object')