# Across-Source Drug Name Mapping
**Local Version**: 1
**Source Version**: NA

Maps drug ids/names across sources to create a global lookup table.

In [65]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from mgds.data_aggregation import entity
from py_utils import set_utils, assertion_utils
pd.set_option('display.max_info_rows', 50000000)
pd.set_option('display.max_colwidth', 10000)

In [56]:
sources = [src.GDSC_v2, src.NCI60_v2, src.CTD_v2]
data_types = ['drug-sensitivity']
m_id = entity.get_raw_entities(sources, data_types, 'DRUG_')

2016-11-30 15:27:06,666:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "drug-sensitivity"
2016-11-30 15:27:06,785:DEBUG:mgds.data_aggregation.entity: Processing source "nci60_v2", data type "drug-sensitivity"
2016-11-30 15:27:06,929:DEBUG:mgds.data_aggregation.entity: Processing source "ctd_v2", data type "drug-sensitivity"


In [57]:
for k, m in m_id.items():
    print(k, m['drug-sensitivity'].columns)

nci60_v2 Index(['DRUG_NAME', 'DRUG_ID:NSC', 'DRUG_ID:PUBCHEM', 'DATA_TYPE', 'SOURCE'], dtype='object')
gdsc_v2 Index(['DRUG_ID:COSMIC', 'DRUG_NAME', 'DATA_TYPE', 'SOURCE'], dtype='object')
ctd_v2 Index(['DRUG_ID:CTD', 'DRUG_ID:BROAD', 'DRUG_NAME', 'DATA_TYPE', 'SOURCE'], dtype='object')


In [63]:
dt = m_id[src.NCI60_v2]['drug-sensitivity']
dt.head()

#d2 = m_id[src.GDSC_v2]['drug-sensitivity']

Unnamed: 0,DRUG_NAME,DRUG_ID:NSC,DRUG_ID:PUBCHEM,DATA_TYPE,SOURCE
11234,"""ether-20""",676532,,drug-sensitivity,nci60_v2
114122,(+)-.alpha.-viniferin,655524,506835.0,drug-sensitivity,nci60_v2
10823,"(+)-6-bromo-3-bromomethyl-2,3,7-trichloro-7-me...",673502,,drug-sensitivity,nci60_v2
20635,(+)-JQ1,760183,,drug-sensitivity,nci60_v2
17890,(+)-leucascandrolide a,727718,48427463.0,drug-sensitivity,nci60_v2


In [51]:
#dt[dt['DRUG_ID:COSMIC'].isin(ids)].sort_values('DRUG_NAME')
names = dt[dt['DRUG_NAME'].duplicated()]['DRUG_NAME'].unique()
dt[dt['DRUG_NAME'].isin(names)].sort_values('DRUG_NAME')

Unnamed: 0,DRUG_ID:COSMIC,DRUG_NAME,DATA_TYPE,SOURCE


In [37]:
#set_utils.analyze_sets(d1['DRUG_ID:PUBCHEM'].dropna().astype(np.int64).unique(), d2['DRUG_ID:COSMIC'].unique())

In [87]:
import re
regex = re.compile('[\W_]+')

def clean(x):
    return regex.sub('', x).upper().strip()

def to_mgds_id(source, v):
    fn = None
    if pd.isnull(v):
        return None
    
    # NCI-60 Special Cases
    if source == src.NCI60_v2:
        if v == '(+)-6-bromo-3-bromomethyl-2,3,7-trichloro-7-methyl-1-octene':
            return '6BROMO3BROMOMETHYL237TRICHLORO7METHYL1OCTENEPLUS'
        if v == '(-)-6-bromo-3-bromomethyl-2,3,7-trichloro-7-methyl-1-octene':
            return '6BROMO3BROMOMETHYL237TRICHLORO7METHYL1OCTENEMINUS'
        if v == '(-)-avarol':
            return 'MINUSAVAROL'
        if v == 'avarol':
            return 'AVAROL'
        if v == 'crassin acetate':
            return 'CRASSINACETATE1'
        if v == 'crassin, acetate':
            return 'CRASSINACETATE2'
        if v == 'd.b.t.c.':
            return 'DBTC1'
        if v == 'dbtc':
            return 'DBTC2'
        if v == 'imidazole-5-one deri':
            return 'IMIDAZOLE5ONEDERI1'
        if v == 'imidazole-5-one deri.':
            return 'IMIDAZOLE5ONEDERI2'
        if v == '(+)-leucascandrolide a':
            return 'LEUCASCANDROLIDEAPLUS'
        if v == '(-)-leucascandrolide a':
            return 'LEUCASCANDROLIDEAMINUS'
        if v == '(z) 4-acetoxy-(3,4\',5)-trimethoxystilbene':
            return 'Z4ACETOXY345TRIMETHOXYSTILBENE1'
        if v == '	(z) 4-acetoxy-3\',4\',5\'-trimethoxystilbene':
            return 'Z4ACETOXY345TRIMETHOXYSTILBENE2'
        return clean(v)
    
    # GDSC/COSMIC Special Cases
    if source == src.GDSC_v2:
        return clean(v)
    
    # CCLE Special Cases
    if source == src.CTD_v2:
        return clean(v)
    
    if source == src.NCIDREAM_v1:
        return clean(v)

    raise ValueError('Cell line normalization for source "{}" has not been implemented yet'.format(source))

def to_mgds_ids(source, ids):
    return [to_mgds_id(source, v) for v in ids]


def aggregate(m_id):
    r = []
    for source in m_id.keys():
        for data_type, d in m_id[source].items():
            c_cl = d.filter(regex='DRUG_').columns.tolist()
            d_id = d.copy().assign(MGDS_ID=to_mgds_ids(source, d['DRUG_NAME']))
            for c in c_cl:
                taxonomy = c.split(':')[1] if ':' in c else 'COMMON'
                d_pt = d_id.assign(TAXONOMY=taxonomy)[['MGDS_ID', 'DATA_TYPE', 'SOURCE', 'TAXONOMY', c]]
                d_pt = d_pt.rename(columns={c: 'DRUG_ID'})
                r.append(d_pt)
    return pd.concat(r).reset_index(drop=True)

In [88]:
d_id = aggregate(m_id)

In [89]:
d_id.head()

Unnamed: 0,MGDS_ID,DATA_TYPE,SOURCE,TAXONOMY,DRUG_ID
0,ETHER20,drug-sensitivity,nci60_v2,COMMON,"""ether-20"""
1,ALPHAVINIFERIN,drug-sensitivity,nci60_v2,COMMON,(+)-.alpha.-viniferin
2,6BROMO3BROMOMETHYL237TRICHLORO7METHYL1OCTENEPLUS,drug-sensitivity,nci60_v2,COMMON,"(+)-6-bromo-3-bromomethyl-2,3,7-trichloro-7-methyl-1-octene"
3,JQ1,drug-sensitivity,nci60_v2,COMMON,(+)-JQ1
4,LEUCASCANDROLIDEAPLUS,drug-sensitivity,nci60_v2,COMMON,(+)-leucascandrolide a


In [90]:
# This should return no results when there are no duplicates
cts = d_id.groupby(['SOURCE', 'DATA_TYPE', 'TAXONOMY', 'MGDS_ID']).size()
print(cts.value_counts())
cts[cts > 1]

1    16174
dtype: int64


Series([], dtype: int64)

In [91]:
d_id[(d_id['SOURCE'] == src.NCI60_v2) & (d_id['MGDS_ID'] == 'Z4ACETOXY345TRIMETHOXYSTILBENE')]

Unnamed: 0,MGDS_ID,DATA_TYPE,SOURCE,TAXONOMY,DRUG_ID
127,Z4ACETOXY345TRIMETHOXYSTILBENE,drug-sensitivity,nci60_v2,COMMON,"(z) 4-acetoxy-3',4',5'-trimethoxystilbene"
4806,Z4ACETOXY345TRIMETHOXYSTILBENE,drug-sensitivity,nci60_v2,NSC,638497
9485,Z4ACETOXY345TRIMETHOXYSTILBENE,drug-sensitivity,nci60_v2,PUBCHEM,499339


In [92]:
def singlestr(x):
    assert len(x) == 1
    return x.iloc[0]
d_id_m = d_id.pivot_table(index='MGDS_ID', columns=['TAXONOMY', 'SOURCE', 'DATA_TYPE'], values='DRUG_ID', aggfunc=singlestr)
d_id_m.head()

TAXONOMY,BROAD,COMMON,COMMON,COMMON,COSMIC,CTD,NSC,PUBCHEM
SOURCE,ctd_v2,ctd_v2,gdsc_v2,nci60_v2,gdsc_v2,ctd_v2,nci60_v2,nci60_v2
DATA_TYPE,drug-sensitivity,drug-sensitivity,drug-sensitivity,drug-sensitivity,drug-sensitivity,drug-sensitivity,drug-sensitivity,drug-sensitivity
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
010200,,,,010200,,,529108,482905.0
1011METHYLENEDIOXY20RSCAMPTOTHECIN,,,,"10,11-methylenedioxy-20(rs)-camptothecin",,,606174,580568.0
1011METHYLENEDIOXYCAMPTOTHECIN,,,,"10,11-methylenedioxycamptothecin",,,634724,497235.0
1024DICHLOROPHENYL3METHYLFLAVIN,,,,"10-(2',4'-dichlorophenyl)-3-methylflavin",,,625537,492471.0
106UBIQUINOLYLDECYLTRIPHENYLPHOSPHONIUM,,,,10-(6'-ubiquinolyl)decyltriphenylphosphonium,,,745025,91147600.0


In [98]:
d_id_m['COMMON'].sort_index().head(250).tail(51)

SOURCE,ctd_v2,gdsc_v2,nci60_v2
DATA_TYPE,drug-sensitivity,drug-sensitivity,drug-sensitivity
MGDS_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
18NAPHTHYRIDIN41HONE23CHLOROPHENYL6METHYL,,,"1,8-naphthyridin-4(1h)-one, 2-(3-chlorophenyl)-6-methyl-"
18NAPHTHYRIDIN41HONE23CHLOROPHENYL7METHYL,,,"1,8-naphthyridin-4(1h)-one, 2-(3-chlorophenyl)-7-methyl-"
18NAPHTHYRIDIN41HONE23METHOXYPHENYL7METHYL,,,"1,8-naphthyridin-4(1h)-one, 2-(3-methoxyphenyl)-7-methyl-"
18NAPHTHYRIDIN41HONE24FLUOROPHENYL5METHYL,,,"1,8-naphthyridin-4(1h)-one, 2-(4-fluorophenyl)-5-methyl-"
18NAPHTHYRIDIN41HONE24FLUOROPHENYL6METHYL,,,"1,8-naphthyridin-4(1h)-one, 2-(4-fluorophenyl)-6-methyl-"
18NAPHTHYRIDIN41HONE2PHENYL,,,"1,8-naphthyridin-4(1h)-one, 2-phenyl-"
18NAPHTHYRIDIN41HONE6METHYL2PHENYL,,,"1,8-naphthyridin-4(1h)-one, 6-methyl-2-phenyl-"
18OCTANEDIAMINENNDI9ACRIDINYL9CI,,,"1,8-octanediamine, n,n'-di-9-acridinyl- (9ci)"
19FORMYLGELDANAMYCINNPIPERIDINOIMINE,,,19-formylgeldanamycin n-piperidinoimine
19FORMYLGELDANAMYCINTERTBUTYLIMINE,,,19-formylgeldanamycin tert-butylimine
