# Across-Source Drug Name Mapping
**Local Version**: 1
**Source Version**: NA

Maps drug IDs and names across sources (GDSC, NCI-60, CTD, NCI-DREAM) to create a global lookup table keyed by a normalized MGDS drug name.

In [30]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from mgds.data_aggregation import entity
from mgds.data_aggregation import data_type as dtyp
from py_utils import set_utils, assertion_utils
# NOTE(review): `pd` is not imported in this cell; presumably it is injected by
# the `ipy_startup` run above — TODO confirm.
# Raise the row threshold below which DataFrame.info() still reports per-column non-null counts.
pd.set_option('display.max_info_rows', 50000000)
# Allow very wide cells so long drug names and SMILES strings are not truncated on display.
pd.set_option('display.max_colwidth', 10000)

## Load All Drug Sensitivity Data

In [31]:
# Sources queried for drug-sensitivity records.
sources = [
    src.GDSC_v2,
    src.NCI60_v2,
    src.CTD_v2,
    src.NCIDREAM_v1,
]
data_types = ['drug-sensitivity']

# Fetch the raw entity columns selected by the 'DRUG_|SMILE' pattern; the result
# is indexed as m_id[source][data_type] in the cells that follow.
m_id = entity.get_raw_entities(sources, data_types, 'DRUG_|SMILE')

2016-12-08 13:15:44,953:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "drug-sensitivity"
2016-12-08 13:15:45,094:DEBUG:mgds.data_aggregation.entity: Processing source "nci60_v2", data type "drug-sensitivity"
2016-12-08 13:15:45,514:DEBUG:mgds.data_aggregation.entity: Processing source "ctd_v2", data type "drug-sensitivity"
2016-12-08 13:15:45,675:DEBUG:mgds.data_aggregation.entity: Processing source "ncidream_v1", data type "drug-sensitivity"


In [32]:
# Show which identifier columns each source provides.
for source_name, frames_by_type in m_id.items():
    sensitivity_frame = frames_by_type['drug-sensitivity']
    print(source_name, sensitivity_frame.columns)

gdsc_v2 Index(['DRUG_ID:COSMIC', 'DRUG_NAME', 'DATA_TYPE', 'SOURCE'], dtype='object')
ctd_v2 Index(['DRUG_ID:CTD', 'SMILES', 'DRUG_NAME', 'DRUG_ID:BROAD', 'DATA_TYPE',
       'SOURCE'],
      dtype='object')
nci60_v2 Index(['DRUG_NAME', 'DRUG_ID:NSC', 'SMILES', 'DRUG_ID:PUBCHEM', 'DATA_TYPE',
       'SOURCE'],
      dtype='object')
ncidream_v1 Index(['DRUG_NAME', 'DATA_TYPE', 'SOURCE'], dtype='object', name='DATA_TYPE')


In [33]:
# Sanity check: list NCI-60 drug names that occur more than once (expected: none).
d_nci60 = m_id[src.NCI60_v2]['drug-sensitivity']
is_repeat = d_nci60['DRUG_NAME'].duplicated()
d_nci60_dupes = d_nci60.loc[is_repeat, 'DRUG_NAME'].unique()
d_nci60_dupes

array([], dtype=object)

In [34]:
# Short scratch alias for the NCI-60 drug table, used for ad hoc inspection below.
# NOTE: `dt` is re-bound to d_id_m['COMMON'] later in this notebook, so cells that
# use `dt` depend on execution order.
dt = m_id[src.NCI60_v2]['drug-sensitivity']
dt.head()

# Disabled: the GDSC counterpart used by the commented-out set comparison below.
#d2 = m_id[src.GDSC_v2]['drug-sensitivity']

Unnamed: 0,DRUG_NAME,DRUG_ID:NSC,SMILES,DRUG_ID:PUBCHEM,DATA_TYPE,SOURCE
18650,"""ether-20""",676532,OC(=O)C1234B567B89%10B%11%12%13B8%14%15B%11%16%17B%12%18%19B59%13B16%18C2%16%19(COCC%20%21%22%23B%24%25%26B%27%28%29B%30%31%32B%27%33%34B%20%24%28B%21%33%35B%30%34%36B%31%37%38B%25%29%32B%22%26%37C%23%35%36%38C(=O)O)B3%14%17B47%10%15,,drug-sensitivity,nci60_v2
156488,(+)-.alpha.-viniferin,655524,Oc1ccc(cc1)C2Oc3cc(O)cc4C5C(Oc6cc(O)cc(C7C(Oc8cc(O)cc(C2c34)c78)c9ccc(O)cc9)c56)c%10ccc(O)cc%10,506835.0,drug-sensitivity,nci60_v2
18080,"(+)-6-bromo-3-bromomethyl-2,3,7-trichloro-7-methyl-1-octene",673502,CC(C)(Cl)C(Br)CCC(Cl)(CBr)C(=C)Cl,,drug-sensitivity,nci60_v2
27435,(+)-leucascandrolide a,727718,CO[C@H]1CC2CC(CC(CC(=O)O[C@H](C[C@@H]3CCC(C)C(C1)O3)\C=C\CC(C)C)O2)OC(=O)\C=C/CCc4coc(\C=C/CNC(=O)OC)n4,48427463.0,drug-sensitivity,nci60_v2
28875,(+)-membrenone b,740825,CCC(=O)O[C@@H]([C@H](C)[C@H]1OC(=C(C)C(=O)[C@@H]1C)CC)[C@@H](C)C(=O)CC,91146000.0,drug-sensitivity,nci60_v2


In [35]:
# Spot-check two related NCI-60 entries by their exact raw names.
dt[dt['DRUG_NAME'].isin(['7-Ethyl-10-hydroxycamptothecin', 'Camptothecin'])]

Unnamed: 0,DRUG_NAME,DRUG_ID:NSC,SMILES,DRUG_ID:PUBCHEM,DATA_TYPE,SOURCE
18087,7-Ethyl-10-hydroxycamptothecin,673596,CCc1c2CN3C(=O)C4=C(C=C3c2nc5ccc(O)cc15)[C@@](O)(CC)C(=O)OC4,515301.0,drug-sensitivity,nci60_v2
5197,Camptothecin,100880,[Na+].CC[C@](O)(C(=O)O)C1=C(CO)C(=O)N2Cc3cc4ccccc4nc3C2=C1,301160.0,drug-sensitivity,nci60_v2


In [36]:
#set_utils.analyze_sets(d1['DRUG_ID:PUBCHEM'].dropna().astype(np.int64).unique(), d2['DRUG_ID:COSMIC'].unique())

## Load Drug Name Mappings

In [37]:
def clean_drug_name(x):
    """Normalize a raw drug name for cross-source matching.

    Passes ``x`` through ``entity.remove_non_alphanum``, then upper-cases the
    result and trims surrounding whitespace.
    """
    alnum_only = entity.remove_non_alphanum(x)
    return alnum_only.upper().strip()

# Load the GDSC synonym table and run both name columns through the same
# cleaner so that synonyms and canonical names are directly comparable.
d_synonym = db.load(src.GDSC_v2, db.IMPORT, 'drug-synonyms')
for name_col in ('SYNONYM', 'DRUG_NAME'):
    d_synonym[name_col] = d_synonym[name_col].apply(clean_drug_name)
d_synonym.head()

Unnamed: 0,SYNONYM,DRUG_NAME
0,11DEOXOJERVINE,CYCLOPAMINE
1,25BENZO13DIOXOL5YL2TERTBUTYL3HIMIDAZOL4YL6METHYLPYRIDINEHYDROCHLORIDEHYDRATE,SB505124
2,3PHENYLN222TRICHLORO18QUINOLINYLAMINOTHIOXOMETHYLAMINOETHYL2PROPENAMIDE,SALUBRINAL
3,4BUTANOYLOXYMETHYLPHENYL2E4E6E8E37DIMETHYL9266TRIMETHYLCYCLOHEX1ENYLNONA2468TETRAENOATE,VNLG124
4,5FU,5FLUOROURACIL


In [38]:
import re

# Matches runs of non-alphanumeric characters (``\W`` plus underscore).
# Written as a raw string so the backslash reaches the regex engine untouched;
# the non-raw form relies on an invalid string escape that Python warns about.
# NOTE(review): `regex` is not referenced elsewhere in the visible notebook.
regex = re.compile(r'[\W_]+')

# Lookup from cleaned synonym to cleaned canonical drug name.
# NOTE(review): if a cleaned synonym occurs more than once, to_dict() keeps only
# one of its DRUG_NAME values — TODO confirm synonyms are unique after cleaning.
m_synonym = d_synonym.set_index('SYNONYM')['DRUG_NAME'].to_dict()

def default_id(v_clean):
    """Resolve a cleaned drug name to its ID via the synonym table.

    Returns the ``m_synonym`` entry for ``v_clean`` when one exists; otherwise
    the cleaned name itself is used as the ID.
    """
    return m_synonym.get(v_clean, v_clean)

# Hand-assigned IDs for raw NCI-60 drug names, matched on the exact raw string.
# Several of these names differ only in punctuation or a (+)/(-) prefix, so they
# get explicit, distinct IDs instead of going through the generic cleaner.
_NCI60_SPECIAL_IDS = {
    '(+)-6-bromo-3-bromomethyl-2,3,7-trichloro-7-methyl-1-octene': '6BROMO3BROMOMETHYL237TRICHLORO7METHYL1OCTENEPLUS',
    '(-)-6-bromo-3-bromomethyl-2,3,7-trichloro-7-methyl-1-octene': '6BROMO3BROMOMETHYL237TRICHLORO7METHYL1OCTENEMINUS',
    '(-)-avarol': 'AVAROLMINUS',
    'avarol': 'AVAROL',
    'crassin acetate': 'CRASSINACETATE1',
    'crassin, acetate': 'CRASSINACETATE2',
    'd.b.t.c.': 'DBTC1',
    'dbtc': 'DBTC2',
    'imidazole-5-one deri': 'IMIDAZOLE5ONEDERI1',
    'imidazole-5-one deri.': 'IMIDAZOLE5ONEDERI2',
    '(+)-leucascandrolide a': 'LEUCASCANDROLIDEAPLUS',
    '(-)-leucascandrolide a': 'LEUCASCANDROLIDEAMINUS',
    '(z) 4-acetoxy-(3,4\',5)-trimethoxystilbene': 'Z4ACETOXY345TRIMETHOXYSTILBENE1',
    '(z) 4-acetoxy-3\',4\',5\'-trimethoxystilbene': 'Z4ACETOXY345TRIMETHOXYSTILBENE2',
}


def to_mgds_id(r):
    """Return the MGDS drug ID for a single record.

    Parameters
    ----------
    r : mapping-like (e.g. a DataFrame row)
        Must support ``r['SOURCE']`` and ``r['DRUG_NAME']``.

    Returns
    -------
    ``None`` when the drug name is null; the hand-assigned ID when the source is
    NCI-60 and the raw name is listed in ``_NCI60_SPECIAL_IDS``; otherwise
    ``default_id(clean_drug_name(name))``.

    Raises
    ------
    ValueError
        If the record's source is not one of the handled sources.
    """
    source = r['SOURCE']
    v = r['DRUG_NAME']

    if pd.isnull(v):
        return None

    v_clean = clean_drug_name(v)

    # NCI-60: exact raw-name overrides take priority over the generic mapping
    if source == src.NCI60_v2:
        if v in _NCI60_SPECIAL_IDS:
            return _NCI60_SPECIAL_IDS[v]
        return default_id(v_clean)

    # GDSC, CTD and NCI-DREAM currently have no special cases
    if source in (src.GDSC_v2, src.CTD_v2, src.NCIDREAM_v1):
        return default_id(v_clean)

    raise ValueError('Drug name normalization for source "{}" has not been implemented yet'.format(source))

def aggregate(m_id):
    """Stack every drug identifier column from every source into one long table.

    ``m_id`` is indexed as ``m_id[source][data_type]`` and each leaf is a frame.
    For each frame, an MGDS ID is computed per row with ``to_mgds_id``, and each
    column whose name contains ``'DRUG_'`` contributes one slice whose values
    land in a common ``DRUG_NAME`` column. ``TAXONOMY`` is the part of the column
    name after ``':'``, or ``'COMMON'`` when there is no ``':'``.

    Returns the concatenated slices with columns ``DRUG_NAME:MGDS``, ``SOURCE``,
    ``TAXONOMY`` and ``DRUG_NAME``. Asserts that every row's ``DATA_TYPE`` equals
    ``dtyp.DRUG_SENSITIVITY`` before dropping that column.
    """
    pieces = []
    for _source, frames_by_type in m_id.items():
        for _data_type, frame in frames_by_type.items():
            # Capture identifier columns before the MGDS column is added, so
            # the derived column is not itself treated as an identifier.
            id_columns = frame.filter(regex='DRUG_').columns.tolist()

            with_mgds = frame.copy()
            with_mgds['DRUG_NAME:MGDS'] = with_mgds.apply(to_mgds_id, axis=1)

            for col in id_columns:
                if ':' in col:
                    taxonomy = col.split(':')[1]
                else:
                    taxonomy = 'COMMON'
                keep = ['DRUG_NAME:MGDS', 'DATA_TYPE', 'SOURCE', 'TAXONOMY', col]
                piece = with_mgds.assign(TAXONOMY=taxonomy)[keep]
                pieces.append(piece.rename(columns={col: 'DRUG_NAME'}))

    stacked = pd.concat(pieces).reset_index(drop=True)
    assert np.all(stacked['DATA_TYPE'] == dtyp.DRUG_SENSITIVITY)
    return stacked.drop('DATA_TYPE', axis=1)

In [39]:
# Build the long-format ID table: one row per record and identifier column, tagged with source and taxonomy.
d_id = aggregate(m_id)

In [40]:
d_id.head()

DATA_TYPE,DRUG_NAME:MGDS,SOURCE,TAXONOMY,DRUG_NAME
0,17AAG,gdsc_v2,COSMIC,1026
1,VX702,gdsc_v2,COSMIC,1028
2,AMG706,gdsc_v2,COSMIC,1029
3,KU55933,gdsc_v2,COSMIC,1030
4,ELESCLOMOL,gdsc_v2,COSMIC,1031


In [41]:
# Each (source, taxonomy, MGDS ID) combination should occur exactly once;
# the final expression lists any combination that occurs more often.
group_keys = ['SOURCE', 'TAXONOMY', 'DRUG_NAME:MGDS']
cts = d_id.groupby(group_keys).size()
print(cts.value_counts())
cts.loc[cts > 1]

1    14795
dtype: int64


Series([], dtype: int64)

In [42]:
# Spot-check: all source/taxonomy rows that resolved to the CAMPTOTHECIN MGDS ID.
d_id[d_id['DRUG_NAME:MGDS'] == 'CAMPTOTHECIN']

DATA_TYPE,DRUG_NAME:MGDS,SOURCE,TAXONOMY,DRUG_NAME
189,CAMPTOTHECIN,gdsc_v2,COSMIC,1003
440,CAMPTOTHECIN,gdsc_v2,COMMON,Camptothecin
3433,CAMPTOTHECIN,nci60_v2,COMMON,Camptothecin
7643,CAMPTOTHECIN,nci60_v2,NSC,100880
11853,CAMPTOTHECIN,nci60_v2,PUBCHEM,301160


In [43]:
#d_id[(d_id['SOURCE'] == src.NCI60_v2) & (d_id['DRUG_NAME:MGDS'] == 'Z4ACETOXY345TRIMETHOXYSTILBENE')]

In [44]:
def singlestr(x):
    """Aggregator for groups that must contain exactly one value.

    Asserts that ``x`` has length 1 and returns its first element via ``.iloc``.
    """
    n_values = len(x)
    assert n_values == 1
    only_value = x.iloc[0]
    return only_value
# Pivot to wide form: one row per MGDS ID, one column per (taxonomy, source).
# `singlestr` enforces that each cell is backed by exactly one source value.
d_id_m = d_id.pivot_table(
    index='DRUG_NAME:MGDS',
    columns=['TAXONOMY', 'SOURCE'],
    values='DRUG_NAME',
    aggfunc=singlestr,
)
d_id_m.head()

TAXONOMY,BROAD,COMMON,COMMON,COMMON,COMMON,COSMIC,CTD,NSC,PUBCHEM
SOURCE,ctd_v2,ctd_v2,gdsc_v2,nci60_v2,ncidream_v1,gdsc_v2,ctd_v2,nci60_v2,nci60_v2
DRUG_NAME:MGDS,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
010200,,,,010200,,,,529108,482905.0
1011METHYLENEDIOXY20RSCAMPTOTHECIN,,,,"10,11-methylenedioxy-20(rs)-camptothecin",,,,606174,580568.0
1011METHYLENEDIOXYCAMPTOTHECIN,,,,"10,11-methylenedioxycamptothecin",,,,634724,497235.0
1024DICHLOROPHENYL3METHYLFLAVIN,,,,"10-(2',4'-dichlorophenyl)-3-methylflavin",,,,625537,492471.0
106UBIQUINOLYLDECYLTRIPHENYLPHOSPHONIUM,,,,10-(6'-ubiquinolyl)decyltriphenylphosphonium,,,,745025,91147600.0


In [45]:
d_id_m.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4850 entries, 010200 to ZYGOSPORINA
Data columns (total 9 columns):
(BROAD, ctd_v2)          545 non-null object
(COMMON, ctd_v2)         545 non-null object
(COMMON, gdsc_v2)        251 non-null object
(COMMON, nci60_v2)       4210 non-null object
(COMMON, ncidream_v1)    28 non-null object
(COSMIC, gdsc_v2)        251 non-null object
(CTD, ctd_v2)            545 non-null object
(NSC, nci60_v2)          4210 non-null object
(PUBCHEM, nci60_v2)      3708 non-null object
dtypes: object(9)
memory usage: 378.9+ KB


In [46]:
# Keep only the 'COMMON' taxonomy, leaving one column per source.
# NOTE: this re-binds `dt`, which earlier in the notebook referred to the NCI-60 drug table.
dt = d_id_m['COMMON']

In [47]:
# Drugs of interest, compared case-insensitively against each source's raw name.
tgt_drug = [
    name.upper()
    for name in ('Navitoclax', 'Nutlin', 'AG-014699', 'PD-0332991', 'PLX4720', 'SB590885')
]


def _is_target(v):
    """True when ``v`` is non-null and its upper-cased form is a target name."""
    if pd.isnull(v):
        return False
    return v.upper() in tgt_drug


# Rows where at least one source's name matches a target exactly
dt[dt.applymap(_is_target).any(axis=1)]

SOURCE,ctd_v2,gdsc_v2,nci60_v2,ncidream_v1
DRUG_NAME:MGDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AG014699,,AG-014699,,
NAVITOCLAX,navitoclax,Navitoclax,,
PD0332991,,PD-0332991,,
PLX4720,PLX-4720,PLX4720,,
SB590885,,SB590885,,


In [48]:
# Summarize the MGDS IDs whose common name is present in more than one source.
dt[dt.notnull().sum(axis=1) > 1].info()

<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, 17AAG to ZSTK474
Data columns (total 4 columns):
ctd_v2         128 non-null object
gdsc_v2        103 non-null object
nci60_v2       90 non-null object
ncidream_v1    8 non-null object
dtypes: object(4)
memory usage: 5.7+ KB


## Export

In [49]:
d_id_m.head()

TAXONOMY,BROAD,COMMON,COMMON,COMMON,COMMON,COSMIC,CTD,NSC,PUBCHEM
SOURCE,ctd_v2,ctd_v2,gdsc_v2,nci60_v2,ncidream_v1,gdsc_v2,ctd_v2,nci60_v2,nci60_v2
DRUG_NAME:MGDS,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
010200,,,,010200,,,,529108,482905.0
1011METHYLENEDIOXY20RSCAMPTOTHECIN,,,,"10,11-methylenedioxy-20(rs)-camptothecin",,,,606174,580568.0
1011METHYLENEDIOXYCAMPTOTHECIN,,,,"10,11-methylenedioxycamptothecin",,,,634724,497235.0
1024DICHLOROPHENYL3METHYLFLAVIN,,,,"10-(2',4'-dichlorophenyl)-3-methylflavin",,,,625537,492471.0
106UBIQUINOLYLDECYLTRIPHENYLPHOSPHONIUM,,,,10-(6'-ubiquinolyl)decyltriphenylphosphonium,,,,745025,91147600.0


In [50]:
# Persist the wide ID lookup table as the 'drug-ids' entity of the MGDS source.
# The recorded output below shows the call returning the path of the written .pkl file.
db.save(d_id_m, src.MGDS_v1, db.ENTITY, 'drug-ids')

'/Users/eczech/data/research/mgds/entity/mgds_v1_drug-ids.pkl'