In [41]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import data_type as dtyp
from mgds.data_aggregation import io_utils
from mgds.data_aggregation import api as mgds_api
from mgds.data_aggregation import entity
from py_utils import set_utils, assertion_utils
# Raise the row threshold below which DataFrame.info() still reports
# per-column non-null counts, so the info() summaries in this notebook show them.
# NOTE(review): `pd` is not imported in this cell -- presumably it is injected
# by the `ipy_startup` run above; confirm.
pd.set_option('display.max_info_rows', 50000000)

# Primary Site Mapping

In [50]:
# Sources whose cell line metadata is pulled in for primary site mapping
sources = [
    src.GDSC_v2,
    src.CCLE_v1,
    src.NCI60_v2,
    src.NCIDREAM_v1,
    src.CTD_v2
]

# Fetch the metadata for all of the sources above and preview it
d_meta = mgds_api.get_cellline_metadata(sources)
d_meta.head()

Unnamed: 0,AGE,CELL_LINE_ID,CELL_LINE_ID:COSMIC,CELL_LINE_ID:MGDS,GENDER,PRIMARY_SITE,PROPERTIES,SOURCE
0,,A253,906794,A253,,SALIVARY_GLAND,"{'CANCER_TYPE': nan, 'HISTOLOGY': 'carcinoma',...",gdsc_v2
1,,BB30-HNC,753531,BB30HNC,,UPPER_AERODIGESTIVE_TRACT,"{'CANCER_TYPE': 'HNSC', 'HISTOLOGY': 'carcinom...",gdsc_v2
2,,BB49-HNC,753532,BB49HNC,,UPPER_AERODIGESTIVE_TRACT,"{'CANCER_TYPE': 'HNSC', 'HISTOLOGY': 'carcinom...",gdsc_v2
3,,BHY,753535,BHY,,UPPER_AERODIGESTIVE_TRACT,"{'CANCER_TYPE': 'HNSC', 'HISTOLOGY': 'carcinom...",gdsc_v2
4,,BICR10,1290724,BICR10,,UPPER_AERODIGESTIVE_TRACT,"{'CANCER_TYPE': 'HNSC', 'HISTOLOGY': 'carcinom...",gdsc_v2


In [51]:
def clean(x):
    """Return the string ``x`` upper-cased with surrounding whitespace removed."""
    uppercased = x.upper()
    return uppercased.strip()

def to_mgds_site(source, cell_line_id, site):
    """Resolve a source-specific primary site label to a single MGDS site name.

    Site labels are normalized (upper-cased, stripped) *before* any synonym or
    override comparison, so labels that differ only in case or surrounding
    whitespace resolve the same way as their canonical spelling.

    Parameters
    ----------
    source
        Source identifier; compared against the ``src.*`` constants.
    cell_line_id
        Cell line identifier used for the per-cell-line overrides.
    site
        Primary site label as reported by ``source``.

    Returns
    -------
    str
        The resolved primary site name.

    Raises
    ------
    AssertionError
        If an NCI-DREAM record reports a site other than ``'BREAST'``.
    ValueError
        If ``source`` is not one of the sources handled here.
    """
    # NCI60 Synonym Normalization
    if source == src.NCI60_v2:
        nci60_synonyms = {
            'RENAL': 'KIDNEY',
            'NON-SMALL CELL LUNG': 'LUNG',
            'LEUKEMIA': 'HAEMATOPOIETIC_AND_LYMPHOID_TISSUE',
            'COLON': 'LARGE_INTESTINE',
            'MELANOMA': 'SKIN',
            'OVARIAN': 'OVARY',
            'CENTRAL NERVOUS SYSTEM': 'CENTRAL_NERVOUS_SYSTEM',
        }
        # Clean first so that e.g. 'Renal ' still maps to 'KIDNEY'
        normalized = clean(site)
        return nci60_synonyms.get(normalized, normalized)

    # CCLE Normalization
    if source == src.CCLE_v1:
        normalized = clean(site)

        # This is the only case that seems flat out incorrect
        if cell_line_id == 'COLO741' and normalized == 'SKIN':
            return 'LARGE_INTESTINE'

        # Minor disambiguation
        if cell_line_id == 'RKN' and normalized == 'SOFT_TISSUE':
            return 'OVARY'
        return normalized

    # GDSC Omissions (but resolvable based on other sources)
    if source == src.GDSC_v2:
        gdsc_site_by_cell_line = {
            'G292CLONEA141B1': 'BONE',
            'HEP3B217': 'LIVER',
            'SUDHL8': 'HAEMATOPOIETIC_AND_LYMPHOID_TISSUE',
        }
        # Check the id-based overrides before touching ``site``: these exist
        # because the source omits the site, so ``site`` may not be cleanable.
        override = gdsc_site_by_cell_line.get(cell_line_id)
        if override is not None:
            return override
        return clean(site)

    # NCI-Dream Cell Lines are all breast cancers
    if source == src.NCIDREAM_v1:
        assert site == 'BREAST', \
            'Expected primary site "BREAST" for cell line "{}" but found "{}"'.format(cell_line_id, site)
        return site

    raise ValueError('Primary site resolution not yet supported for source "{}"'.format(source))

# Keep only the identifying columns, then resolve each row's reported site
# to its MGDS primary site name
d_ps = d_meta[['CELL_LINE_ID:MGDS', 'PRIMARY_SITE', 'SOURCE']].copy()
d_ps['PRIMARY_SITE:MGDS'] = d_ps.apply(
    lambda row: to_mgds_site(
        source=row['SOURCE'],
        cell_line_id=row['CELL_LINE_ID:MGDS'],
        site=row['PRIMARY_SITE']
    ),
    axis=1
)
d_ps.head()

Unnamed: 0,CELL_LINE_ID:MGDS,PRIMARY_SITE,SOURCE,PRIMARY_SITE:MGDS
0,A253,SALIVARY_GLAND,gdsc_v2,SALIVARY_GLAND
1,BB30HNC,UPPER_AERODIGESTIVE_TRACT,gdsc_v2,UPPER_AERODIGESTIVE_TRACT
2,BB49HNC,UPPER_AERODIGESTIVE_TRACT,gdsc_v2,UPPER_AERODIGESTIVE_TRACT
3,BHY,UPPER_AERODIGESTIVE_TRACT,gdsc_v2,UPPER_AERODIGESTIVE_TRACT
4,BICR10,UPPER_AERODIGESTIVE_TRACT,gdsc_v2,UPPER_AERODIGESTIVE_TRACT


In [52]:
# Summarize column dtypes and non-null counts to check that no site values
# are missing after resolution
d_ps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2161 entries, 0 to 52
Data columns (total 4 columns):
CELL_LINE_ID:MGDS    2161 non-null object
PRIMARY_SITE         2161 non-null object
SOURCE               2161 non-null object
PRIMARY_SITE:MGDS    2161 non-null object
dtypes: object(4)
memory usage: 84.4+ KB


In [53]:
# Count the distinct resolved sites per cell line. A count above 1 means the
# sources disagree, so the final filter should come back empty.
cts = (
    d_ps
    .groupby(['CELL_LINE_ID:MGDS'])['PRIMARY_SITE:MGDS']
    .nunique()
)
print(cts.value_counts())
cts.loc[cts > 1]

1    1421
Name: PRIMARY_SITE:MGDS, dtype: int64


Series([], Name: PRIMARY_SITE:MGDS, dtype: int64)

In [54]:
# Show the per-source rows behind any cell line with more than one resolved site
(
    d_ps
    .loc[d_ps['CELL_LINE_ID:MGDS'].isin(cts[cts > 1].index.values)]
    .sort_values('CELL_LINE_ID:MGDS')
)

Unnamed: 0,CELL_LINE_ID:MGDS,PRIMARY_SITE,SOURCE,PRIMARY_SITE:MGDS


In [55]:
def singlestr(x):
    """Return the sole element of ``x``; fail if it does not hold exactly one."""
    n_values = len(x)
    assert n_values == 1
    only_value = x.iloc[0]
    return only_value
# One row per (cell line, resolved site), one column per source, holding the
# site label that source originally reported; singlestr enforces that each
# source contributes at most one label per row
d_ps_m = pd.pivot_table(
    d_ps,
    values='PRIMARY_SITE',
    index=['CELL_LINE_ID:MGDS', 'PRIMARY_SITE:MGDS'],
    columns=['SOURCE'],
    aggfunc=singlestr
)
d_ps_m.head()

Unnamed: 0_level_0,SOURCE,ccle_v1,gdsc_v2,nci60_v2,ncidream_v1
CELL_LINE_ID:MGDS,PRIMARY_SITE:MGDS,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1321N1,CENTRAL_NERVOUS_SYSTEM,CENTRAL_NERVOUS_SYSTEM,,,
143B,BONE,BONE,,,
184A1,BREAST,,,,BREAST
184B5,BREAST,,,,BREAST
201T,LUNG,,LUNG,,


## Export

In [56]:
# Persist the per-source primary site table as an MGDS entity dataset.
# NOTE(review): judging by the cell output, db.save returns the path of the
# pickle file it wrote -- confirm against the db module.
db.save(d_ps_m, src.MGDS_v1, db.ENTITY, 'primary-site-by-src')

'/Users/eczech/data/research/mgds/entity/mgds_v1_primary-site-by-src.pkl'