In [43]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from mgds.data_aggregation import entity
from py_utils import set_utils, assertion_utils
# Raise pandas' row threshold for DataFrame.info() so that the .info() calls in
# this notebook keep reporting per-column non-null counts.
pd.set_option('display.max_info_rows', 50000000)

In [2]:
# Sources and data types whose cell line identifiers are harmonized in this notebook.
sources = [src.GDSC_v2, src.CCLE_v1, src.NCI60_v2, src.NCIDREAM_v1]
data_types = ['gene-exome-seq', 'gene-expression', 'gene-copy-number', 'gene-rna-seq', 'gene-methylation']
# Collect the raw CELL_LINE_ID entities per source / data type; later cells index
# the result as m_id[source][data_type].
m_id = entity.get_raw_entities(sources, data_types, 'CELL_LINE_ID')

2016-11-28 12:28:13,904:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-exome-seq"
2016-11-28 12:28:14,259:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-expression"
2016-11-28 12:28:21,848:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-copy-number"
2016-11-28 12:28:52,504:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-rna-seq"
2016-11-28 12:28:52,508:INFO:mgds.data_aggregation.entity: Data for source "gdsc_v2" and data type "gene-rna-seq" does not exist
2016-11-28 12:28:52,508:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-methylation"
2016-11-28 12:28:52,509:INFO:mgds.data_aggregation.entity: Data for source "gdsc_v2" and data type "gene-methylation" does not exist
2016-11-28 12:28:52,509:DEBUG:mgds.data_aggregation.entity: Processing source "ccle_v1", data type "gene-exome-seq"
2016-11-28 12:28:52,578:DEBUG:mgds.data

In [3]:
# dt = m_id[src.CCLE_v1]['gene-expression']
# dt[dt['CELL_LINE_ID'].str.contains('TT')]

In [3]:
import re

# Matches runs of characters that are neither letters nor digits: \W covers all
# non-word characters and the explicit "_" adds the underscore, which \W alone
# treats as a word character. Written as a raw string so that "\W" is handed to
# the regex engine verbatim instead of relying on Python tolerating an invalid
# string escape (which emits a warning on current interpreters).
regex = re.compile(r'[\W_]+')


def clean(x):
    """Return ``x`` with every non-alphanumeric character (including ``_``) removed.

    :param x: string to clean
    :return: ``x`` with punctuation, whitespace and underscores stripped out;
        letter case is left unchanged
    """
    return regex.sub('', x)

# GDSC/COSMIC overrides, keyed by the upper-cased, stripped raw id. Several of
# these exist because generic cleaning would collide: "T-T" and "TT" both clean
# to "TT", and "KM-H2" and "KMH-2" both clean to "KMH2".
_GDSC_SPECIAL_CASES = {
    'T-T': 'TTOESOPHAGUS',
    'TT': 'TTTHYROID',
    'KM-H2': 'KMH2BLOOD',
    'KMH-2': 'KMH2THYR',
    '786-0': '786O',
    'EOL-1-CELL': 'EOL1',
}

# CCLE overrides, keyed by the upper-cased, stripped raw id. Both entries would
# otherwise reduce to "TT" once the text after the first underscore is dropped;
# the targets match the ids used for the GDSC overrides above.
_CCLE_SPECIAL_CASES = {
    'TT_OESOPHAGUS': 'TTOESOPHAGUS',
    'TT_THYROID': 'TTTHYROID',
}


def to_mgds_id(source, v):
    """Normalize one raw cell line id from ``source`` into an MGDS id.

    :param source: source constant (one of ``src.NCI60_v2``, ``src.GDSC_v2``,
        ``src.CCLE_v1``, ``src.NCIDREAM_v1``)
    :param v: raw cell line id; assumed to be a string when not null
    :return: normalized id, or None when ``v`` is null
    :raises ValueError: if no normalization is implemented for ``source``
    """
    if pd.isnull(v):
        return None

    # NCI-60: strip non-alphanumerics and upper-case
    if source == src.NCI60_v2:
        return clean(v).upper().strip()

    # GDSC/COSMIC: explicit overrides first, otherwise strip non-alphanumerics
    if source == src.GDSC_v2:
        key = v.upper().strip()
        if key in _GDSC_SPECIAL_CASES:
            return _GDSC_SPECIAL_CASES[key]
        return clean(key).strip()

    # CCLE: explicit overrides first, otherwise keep only the text before the
    # first underscore (raw ids look like "U2OS_BONE")
    if source == src.CCLE_v1:
        key = v.upper().strip()
        if key in _CCLE_SPECIAL_CASES:
            return _CCLE_SPECIAL_CASES[key]
        return key.split('_')[0].strip()

    # NCI-DREAM: strip non-alphanumerics only.
    # NOTE(review): unlike the other sources the id is not upper-cased here;
    # left as-is to keep existing ids stable -- confirm this is intended.
    if source == src.NCIDREAM_v1:
        return clean(v).strip()

    raise ValueError('Cell line normalization for source "{}" has not been implemented yet'.format(source))

def to_mgds_ids(source, ids):
    """Normalize every raw id in ``ids`` for ``source``.

    :param source: source constant passed through to ``to_mgds_id``
    :param ids: iterable of raw cell line ids
    :return: list of normalized ids, in the same order as ``ids``
    """
    normalized = []
    for raw_id in ids:
        normalized.append(to_mgds_id(source, raw_id))
    return normalized


def aggregate(m_id):
    """Stack the cell line ids of every source / data type into one long frame.

    :param m_id: mapping of source -> mapping of data type -> DataFrame; each
        frame needs 'CELL_LINE_ID', 'DATA_TYPE' and 'SOURCE' columns and may
        carry further id columns named 'CELL_LINE_ID:<TAXONOMY>'
    :return: DataFrame with columns MGDS_ID, DATA_TYPE, SOURCE, TAXONOMY and
        CELL_LINE_ID, re-indexed from 0
    """
    frames = []
    for source, frames_by_type in m_id.items():
        for frame in frames_by_type.values():
            # The MGDS id is always derived from the plain CELL_LINE_ID column,
            # regardless of which id column is being emitted below
            with_mgds = frame.assign(MGDS_ID=to_mgds_ids(source, frame['CELL_LINE_ID']))
            id_columns = frame.filter(regex='CELL_LINE_ID').columns.tolist()
            for id_column in id_columns:
                # 'CELL_LINE_ID:COSMIC' -> 'COSMIC'; the bare column is 'COMMON'
                if ':' in id_column:
                    taxonomy = id_column.split(':')[1]
                else:
                    taxonomy = 'COMMON'
                keep = ['MGDS_ID', 'DATA_TYPE', 'SOURCE', 'TAXONOMY', id_column]
                part = with_mgds.assign(TAXONOMY=taxonomy)[keep]
                frames.append(part.rename(columns={id_column: 'CELL_LINE_ID'}))
    stacked = pd.concat(frames)
    return stacked.reset_index(drop=True)

In [4]:
# Build the long-format table pairing each raw id with its normalized MGDS_ID.
d_id = aggregate(m_id)

In [5]:
# Preview the long table (columns: MGDS_ID, DATA_TYPE, SOURCE, TAXONOMY, CELL_LINE_ID).
d_id.head()

Unnamed: 0,MGDS_ID,DATA_TYPE,SOURCE,TAXONOMY,CELL_LINE_ID
0,184A1,gene-exome-seq,ncidream_v1,COMMON,184A1
1,184B5,gene-exome-seq,ncidream_v1,COMMON,184B5
2,21MT1,gene-exome-seq,ncidream_v1,COMMON,21MT1
3,21NT,gene-exome-seq,ncidream_v1,COMMON,21NT
4,600MPE,gene-exome-seq,ncidream_v1,COMMON,600MPE


In [6]:
# Count rows per (SOURCE, DATA_TYPE, TAXONOMY, MGDS_ID). Every count should be 1;
# a larger count means more than one row was mapped onto the same MGDS_ID.
cts = d_id.groupby(['SOURCE', 'DATA_TYPE', 'TAXONOMY', 'MGDS_ID']).size()
print(cts.value_counts())
# This should return no results when there are no duplicates
cts[cts > 1]

1    9207
dtype: int64


Series([], dtype: int64)

In [7]:
# d_id[(d_id['SOURCE'] == src.GDSC_v2) & (d_id['MGDS_ID'] == 'KMH2')]

In [8]:
def singlestr(x):
    """Return the only element of ``x``.

    Used as an aggregation function where each group is expected to hold
    exactly one value.

    :param x: pandas Series (anything supporting ``len`` and ``.iloc``)
    :return: the first (and only) element of ``x``
    :raises AssertionError: if ``x`` does not contain exactly one element
    """
    n_values = len(x)
    assert n_values == 1
    only_value = x.iloc[0]
    return only_value
# Wide view: one row per MGDS_ID and one column per (TAXONOMY, SOURCE, DATA_TYPE)
# holding the raw id; singlestr asserts that each cell aggregates exactly one row.
d_id_m = d_id.pivot_table(index='MGDS_ID', columns=['TAXONOMY', 'SOURCE', 'DATA_TYPE'], values='CELL_LINE_ID', aggfunc=singlestr)
d_id_m.head()

TAXONOMY,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COSMIC,COSMIC,COSMIC
SOURCE,ccle_v1,ccle_v1,ccle_v1,gdsc_v2,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,gdsc_v2,gdsc_v2,gdsc_v2
DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-methylation,gene-rna-seq,gene-copy-number,gene-exome-seq,gene-expression
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
1321N1,1321N1_CENTRAL_NERVOUS_SYSTEM,,1321N1_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,,,,,,
143B,143B_BONE,,143B_BONE,,,,,,,,,,,,,,
184A1,,,,,,,,,,,184A1,,,184A1,,,
184B5,,,,,,,,,,184B5,184B5,184B5,,184B5,,,
201T,,,,201T,201T,201T,,,,,,,,,1287381.0,1287381.0,1287381.0


In [17]:
# Spot-check a window of the sorted index: the last 51 of the first 1400 rows.
d_id_m.sort_index().head(1400).tail(51)

TAXONOMY,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COSMIC,COSMIC,COSMIC
SOURCE,ccle_v1,ccle_v1,ccle_v1,gdsc_v2,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,gdsc_v2,gdsc_v2,gdsc_v2
DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-methylation,gene-rna-seq,gene-copy-number,gene-exome-seq,gene-expression
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
TYKNU,TYKNU_OVARY,TYKNU_OVARY,TYKNU_OVARY,TYK-nu,TYK-nu,TYK-nu,,,,,,,,,909774.0,909774.0,909774.0
U031,,,,U031,U031,U031,,,,,,,,,905981.0,905981.0,905981.0
U118MG,U118MG_CENTRAL_NERVOUS_SYSTEM,,U118MG_CENTRAL_NERVOUS_SYSTEM,U-118-MG,U-118-MG,U-118-MG,,,,,,,,,687588.0,687588.0,687588.0
U138MG,U138MG_CENTRAL_NERVOUS_SYSTEM,U138MG_CENTRAL_NERVOUS_SYSTEM,U138MG_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,,,,,,
U178,U178_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,,,,,,,,
U251,,,,U251,U251,U251,,U251,,,,,,,905983.0,905983.0,905983.0
U251MG,U251MG_CENTRAL_NERVOUS_SYSTEM,,U251MG_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,,,,,,
U266,,,,U-266,U-266,U-266,,,,,,,,,753615.0,753615.0,753615.0
U266B1,U266B1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,U266B1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,U266B1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,,,,,,,,,,,,
U2OS,U2OS_BONE,U2OS_BONE,U2OS_BONE,U-2-OS,U-2-OS,U-2-OS,,,,,,,,,909776.0,909776.0,909776.0


In [15]:
# Non-null counts per column show how many cell lines each source / data type covers.
d_id_m.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1412 entries, 1321N1 to ZR75B
Data columns (total 17 columns):
(COMMON, ccle_v1, gene-copy-number)        995 non-null object
(COMMON, ccle_v1, gene-exome-seq)          904 non-null object
(COMMON, ccle_v1, gene-expression)         967 non-null object
(COMMON, gdsc_v2, gene-copy-number)        996 non-null object
(COMMON, gdsc_v2, gene-exome-seq)          1001 non-null object
(COMMON, gdsc_v2, gene-expression)         976 non-null object
(COMMON, nci60_v2, gene-copy-number)       53 non-null object
(COMMON, nci60_v2, gene-exome-seq)         60 non-null object
(COMMON, nci60_v2, gene-expression)        54 non-null object
(COMMON, ncidream_v1, gene-copy-number)    47 non-null object
(COMMON, ncidream_v1, gene-exome-seq)      50 non-null object
(COMMON, ncidream_v1, gene-expression)     46 non-null object
(COMMON, ncidream_v1, gene-methylation)    41 non-null object
(COMMON, ncidream_v1, gene-rna-seq)        44 non-null object
(COSMIC, gdsc_v2,

In [19]:
# Tally the result of the Excel-date check over all raw ids. Presumably this flags
# ids that Excel auto-converted to dates (going by the helper's name; its
# implementation is not visible here) -- TODO confirm.
from mgds.data_aggregation import excel_utils
excel_utils.is_excel_date(d_id['CELL_LINE_ID']).value_counts()

  return v.str.upper().str.strip().str.contains(DATE_REGEX)


False    8244
Name: CELL_LINE_ID, dtype: int64

## Questionable Matches

- **GP2D** <-> **GP5D**: ccle_v1/GP2D_LARGE_INTESTINE <-> gdsc_v2/GP5d (large intestine site)
- **HEY** <-> **HEYA8**: ccle_v1/HEYA8_OVARY <-> gdsc_v2/Hey (ovary site)
- **U251** <-> **U251MG**: ccle_v1/U251MG_CENTRAL_NERVOUS_SYSTEM <-> gdsc_v2/U251 (central_nervous_system)
- **U266** <-> **U266B1**: ccle_v1/U266B1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE <-> gdsc_v2/U-266 (haematopoietic_and_lymphoid_tissue)
- **LU99** <-> **LU99A**: ccle_v1/LU99_LUNG <-> gdsc_v2/LU-99A



In [32]:
# Flag neighbouring MGDS_IDs where one id is a substring of the next
# (e.g. U266 / U266B1) as candidates for being the same cell line under
# slightly different names. Only adjacent index entries are compared, so this
# relies on the index order placing such ids next to each other (d_id_m comes
# from pivot_table, whose result index is sorted).
check_idx = []
for i in range(len(d_id_m)-1):
    v1 = d_id_m.index.values[i]
    v2 = d_id_m.index.values[i+1]
    if v1 in v2:
        # Record both members of the pair; a position is recorded twice when it
        # matches its neighbour on both sides.
        check_idx.append(i)
        check_idx.append(i+1)
# Page through the candidates: the last 51 of the first 500 flagged rows.
d_id_m.iloc[check_idx, :].head(500).tail(51)

TAXONOMY,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COSMIC,COSMIC,COSMIC
SOURCE,ccle_v1,ccle_v1,ccle_v1,gdsc_v2,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,gdsc_v2,gdsc_v2,gdsc_v2
DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-methylation,gene-rna-seq,gene-copy-number,gene-exome-seq,gene-expression
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
RT11284,RT11284_URINARY_TRACT,RT11284_URINARY_TRACT,RT11284_URINARY_TRACT,,,,,,,,,,,,,,
SCC9,SCC9_UPPER_AERODIGESTIVE_TRACT,SCC9_UPPER_AERODIGESTIVE_TRACT,SCC9_UPPER_AERODIGESTIVE_TRACT,SCC-9,SCC-9,SCC-9,,,,,,,,,909709.0,909709.0,909709.0
SCC90,,,,SCC90,SCC90,SCC90,,,,,,,,,1299052.0,1299052.0,1299052.0
SKMEL2,SKMEL2_SKIN,SKMEL2_SKIN,SKMEL2_SKIN,SK-MEL-2,SK-MEL-2,SK-MEL-2,SK_MEL_2,SK_MEL_2,SK_MEL_2,,,,,,905955.0,905955.0,905955.0
SKMEL24,SKMEL24_SKIN,SKMEL24_SKIN,SKMEL24_SKIN,SK-MEL-24,SK-MEL-24,SK-MEL-24,,,,,,,,,909725.0,909725.0,909725.0
SKMEL3,SKMEL3_SKIN,SKMEL3_SKIN,SKMEL3_SKIN,SK-MEL-3,SK-MEL-3,SK-MEL-3,,,,,,,,,909724.0,909724.0,909724.0
SKMEL30,SKMEL30_SKIN,SKMEL30_SKIN,SKMEL30_SKIN,SK-MEL-30,SK-MEL-30,SK-MEL-30,,,,,,,,,909726.0,909726.0,909726.0
SKN,,,,SKN,SKN,SKN,,,,,,,,,1240215.0,1240215.0,1240215.0
SKN3,,,,SKN-3,SKN-3,SKN-3,,,,,,,,,1299059.0,1299059.0,1299059.0
SNU1,SNU1_STOMACH,SNU1_STOMACH,SNU1_STOMACH,,,,,,,,,,,,,,


## Match Analysis

In [24]:
# Inspect the rows starting at the U266 label, one of the questionable matches.
d_id_m.loc['U266':].head(3)

TAXONOMY,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COSMIC,COSMIC,COSMIC
SOURCE,ccle_v1,ccle_v1,ccle_v1,gdsc_v2,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,gdsc_v2,gdsc_v2,gdsc_v2
DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-methylation,gene-rna-seq,gene-copy-number,gene-exome-seq,gene-expression
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
U266,,,,U-266,U-266,U-266,,,,,,,,,753615.0,753615.0,753615.0
U266B1,U266B1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,U266B1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,U266B1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,,,,,,,,,,,,,,
U2OS,U2OS_BONE,U2OS_BONE,U2OS_BONE,U-2-OS,U-2-OS,U-2-OS,,,,,,,,,909776.0,909776.0,909776.0


In [19]:
# Load the imported GDSC cell line metadata to look up details on individual lines.
d_gdsc = db.load(src.GDSC_v2, db.IMPORT, 'cellline-meta')
d_gdsc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 0 to 1033
Data columns (total 15 columns):
CELL_LINE_ID                         1029 non-null object
CELL_LINE_ID:COSMIC                  1029 non-null object
WHOLE_EXOME_SEQUENCING               1001 non-null object
COPY_NUMBER_ALTERATIONS              1001 non-null object
GENE_EXPRESSION                      1001 non-null object
METHYLATION                          1001 non-null object
DRUGRESPONSE                         1001 non-null object
GDSC_TISSUE_DESCRIPTOR_1             1001 non-null object
GDSC_TISSUE_DESCRIPTOR_2             1001 non-null object
CANCER_TYPE                          826 non-null object
MICROSATELLITE_INSTABILITY_STATUS    986 non-null object
SCREEN_MEDIUM                        1001 non-null object
GROWTH_PROPERTIES                    999 non-null object
SITE                                 1024 non-null object
HISTOLOGY                            1024 non-null object
dtypes: object(15)
memory 

In [30]:
# Look up a single cell line by COSMIC id (compared as a string in this table).
d_gdsc[d_gdsc['CELL_LINE_ID:COSMIC'] == '907796']

Unnamed: 0,CELL_LINE_ID,CELL_LINE_ID:COSMIC,WHOLE_EXOME_SEQUENCING,COPY_NUMBER_ALTERATIONS,GENE_EXPRESSION,METHYLATION,DRUGRESPONSE,GDSC_TISSUE_DESCRIPTOR_1,GDSC_TISSUE_DESCRIPTOR_2,CANCER_TYPE,MICROSATELLITE_INSTABILITY_STATUS,SCREEN_MEDIUM,GROWTH_PROPERTIES,SITE,HISTOLOGY
518,LU-99A,907796,Y,Y,Y,Y,Y,lung_NSCLC,lung_NSCLC_large cell,,MSS/MSI-L,R,Adherent,lung,carcinoma


# Compress to Source

In [36]:
# Reminder of the long-format table that the per-source compression starts from.
d_id.head()

Unnamed: 0,MGDS_ID,DATA_TYPE,SOURCE,TAXONOMY,CELL_LINE_ID
0,184A1,gene-exome-seq,ncidream_v1,COMMON,184A1
1,184B5,gene-exome-seq,ncidream_v1,COMMON,184B5
2,21MT1,gene-exome-seq,ncidream_v1,COMMON,21MT1
3,21NT,gene-exome-seq,ncidream_v1,COMMON,21NT
4,600MPE,gene-exome-seq,ncidream_v1,COMMON,600MPE


In [42]:
def get_unique_id(g):
    """Return the single raw id shared by every row of group ``g``.

    :param g: DataFrame with a 'CELL_LINE_ID' column (one group of rows for a
        given MGDS id / source / taxonomy)
    :return: the id as a string
    :raises AssertionError: if the group holds more than one distinct id
        (compared after conversion to string), or no rows at all
    """
    ids = g['CELL_LINE_ID'].astype(str)
    # Require exactly one distinct value. A bare `len(np.unique(ids))` is truthy
    # for any non-empty group and so would never detect a conflict.
    assert len(np.unique(ids)) == 1, 'Found group with conflicting ids across data types:\n{}'.format(g)
    return ids.iloc[0]

# Get identifier for each source + taxonomy by first making sure there are no conflicts across data types
d_id_src = d_id.groupby(['MGDS_ID', 'SOURCE', 'TAXONOMY']).apply(get_unique_id)
# Move TAXONOMY, then SOURCE, from the row index into the columns, leaving one
# row per MGDS_ID and (TAXONOMY, SOURCE) column pairs.
d_id_src = d_id_src.unstack().unstack()

# Make sure all IDs are represented as strings
assertion_utils.assert_object_types(d_id_src)

d_id_src.head()

TAXONOMY,COMMON,COMMON,COMMON,COMMON,COSMIC,COSMIC,COSMIC,COSMIC
SOURCE,ccle_v1,gdsc_v2,nci60_v2,ncidream_v1,ccle_v1,gdsc_v2,nci60_v2,ncidream_v1
MGDS_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1321N1,1321N1_CENTRAL_NERVOUS_SYSTEM,,,,,,,
143B,143B_BONE,,,,,,,
184A1,,,,184A1,,,,
184B5,,,,184B5,,,,
201T,,201T,,,,1287381.0,,


# Export

In [47]:
# Persist the per-source id mapping (rows: MGDS_ID, columns: taxonomy / source).
db.save(d_id_src, src.MGDS_v1, db.ENTITY, 'cellline-ids-by-src')

'/Users/eczech/data/research/mgds/entity/mgds_v1_cellline-ids-by-src.pkl'

In [48]:
# Persist the per-data-type id mapping (rows: MGDS_ID, columns: taxonomy / source / data type).
db.save(d_id_m, src.MGDS_v1, db.ENTITY, 'cellline-ids-by-typ')

'/Users/eczech/data/research/mgds/entity/mgds_v1_cellline-ids-by-typ.pkl'