In [41]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from mgds.data_aggregation import entity
from py_utils import set_utils 
pd.set_option('display.max_info_rows', 50000000)

In [96]:
sources = [src.GDSC_v2, src.CCLE_v1, src.NCI60_v2]
data_types = ['gene-exome-seq', 'gene-expression', 'gene-copy-number']
m_id = entity.get_raw_entities(sources, data_types, 'CELL_LINE_ID')

2016-11-21 15:07:59,275:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-exome-seq"
2016-11-21 15:07:59,646:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-expression"
2016-11-21 15:08:07,226:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-copy-number"
2016-11-21 15:08:36,789:DEBUG:mgds.data_aggregation.entity: Processing source "ccle_v1", data type "gene-exome-seq"
2016-11-21 15:08:36,849:DEBUG:mgds.data_aggregation.entity: Processing source "ccle_v1", data type "gene-expression"
2016-11-21 15:08:39,618:DEBUG:mgds.data_aggregation.entity: Processing source "ccle_v1", data type "gene-copy-number"
2016-11-21 15:08:43,320:DEBUG:mgds.data_aggregation.entity: Processing source "nci60_v2", data type "gene-exome-seq"
2016-11-21 15:08:43,379:DEBUG:mgds.data_aggregation.entity: Processing source "nci60_v2", data type "gene-expression"
2016-11-21 15:08:43,624:DEBUG:mgds.data_aggregation.entity: Pro

In [97]:
dt = m_id[src.CCLE_v1]['gene-expression']
dt[dt['CELL_LINE_ID'].str.contains('TT')]

Unnamed: 0,CELL_LINE_ID,DATA_TYPE,SOURCE
3551912,CCFSTTG1_CENTRAL_NERVOUS_SYSTEM,gene-expression,ccle_v1
35284928,TT2609C02_THYROID,gene-expression,ccle_v1
35323960,TT_OESOPHAGUS,gene-expression,ccle_v1
35362992,TT_THYROID,gene-expression,ccle_v1


In [98]:
import re
regex = re.compile('[\W_]+')
def clean(x):
    return regex.sub('', x)

def to_mgds_id(source, v):
    fn = None
    if pd.isnull(v):
        return None
    if source == src.NCI60_v2:
        return clean(v).upper().strip()
    if source == src.GDSC_v2:
        return clean(v).upper().strip()
    if source == src.CCLE_v1:
        v = v.upper().strip()
        if v == 'TT_OESOPHAGUS':
            return 'TTOESOPHAGUS'
        if v == 'TT_THYROID':
            return 'TTTHYROID'
        return v.split('_')[0].strip()
    raise ValueError('Cell line normalization for source "{}" has not been implemented yet'.format(source))

def to_mgds_ids(source, ids):
    return [to_mgds_id(source, v) for v in ids]


def aggregate(m_id):
    r = []
    for source in m_id.keys():
        for data_type, d in m_id[source].items():
            c_cl = d.filter(regex='CELL_LINE_ID').columns.tolist()
            d_id = d.copy().assign(MGDS_ID=to_mgds_ids(source, d['CELL_LINE_ID']))
            for c in c_cl:
                taxonomy = c.split(':')[1] if ':' in c else 'COMMON'
                d_pt = d_id.assign(TAXONOMY=taxonomy)[['MGDS_ID', 'DATA_TYPE', 'SOURCE', 'TAXONOMY', c]]
                d_pt = d_pt.rename(columns={c: 'CELL_LINE_ID'})
                r.append(d_pt)
    return pd.concat(r).reset_index(drop=True)

In [90]:
# m_id = aggregate_to_source(m_id_all)
# m_id.keys()

In [99]:
d_id = aggregate(m_id)

In [100]:
d_id.head()

Unnamed: 0,MGDS_ID,DATA_TYPE,SOURCE,TAXONOMY,CELL_LINE_ID
0,1321N1,gene-expression,ccle_v1,COMMON,1321N1_CENTRAL_NERVOUS_SYSTEM
1,143B,gene-expression,ccle_v1,COMMON,143B_BONE
2,22RV1,gene-expression,ccle_v1,COMMON,22RV1_PROSTATE
3,2313287,gene-expression,ccle_v1,COMMON,2313287_STOMACH
4,42MGBA,gene-expression,ccle_v1,COMMON,42MGBA_CENTRAL_NERVOUS_SYSTEM


In [103]:
cts = d_id.groupby(['SOURCE', 'DATA_TYPE', 'TAXONOMY', 'MGDS_ID']).size()
cts.value_counts()

1    8949
2      20
dtype: int64

In [104]:
cts[cts > 1]

SOURCE   DATA_TYPE         TAXONOMY  MGDS_ID        
gdsc_v2  gene-copy-number  COMMON    KMH2               2
                                     TT                 2
                           COSMIC    KMH2               2
                                     TT                 2
         gene-exome-seq    COMMON    KMH2               2
                                     TT                 2
                           COSMIC    KMH2               2
                                     TT                 2
         gene-expression   COMMON    G292CLONEA141B1    2
                                     HEP3B217           2
                                     NTERA2CLD1         2
                                     PC3JPC3            2
                                     TT                 2
                                     UWB1289            2
                           COSMIC    G292CLONEA141B1    2
                                     HEP3B217           2
                   

In [110]:
dt = m_id[src.GDSC_v2]['gene-expression']
dt[dt['CELL_LINE_ID'].fillna('NA').str.replace('-', '').replace('_', '').str.upper().str.contains('HEP')]

Unnamed: 0,CELL_LINE_ID:COSMIC,CELL_LINE_ID,DATA_TYPE,SOURCE
5417309,909719,SK-HEP-1,gene-expression,gdsc_v2
14109390,1240147,Hep 3B2_1-7,gene-expression,gdsc_v2
14109391,1240147,Hep_3B2_1-7,gene-expression,gdsc_v2


In [85]:
def singlestr(x):
    assert len(x) == 1
    return x.iloc[0]
d_id_m = d_id.pivot_table(index='MGDS_ID', columns=['TAXONOMY', 'SOURCE', 'DATA_TYPE'], values='CELL_LINE_ID', aggfunc=singlestr)
d_id_m.head()

TAXONOMY,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COSMIC,COSMIC,COSMIC
SOURCE,ccle_v1,ccle_v1,ccle_v1,gdsc_v2,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,gdsc_v2,gdsc_v2,gdsc_v2
DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
1321N1,1321N1_CENTRAL_NERVOUS_SYSTEM,,1321N1_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,
143B,143B_BONE,,143B_BONE,,,,,,,,,
201T,,,,201T,201T,201T,,,,1287381.0,1287381.0,1287381.0
22RV1,22RV1_PROSTATE,22RV1_PROSTATE,22RV1_PROSTATE,22RV1,22RV1,22RV1,,,,924100.0,924100.0,924100.0
23132-87,,,,23132-87,23132-87,23132-87,,,,910924.0,910924.0,910924.0


In [88]:
d_id_m.sort_index().head(50)

TAXONOMY,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COMMON,COSMIC,COSMIC,COSMIC
SOURCE,ccle_v1,ccle_v1,ccle_v1,gdsc_v2,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,gdsc_v2,gdsc_v2,gdsc_v2
DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
1321N1,1321N1_CENTRAL_NERVOUS_SYSTEM,,1321N1_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,
143B,143B_BONE,,143B_BONE,,,,,,,,,
201T,,,,201T,201T,201T,,,,1287381.0,1287381.0,1287381.0
22RV1,22RV1_PROSTATE,22RV1_PROSTATE,22RV1_PROSTATE,22RV1,22RV1,22RV1,,,,924100.0,924100.0,924100.0
23132-87,,,,23132-87,23132-87,23132-87,,,,910924.0,910924.0,910924.0
2313287,2313287_STOMACH,2313287_STOMACH,2313287_STOMACH,,,,,,,,,
253J,,253J_URINARY_TRACT,,,,,,,,,,
253JBV,,253JBV_URINARY_TRACT,,,,,,,,,,
42-MG-BA,,,,42-MG-BA,42-MG-BA,42-MG-BA,,,,687561.0,687561.0,687561.0
42MGBA,42MGBA_CENTRAL_NERVOUS_SYSTEM,42MGBA_CENTRAL_NERVOUS_SYSTEM,42MGBA_CENTRAL_NERVOUS_SYSTEM,,,,,,,,,


In [87]:
d1 = d_id_m[('COMMON', src.NCI60_v2)].apply(lambda r: r.dropna())
d1.head()

DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression
MGDS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1321N1,,,
143B,,,
201T,,,
22RV1,,,
23132-87,,,


In [86]:
d_id_m.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1901 entries, 1321N1 to ZR7530
Data columns (total 12 columns):
(COMMON, ccle_v1, gene-copy-number)     995 non-null object
(COMMON, ccle_v1, gene-exome-seq)       904 non-null object
(COMMON, ccle_v1, gene-expression)      967 non-null object
(COMMON, gdsc_v2, gene-copy-number)     996 non-null object
(COMMON, gdsc_v2, gene-exome-seq)       1001 non-null object
(COMMON, gdsc_v2, gene-expression)      981 non-null object
(COMMON, nci60_v2, gene-copy-number)    53 non-null object
(COMMON, nci60_v2, gene-exome-seq)      60 non-null object
(COMMON, nci60_v2, gene-expression)     54 non-null object
(COSMIC, gdsc_v2, gene-copy-number)     996 non-null object
(COSMIC, gdsc_v2, gene-exome-seq)       1001 non-null object
(COSMIC, gdsc_v2, gene-expression)      981 non-null object
dtypes: object(12)
memory usage: 193.1+ KB


In [38]:
d_id = []
for src, d in m_id.items():
    print(src, d.groupby(['MGDS_ID', 'DATA_TYPE']).size().max())
    c_cl = d.filter(regex='CELL_LINE_ID').columns.tolist()
    for c in c_cl:
        print(src, c)
        taxonomy = c.split(':')[1] if ':' in c else 'COMMON'
        d_part = d.set_index(['MGDS_ID', 'DATA_TYPE'])[c].unstack()
        d_part = d_part.add_prefix(taxonomy+':')
        d_id.append(d_part)
d_id = pd.concat(d_id, axis=1, join='outer')
d_id.info()

ccle_v1 1
ccle_v1 CELL_LINE_ID
nci60_v2 1
nci60_v2 CELL_LINE_ID
gdsc_v2 1
gdsc_v2 CELL_LINE_ID


ValueError: Index contains duplicate entries, cannot reshape