In [14]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from mgds.data_aggregation import entity
from py_utils import set_utils 
pd.set_option('display.max_info_rows', 50000000)

# Entity Mapping Process

- Create mapping by entity type (cell line, gene, cancer type, etc.) with schema like:
    - ID, MGDS_ID, SOURCE, VALUE (where rows are unique to MGDS_ID as well as ID)
    - Each of these should be in table named like **cell_line**, **gene**, **cancer_type**, etc.
- A function should allow mappings to be "appended" to the existing mappings for a particular entity type by providing a map keyed by MGDS_ID whose values are themselves maps from **source** to value
    - This function should then be able to merge these additions into the existing ones, with an option to either overwrite existing mappings when they conflict or throw an error
    
    
## See mgds.data_aggregation.entity.append_entity_mapping

TODO: Update and test this function

In [17]:
def get_table_summary(sources, data_types, col_regex):
    """Collect the distinct identifier rows of every available source / data type table.

    Args:
        sources: Iterable of source names handed to ``db.exists`` and ``db.load``.
        data_types: Iterable of data type names checked for each source.
        col_regex: Regular expression passed to ``DataFrame.filter`` to choose
            which columns of the loaded table are kept.

    Returns:
        Nested dict ``{source: {data_type: frame}}``. Each frame holds the
        de-duplicated rows of the matching columns plus constant ``SOURCE`` and
        ``DATA_TYPE`` columns. Combinations for which ``db.exists`` is false are
        logged and left out of the inner dict.
    """
    summary = {}
    for source in sources:
        for data_type in data_types:
            # The per-source dict is created on the first data type visited, so a
            # source only appears in the result once a data type has been iterated.
            by_type = summary.setdefault(source, {})
            logger.debug('Processing source "{}", data type "{}"'.format(source, data_type))
            if db.exists(source, db.IMPORT, data_type):
                matched = db.load(source, db.IMPORT, data_type).filter(regex=col_regex)
                by_type[data_type] = matched.drop_duplicates().assign(SOURCE=source, DATA_TYPE=data_type)
            else:
                logger.info('Data for source "{}" and data type "{}" does not exist'.format(source, data_type))
    return summary


def get_all_ids(sources, data_types):
    """Summarize the cell line and gene ID columns for each source / data type pair."""
    id_column_pattern = 'CELL_LINE_ID|GENE_ID'
    return get_table_summary(sources, data_types, col_regex=id_column_pattern)


def get_cell_line_ids(sources, data_types):
    """Summarize only the cell line ID columns for each source / data type pair."""
    id_column_pattern = 'CELL_LINE_ID'
    return get_table_summary(sources, data_types, col_regex=id_column_pattern)

In [18]:
# Data sources whose imported tables are scanned for identifiers
sources = [
    src.GDSC_v2,
    src.CCLE_v1,
    src.NCI60_v2,
]
# Data types looked up for each of the sources above
data_types = [
    'gene-exome-seq',
    'gene-expression',
    'gene-copy-number',
]

In [19]:
m_id = get_cell_line_ids(sources, data_types)

2016-11-21 09:52:11,712:DEBUG:root: Processing source "gdsc_v2", data type "gene-exome-seq"
2016-11-21 09:52:12,172:DEBUG:root: Processing source "gdsc_v2", data type "gene-expression"
2016-11-21 09:52:23,968:DEBUG:root: Processing source "gdsc_v2", data type "gene-copy-number"
2016-11-21 09:52:51,826:DEBUG:root: Processing source "ccle_v1", data type "gene-exome-seq"
2016-11-21 09:52:51,905:DEBUG:root: Processing source "ccle_v1", data type "gene-expression"
2016-11-21 09:52:54,891:DEBUG:root: Processing source "ccle_v1", data type "gene-copy-number"
2016-11-21 09:52:58,696:DEBUG:root: Processing source "nci60_v2", data type "gene-exome-seq"
2016-11-21 09:52:58,765:DEBUG:root: Processing source "nci60_v2", data type "gene-expression"
2016-11-21 09:52:59,065:DEBUG:root: Processing source "nci60_v2", data type "gene-copy-number"


In [20]:
m_id.keys()

dict_keys(['gdsc_v2', 'nci60_v2', 'ccle_v1'])

In [21]:
# Gene-expression identifier tables, one per source (same order as the names below)
d1, d2, d3 = [
    m_id[source_name]['gene-expression']
    for source_name in ('gdsc_v2', 'ccle_v1', 'nci60_v2')
]

In [22]:
d1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1019 entries, 0 to 17732542
Data columns (total 4 columns):
CELL_LINE_ID:COSMIC    1019 non-null object
CELL_LINE_ID           981 non-null object
DATA_TYPE              1019 non-null object
SOURCE                 1019 non-null object
dtypes: object(4)
memory usage: 39.8+ KB


In [23]:
d2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 967 entries, 0 to 39110064
Data columns (total 3 columns):
CELL_LINE_ID    967 non-null object
DATA_TYPE       967 non-null object
SOURCE          967 non-null object
dtypes: object(3)
memory usage: 30.2+ KB


In [24]:
d3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54 entries, 0 to 1185539
Data columns (total 3 columns):
CELL_LINE_ID    54 non-null object
DATA_TYPE       54 non-null object
SOURCE          54 non-null object
dtypes: object(3)
memory usage: 1.7+ KB


In [36]:
def pivot(m_id):
    """Align the cell line ID columns of every source / data type table side by side.

    Args:
        m_id: Nested dict ``{source: {data_type: DataFrame}}``; every frame must
            contain a ``CELL_LINE_ID`` column.

    Returns:
        DataFrame indexed by ``CELL_LINE_ID`` with one column per input column whose
        name matches ``CELL_LINE_ID``, renamed to ``<source>:<data_type>:<column>``.
        Frames are outer-joined on the index. An empty DataFrame is returned when
        ``m_id`` contains no tables at all.
    """
    frames = []
    for source, tables in m_id.items():
        for data_type, table in tables.items():
            id_cols = table.filter(regex='CELL_LINE_ID').columns.tolist()
            prefix = '{}:{}:'.format(source, data_type)
            # drop=False keeps CELL_LINE_ID as a column so its values remain visible
            # in the result alongside the shared index.
            frames.append(table.set_index('CELL_LINE_ID', drop=False)[id_cols].add_prefix(prefix))
    if not frames:
        # Nothing to align; avoid handing pd.concat an empty list.
        return pd.DataFrame()
    return pd.concat(frames, axis=1, join='outer')

In [37]:
dm = pivot(m_id)
dm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2121 entries, 201T to nan
Data columns (total 12 columns):
gdsc_v2:gene-exome-seq:CELL_LINE_ID             1001 non-null object
gdsc_v2:gene-exome-seq:CELL_LINE_ID:COSMIC      1001 non-null float64
gdsc_v2:gene-copy-number:CELL_LINE_ID           996 non-null object
gdsc_v2:gene-copy-number:CELL_LINE_ID:COSMIC    996 non-null object
gdsc_v2:gene-expression:CELL_LINE_ID:COSMIC     1019 non-null object
gdsc_v2:gene-expression:CELL_LINE_ID            981 non-null object
nci60_v2:gene-exome-seq:CELL_LINE_ID            60 non-null object
nci60_v2:gene-copy-number:CELL_LINE_ID          53 non-null object
nci60_v2:gene-expression:CELL_LINE_ID           54 non-null object
ccle_v1:gene-exome-seq:CELL_LINE_ID             904 non-null object
ccle_v1:gene-copy-number:CELL_LINE_ID           995 non-null object
ccle_v1:gene-expression:CELL_LINE_ID            967 non-null object
dtypes: float64(1), object(11)
memory usage: 215.4+ KB


In [38]:
dm.head()

Unnamed: 0,gdsc_v2:gene-exome-seq:CELL_LINE_ID,gdsc_v2:gene-exome-seq:CELL_LINE_ID:COSMIC,gdsc_v2:gene-copy-number:CELL_LINE_ID,gdsc_v2:gene-copy-number:CELL_LINE_ID:COSMIC,gdsc_v2:gene-expression:CELL_LINE_ID:COSMIC,gdsc_v2:gene-expression:CELL_LINE_ID,nci60_v2:gene-exome-seq:CELL_LINE_ID,nci60_v2:gene-copy-number:CELL_LINE_ID,nci60_v2:gene-expression:CELL_LINE_ID,ccle_v1:gene-exome-seq:CELL_LINE_ID,ccle_v1:gene-copy-number:CELL_LINE_ID,ccle_v1:gene-expression:CELL_LINE_ID
201T,201T,1287381.0,201T,1287381.0,1287381.0,201T,,,,,,
22RV1,22RV1,924100.0,22RV1,924100.0,924100.0,22RV1,,,,,,
22RV1_PROSTATE,,,,,,,,,,22RV1_PROSTATE,22RV1_PROSTATE,22RV1_PROSTATE
23132-87,23132-87,910924.0,23132-87,910924.0,910924.0,23132-87,,,,,,
42-MG-BA,42-MG-BA,687561.0,42-MG-BA,687561.0,687561.0,42-MG-BA,,,,,,


In [15]:
# importlib.reload replaces the deprecated imp.reload (imp was removed in Python 3.12)
import importlib
importlib.reload(entity)

<module 'mgds.data_aggregation.entity' from '/Users/eczech/repos/mgds/python/src/mgds/data_aggregation/entity.py'>

In [16]:
d_m = entity.emtpy_mapping_frame()
d_m

Unnamed: 0,MGDS_ID,MGDS_NAME,SOURCE,VALUE


In [None]:
# NOTE(review): unexecuted scratch cell -- append() is called without the rows to add; TODO supply them or delete this cell
d_m.append()

In [None]:
d_mu = entity.append_entity_mapping(entity.CELL_LINE, d_m)

In [10]:
d1['CELL_LINE_ID'].head()

0    CAL-120
1    CAL-120
2    CAL-120
3    CAL-120
4    CAL-120
Name: CELL_LINE_ID, dtype: object

In [11]:
d2['CELL_LINE_ID'].head()

0    1321N1_CENTRAL_NERVOUS_SYSTEM
2    1321N1_CENTRAL_NERVOUS_SYSTEM
3    1321N1_CENTRAL_NERVOUS_SYSTEM
4    1321N1_CENTRAL_NERVOUS_SYSTEM
5    1321N1_CENTRAL_NERVOUS_SYSTEM
Name: CELL_LINE_ID, dtype: object

In [17]:
d2['CELL_LINE_ID'].unique()

array(['1321N1_CENTRAL_NERVOUS_SYSTEM', '143B_BONE', '22RV1_PROSTATE',
       '2313287_STOMACH', '42MGBA_CENTRAL_NERVOUS_SYSTEM',
       '5637_URINARY_TRACT', '59M_OVARY', '639V_URINARY_TRACT',
       '647V_URINARY_TRACT', '697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE',
       '769P_KIDNEY', '786O_KIDNEY', '8305C_THYROID', '8505C_THYROID',
       '8MGBA_CENTRAL_NERVOUS_SYSTEM', 'A101D_SKIN',
       'A172_CENTRAL_NERVOUS_SYSTEM', 'A204_SOFT_TISSUE', 'A2058_SKIN',
       'A253_SALIVARY_GLAND', 'A2780_OVARY', 'A375_SKIN',
       'A3KAW_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 'A498_KIDNEY',
       'A4FUK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 'A549_LUNG',
       'A673_BONE', 'A704_KIDNEY', 'ABC1_LUNG', 'ACCMESO1_PLEURA',
       'ACHN_KIDNEY', 'AGS_STOMACH', 'ALEXANDERCELLS_LIVER',
       'AM38_CENTRAL_NERVOUS_SYSTEM',
       'AML193_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE',
       'AMO1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 'AN3CA_ENDOMETRIUM',
       'ASPC1_PANCREAS', 'AU565_BREAST', 'AZ521_STOMACH',
    

In [15]:
d3['CELL_LINE_ID'].unique()

array(['786_0', 'A498', 'A549', 'ACHN', 'BT_549', 'CAKI_1', 'CCRF_CEM',
       'COLO205', 'DU_145', 'EKVX', 'HCC_2998', 'HCT_116', 'HCT_15',
       'HL_60', 'HOP_62', 'HOP_92', 'HS578T', 'HT29', 'IGROV1', 'KM12',
       'K_562', 'LOXIMVI', 'M14', 'MALME_3M', 'MCF7', 'MDA_MB_231',
       'MDA_MB_435', 'MDA_N', 'MOLT_4', 'NCI_ADR_RES', 'NCI_H226',
       'NCI_H23', 'NCI_H322M', 'NCI_H460', 'NCI_H522', 'OVCAR_3',
       'OVCAR_4', 'OVCAR_5', 'OVCAR_8', 'PC_3', 'RPMI_8226', 'RXF_393',
       'SK_MEL_2', 'SK_MEL_28', 'SK_MEL_5', 'SK_OV_3', 'SN12C', 'SR',
       'SW_620', 'T47D', 'TK_10', 'UACC_257', 'UACC_62', 'UO_31'], dtype=object)

In [19]:
d_cl_map = pd.DataFrame({'MGDS_ID': []})
d_cl_map

Unnamed: 0,MGDS_ID


In [None]:
# Example mapping rows to append; every column lists exactly one value per row
d_add = pd.DataFrame({
    'ID': [1, 2],
    'MGDS_ID': ['NCIH460', 'NCIH522'],
    'SOURCE': ['nci60_v2', 'nci60_v2'],
    'VALUE': ['NCI_H460', 'NCI_H522']
})

In [20]:
set_utils.analyze_sets(d1['CELL_LINE_ID'].unique(), d2['CELL_LINE_ID'].unique())

{'Stats': {'All': '1949 (100%)',
  'InBoth': '0 (0.00%)',
  'InOnlyOne': '1949 (100.00%)',
  'InOnlySet1': '982 (50.38%)',
  'InOnlySet2': '967 (49.62%)'}}

In [7]:
# NOTE(review): d_id is not assigned in any cell shown here, so this cell relies on kernel state from elsewhere -- TODO define it or drop the cell
d_id.groupby(['DATA_TYPE', 'SOURCE']).size()

DATA_TYPE         SOURCE  
gene-copy-number  ccle_v1     20279095
                  gdsc_v2     46036116
                  nci60_v2     1238451
gene-exome-seq    ccle_v1        53541
                  gdsc_v2       437544
gene-expression   ccle_v1     15512614
                  gdsc_v2     17749961
                  nci60_v2     1207909
dtype: int64