# Across-Source Gene Symbol Mapping
**Local Version**: 1
**Source Version**: NA

Maps gene names/symbols across sources to create a global lookup table.

In [1]:
%run -m ipy_startup
%run -m ipy_logging
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import io_utils as io
from mgds.data_aggregation import entity
pd.set_option('display.max_info_rows', 50000000)

In [2]:
sources = [src.GDSC_v2, src.CCLE_v1, src.NCI60_v2, src.NCIDREAM_v1, src.TCGA_BREAST_v1]
data_types = ['gene-exome-seq', 'gene-expression', 'gene-copy-number', 'gene-rna-seq', 'gene-methylation']
m_id = entity.get_raw_entities(sources, data_types, 'GENE_ID')

2016-11-28 15:28:52,290:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-exome-seq"
2016-11-28 15:28:52,678:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-expression"
2016-11-28 15:28:59,863:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-copy-number"
2016-11-28 15:29:31,837:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-rna-seq"
2016-11-28 15:29:31,838:INFO:mgds.data_aggregation.entity: Data for source "gdsc_v2" and data type "gene-rna-seq" does not exist
2016-11-28 15:29:31,839:DEBUG:mgds.data_aggregation.entity: Processing source "gdsc_v2", data type "gene-methylation"
2016-11-28 15:29:31,839:INFO:mgds.data_aggregation.entity: Data for source "gdsc_v2" and data type "gene-methylation" does not exist
2016-11-28 15:29:31,840:DEBUG:mgds.data_aggregation.entity: Processing source "ccle_v1", data type "gene-exome-seq"
2016-11-28 15:29:31,899:DEBUG:mgds.data

In [33]:
# [[(k, k2, v2.columns) for k2, v2 in v.items()] for k, v in m_id.items()]

In [51]:
import re
regex = re.compile('[\W_]+')
def clean(x):
    return x#regex.sub('', x)

def to_mgds_id(source, v):
    fn = None
    if pd.isnull(v):
        return None
    
    # NCI-60 Special Cases
    if source == src.NCI60_v2:
        return clean(v).upper().strip()
    
    # GDSC/COSMIC Special Cases
    if source == src.GDSC_v2:
        return clean(v).upper().strip()
    
    # CCLE Special Cases
    if source == src.CCLE_v1:
        return clean(v).upper().strip()
    
    if source == src.NCIDREAM_v1:
        return clean(v).upper().strip()

    raise ValueError('Cell line normalization for source "{}" has not been implemented yet'.format(source))

def to_mgds_ids(source, ids):
    return [to_mgds_id(source, v) for v in ids]


def aggregate(m_id):
    r = []
    for source in m_id.keys():
        for data_type, d in m_id[source].items():
            c_cl = d.filter(regex='GENE_ID').columns.tolist()
            d_id = d.copy().assign(MGDS_ID=to_mgds_ids(source, d['GENE_ID:HGNC']))
            for c in c_cl:
                taxonomy = c.split(':')[1] if ':' in c else 'COMMON'
                d_pt = d_id.assign(TAXONOMY=taxonomy)[['MGDS_ID', 'DATA_TYPE', 'SOURCE', 'TAXONOMY', c]]
                d_pt = d_pt.rename(columns={c: 'GENE_ID'})
                r.append(d_pt)
    return pd.concat(r).reset_index(drop=True)

In [52]:
d_id = aggregate(m_id)

# Check to make sure that no gene ids from any source/data type seem to match excel dates
from mgds.data_aggregation import excel_utils
date_cts = d_id.groupby(['DATA_TYPE', 'SOURCE'])\
    .apply(lambda g: excel_utils.is_excel_date(g['GENE_ID']).value_counts())
assert True not in date_cts

In [53]:
d_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403512 entries, 0 to 403511
Data columns (total 5 columns):
MGDS_ID      403512 non-null object
DATA_TYPE    403512 non-null object
SOURCE       403512 non-null object
TAXONOMY     403512 non-null object
GENE_ID      403512 non-null object
dtypes: object(5)
memory usage: 15.4+ MB


In [54]:
# This should return no results when there are no duplicates
cts = d_id.groupby(['SOURCE', 'DATA_TYPE', 'TAXONOMY', 'MGDS_ID']).size()
print(cts.value_counts())
cts[cts > 1]

1    403512
dtype: int64


Series([], dtype: int64)

In [55]:
def singlestr(x):
    assert len(x) == 1
    return x.iloc[0]
d_id_m = d_id.pivot_table(index='MGDS_ID', columns=['TAXONOMY', 'SOURCE', 'DATA_TYPE'], values='GENE_ID', aggfunc=singlestr)
d_id_m.head()

TAXONOMY,CGDS,CGDS,CGDS,CGDS,ENTREZ,ENTREZ,ENTREZ,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC,HGNC
SOURCE,ccle_v1,ccle_v1,nci60_v2,nci60_v2,ccle_v1,nci60_v2,ncidream_v1,ccle_v1,ccle_v1,ccle_v1,...,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1
DATA_TYPE,gene-copy-number,gene-expression,gene-copy-number,gene-expression,gene-exome-seq,gene-exome-seq,gene-copy-number,gene-copy-number,gene-exome-seq,gene-expression,...,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-methylation,gene-rna-seq
MGDS_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
13CDNA73,,,,,,,10129.0,,,,...,,,,,,13CDNA73,,,,
15E1.2,,,,,,,283459.0,,,,...,,,,,,15E1.2,,15E1.2,,
182-FIP,,,,,,,57532.0,,,,...,,,,,,182-FIP,,,,
2'-PDE,,,,,,,,,,,...,,,,,,,,2'-PDE,,
2001-03-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,


In [56]:
d_id_m['HGNC'].sort_index().head(50).tail(51)

SOURCE,ccle_v1,ccle_v1,ccle_v1,gdsc_v2,gdsc_v2,gdsc_v2,nci60_v2,nci60_v2,nci60_v2,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1,ncidream_v1
DATA_TYPE,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-copy-number,gene-exome-seq,gene-expression,gene-methylation,gene-rna-seq
MGDS_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
13CDNA73,,,,,,,,,,13CDNA73,,,,
15E1.2,,,,,,,,,,15E1.2,,15E1.2,,
182-FIP,,,,,,,,,,182-FIP,,,,
2'-PDE,,,,,,,,,,,,2'-PDE,,
2001-03-01 00:00:00,,,,2001-03-01 00:00:00,,,,,,,,,,
2001-09-01 00:00:00,,,,2001-09-01 00:00:00,,,,,,,,,,
2001-12-01 00:00:00,,,,2001-12-01 00:00:00,,,,,,,,,,
2002-03-01 00:00:00,,,,2002-03-01 00:00:00,,,,,,,,,,
2002-09-01 00:00:00,,,,2002-09-01 00:00:00,,,,,,,,,,
2003-03-01 00:00:00,,,,2003-03-01 00:00:00,,,,,,,,,,


In [57]:
d_id_m.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71403 entries, 13CDNA73 to ZZZ3
Data columns (total 21 columns):
(CGDS, ccle_v1, gene-copy-number)          20381 non-null object
(CGDS, ccle_v1, gene-expression)           16042 non-null object
(CGDS, nci60_v2, gene-copy-number)         23367 non-null object
(CGDS, nci60_v2, gene-expression)          22370 non-null object
(ENTREZ, ccle_v1, gene-exome-seq)          2165 non-null object
(ENTREZ, nci60_v2, gene-exome-seq)         11203 non-null object
(ENTREZ, ncidream_v1, gene-copy-number)    27230 non-null object
(HGNC, ccle_v1, gene-copy-number)          20381 non-null object
(HGNC, ccle_v1, gene-exome-seq)            2165 non-null object
(HGNC, ccle_v1, gene-expression)           16042 non-null object
(HGNC, gdsc_v2, gene-copy-number)          46221 non-null object
(HGNC, gdsc_v2, gene-exome-seq)            19100 non-null object
(HGNC, gdsc_v2, gene-expression)           17419 non-null object
(HGNC, nci60_v2, gene-copy-number)         2336