# CCLE Exome Sequencing Data Importation
**Local Version**: 1
**Source Version**: NA

This notebook will import raw CCLE exome sequencing data through the [CGDS](http://www.cbioportal.org/cgds_r.jsp) (aka "Cancer Genomic Data Server") portal.  This should not be confused with the [GDSC](http://www.cancerrxgene.org/) portal, which is a separate data source entirely.

In [1]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import api
from mgds.data_aggregation.import_lib import ccle
from mgds.data_aggregation.import_lib import cgds
from py_utils.collection_utils import subset

In [2]:
# Identifiers for the CCLE case list and mutation profile, taken from the import library
case_list_id = ccle.CASE_LIST_ID
genetic_profile_id = ccle.PROF_MUTATION
batch_size = 50


def op():
    """Request mutation data from CGDS for the gene ids returned by ``api.get_hugo_gene_ids()``.

    ``batch_size`` is forwarded as ``gene_id_batch_size``.  Nothing is fetched
    until this function is actually invoked.
    """
    gene_ids = api.get_hugo_gene_ids()
    return cgds.get_mutation_data(
        case_list_id, genetic_profile_id,
        gene_ids, gene_id_batch_size=batch_size
    )


# The operation is handed over uncalled so the caching layer decides whether to run it
# (the log output below shows a previously serialized result being restored)
d = db.cache_raw_operation(op, src.CCLE_v1, 'gene-exome-seq')

2016-11-19 20:45:30,647:DEBUG:mgds.data_aggregation.io_utils: Restoring serialized object from "/Users/eczech/data/research/mgds/raw/ccle_v1_gene-exome-seq.pkl"


In [10]:
# It was confirmed previously that the CGDS endpoint for pivoted genetic data (i.e. values are CSV lists of
# amino acid changes) contains exactly the same number of gene + cell line combinations as the similar but
# more detailed endpoint for mutations in long format (so the long-format endpoint will be used instead)
# d_pivot = pd.read_pickle('/Users/eczech/data/research/musc_genomics/materialized/cgds_genetic_mu.pkl')
# dt = pd.melt(d_pivot, id_vars=['GENE_ID', 'COMMON'], var_name='SAMPLE', value_name='VALUE')
# len(dt[dt['VALUE'].notnull()].groupby(['SAMPLE', 'GENE_ID']).size())
# > 53541

# d_meta = pd.read_pickle('/Users/eczech/data/research/musc_genomics/materialized/cgds_meta_mu.pkl')
# len(d_meta.groupby(['case_id', 'gene_symbol']).size())
# > 53541

In [3]:
# Preview the first few rows of the raw mutation records
d.head()

Unnamed: 0,entrez_gene_id,gene_symbol,case_id,sequencing_center,mutation_status,mutation_type,validation_status,amino_acid_change,functional_impact_score,xvar_link,...,chr,start_position,end_position,reference_allele,variant_allele,reference_read_count_tumor,variant_read_count_tumor,reference_read_count_normal,variant_read_count_normal,genetic_profile_id
0,22848.0,AAK1,KIJK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,broad.mit.edu,,Nonsense_Mutation,,Q743*,,"getma.org/?cm=var&var=hg19,2,69732743,G,A&fts=all",...,2.0,69732743.0,69732743.0,G,A,170.0,40.0,,,cellline_ccle_broad_mutations
1,22848.0,AAK1,REH_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,broad.mit.edu,,Missense_Mutation,,G15D,L,"getma.org/?cm=var&var=hg19,2,69870129,C,T&fts=all",...,2.0,69870129.0,69870129.0,C,T,10.0,12.0,,,cellline_ccle_broad_mutations
2,22848.0,AAK1,HEC108_ENDOMETRIUM,broad.mit.edu,,Missense_Mutation,,Q533H,N,"getma.org/?cm=var&var=hg19,2,69741780,C,G&fts=all",...,2.0,69741780.0,69741780.0,C,G,446.0,129.0,,,cellline_ccle_broad_mutations
3,22848.0,AAK1,RERFLCAD2_LUNG,broad.mit.edu,,Missense_Mutation,,P771R,N,"getma.org/?cm=var&var=hg19,2,69723170,G,C&fts=all",...,2.0,69723170.0,69723170.0,G,C,2.0,21.0,,,cellline_ccle_broad_mutations
4,22848.0,AAK1,NCIH650_LUNG,broad.mit.edu,,Missense_Mutation,,P336T,M,"getma.org/?cm=var&var=hg19,2,69752214,G,T&fts=all",...,2.0,69752214.0,69752214.0,G,T,151.0,99.0,,,cellline_ccle_broad_mutations


In [4]:
# Summarize column dtypes and non-null counts of the raw mutation records
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61846 entries, 0 to 229
Data columns (total 22 columns):
entrez_gene_id                 61846 non-null float64
gene_symbol                    61846 non-null object
case_id                        61846 non-null object
sequencing_center              61846 non-null object
mutation_status                0 non-null object
mutation_type                  61846 non-null object
validation_status              0 non-null object
amino_acid_change              61846 non-null object
functional_impact_score        46392 non-null object
xvar_link                      49796 non-null object
xvar_link_pdb                  22168 non-null object
xvar_link_msa                  46434 non-null object
chr                            61846 non-null float64
start_position                 61846 non-null float64
end_position                   61846 non-null float64
reference_allele               61846 non-null object
variant_allele                 61846 non-null obj

In [5]:
# Columns to drop: the library defaults plus the normal-sample read counts
# (which are empty in the preview rows above)
normal_read_count_cols = [
    'reference_read_count_normal',
    'variant_read_count_normal',
]
c_rm = cgds.DEFAULT_IGNORABLE_MUTATION_COLS + normal_read_count_cols
d_exp = cgds.prep_mutation_data(d, c_rm=c_rm)

# Missing functional impact scores are labeled explicitly rather than left null
impact_col = 'FUNCTIONAL_IMPACT_SCORE'
d_exp[impact_col] = d_exp[impact_col].fillna('Unknown')
d_exp.info()

[Remove duplicate records] Records before = 61846, Records after = 61664, Records removed = 182 (%0.29)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61664 entries, 0 to 229
Data columns (total 15 columns):
GENE_ID:ENTREZ                61664 non-null int64
GENE_ID:HGNC                  61664 non-null object
CELL_LINE_ID                  61664 non-null object
SEQUENCING_CENTER             61664 non-null object
MUTATION_TYPE                 61664 non-null object
AMINO_ACID_CHANGE             61664 non-null object
FUNCTIONAL_IMPACT_SCORE       61664 non-null object
CHR                           61664 non-null float64
START_POSITION                61664 non-null float64
END_POSITION                  61664 non-null float64
REFERENCE_ALLELE              61664 non-null object
VARIANT_ALLELE                61664 non-null object
REFERENCE_READ_COUNT_TUMOR    61662 non-null float64
VARIANT_READ_COUNT_TUMOR      61664 non-null float64
GENETIC_PROFILE_ID            61664 non-null object
dtype

In [6]:
# At time of writing (TOW), the tumor reference read count was null for a minuscule
# fraction of records, so any missing values are replaced with the column mean
ref_count_col = 'REFERENCE_READ_COUNT_TUMOR'
if d_exp[ref_count_col].isnull().any():
    d_exp[ref_count_col] = d_exp[ref_count_col].fillna(d_exp[ref_count_col].mean())

In [7]:
# Re-check non-null counts after imputing REFERENCE_READ_COUNT_TUMOR
d_exp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61664 entries, 0 to 229
Data columns (total 15 columns):
GENE_ID:ENTREZ                61664 non-null int64
GENE_ID:HGNC                  61664 non-null object
CELL_LINE_ID                  61664 non-null object
SEQUENCING_CENTER             61664 non-null object
MUTATION_TYPE                 61664 non-null object
AMINO_ACID_CHANGE             61664 non-null object
FUNCTIONAL_IMPACT_SCORE       61664 non-null object
CHR                           61664 non-null float64
START_POSITION                61664 non-null float64
END_POSITION                  61664 non-null float64
REFERENCE_ALLELE              61664 non-null object
VARIANT_ALLELE                61664 non-null object
REFERENCE_READ_COUNT_TUMOR    61664 non-null float64
VARIANT_READ_COUNT_TUMOR      61664 non-null float64
GENETIC_PROFILE_ID            61664 non-null object
dtypes: float64(5), int64(1), object(9)
memory usage: 7.5+ MB


In [8]:
# Records are not guaranteed to be unique per cell line, gene, and amino acid change;
# tabulate how many records share each such combination
c_unique = [
    'CELL_LINE_ID',
    'GENE_ID:HGNC',
    'AMINO_ACID_CHANGE',
]
cts = d_exp.groupby(by=c_unique).size()
cts.value_counts()

1    61662
2        1
dtype: int64

In [9]:
# Guard the export: the prepared table must not contain any missing values.
# The reduction is done on the underlying ndarray so it always yields a single
# boolean; np.all() applied directly to a DataFrame can return a per-column
# Series depending on the installed numpy/pandas versions, which makes the
# assert raise an ambiguous-truth-value error instead of checking the data.
assert d_exp.notnull().values.all(), 'Null values found in prepared mutation data'

# Persist the prepared frame as the CCLE v1 import; the saved path is displayed as the cell output
db.save(d_exp, src.CCLE_v1, db.IMPORT, 'gene-exome-seq')

'/Users/eczech/data/research/mgds/import/ccle_v1_gene-exome-seq.pkl'