# TCGA Exome Sequencing Data Importation
**Local Version**: 1
**Source Version**: NA

This notebook will import raw TCGA mutation data through the [CGDS](http://www.cbioportal.org/cgds_r.jsp) portal.

In [1]:
%run -m ipy_startup
%run -m ipy_logging false
%matplotlib inline
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import data_type as dtyp
from mgds.data_aggregation import api
from mgds.data_aggregation.import_lib import cgds
from mgds.data_aggregation.import_lib import tcga
from py_utils import assertion_utils
from py_utils.collection_utils import subset

In [2]:
# Import the raw mutation genetic profiles for the "brca" cohort, requesting the
# gene ids returned by api.get_hugo_gene_ids().
# NOTE: this call is slow -- the log captured below shows it working through
# hundreds of batches over more than half an hour.
tables = tcga.import_genetic_profile_data(
    profile_fmt=tcga.PROF_FMT_MUTATIONS,
    data_type=dtyp.GENE_EXOME_SEQ,
    gene_ids=api.get_hugo_gene_ids(),
    cohorts=['brca']
)

2016-12-19 09:07:41,465:INFO:mgds.data_aggregation.import_lib.tcga: Importing data for study "brca_tcga" (3 of 32), cohort "brca", case list "brca_tcga_all", profile "brca_tcga_mutations", table "brca-gene-exome-seq"
2016-12-19 09:07:41,467:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 1 of 789
2016-12-19 09:10:59,866:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 79 of 789
2016-12-19 09:14:39,630:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 157 of 789
2016-12-19 09:18:53,020:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 235 of 789
2016-12-19 09:23:34,056:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 313 of 789
2016-12-19 09:28:28,297:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 391 of 789
2016-12-19 09:35:33,444:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 469 of 789
2016-12-19 09:40:50,109:INFO:mgds.data_aggregation.import_lib.cgds: Processing batch 547 of 789
2016-12-19 09:45:5

In [16]:
# Development aid: re-import the tcga module so that edits to its source are
# picked up without restarting the kernel.
# `imp` is deprecated (and removed in Python 3.12); importlib.reload is the
# supported equivalent and likewise returns the reloaded module.
import importlib
importlib.reload(tcga)

<module 'mgds.data_aggregation.import_lib.tcga' from '/Users/eczech/repos/mgds/python/src/mgds/data_aggregation/import_lib/tcga.py'>

In [17]:
# Load the exome sequencing records for the "brca" cohort into `d`
# (presumably the data stored by the import step above -- TODO confirm)
d = tcga.load_genetic_profile_data(dtyp.GENE_EXOME_SEQ, cohorts=['brca'])



In [18]:
# Summarize column names, non-null counts and dtypes of the loaded frame
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58818 entries, 899 to 43129779
Data columns (total 5 columns):
GENE_ID         58818 non-null int64
COMMON          58818 non-null object
CELL_LINE_ID    58818 non-null object
VALUE           58818 non-null object
COHORT          58818 non-null object
dtypes: int64(1), object(4)
memory usage: 2.7+ MB


In [19]:
# Tally how many VALUE entries are null (True) versus present (False)
d['VALUE'].isnull().value_counts()

False    58818
Name: VALUE, dtype: int64

In [20]:
# Number of distinct CELL_LINE_ID values (a null id, if present, counts as one value)
d['CELL_LINE_ID'].nunique(dropna=False)

981

In [15]:
# NOTE(review): `ids` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel (Restart & Run All). The recorded output
# (981) equals the number of unique CELL_LINE_ID values counted in the previous cell;
# presumably `ids` was a separately fetched list of sample ids used as a cross-check.
# TODO confirm, then define `ids` explicitly or drop this cell.
len(ids)

981

In [10]:
# Preview the first few non-null VALUE entries
d['VALUE'].dropna().head()

899           T212K
2052         Q2118*
2413          R556K
2544    S1417Kfs*40
2587          R332K
Name: VALUE, dtype: object

In [5]:
# Prepare the raw mutation records, passing the library's default set of
# ignorable mutation columns as the columns to remove
c_rm = cgds.DEFAULT_IGNORABLE_MUTATION_COLS
d_exp = cgds.prep_mutation_data(d, c_rm=c_rm)

# Placeholder values for fields that are commonly missing.
# REFERENCE_ALLELE is null less than 1% of the time, though it appears that "-" is
# a decent placeholder for missing values (based on frequencies of reference and
# variant allele values)
for fill_col, fill_value in [
    ('FUNCTIONAL_IMPACT_SCORE', 'Unknown'),
    ('SEQUENCING_CENTER', 'Unknown'),
    ('REFERENCE_ALLELE', '-'),
]:
    d_exp[fill_col] = d_exp[fill_col].fillna(fill_value)

d_exp.info()

[Remove duplicate records] Records before = 53862, Records after = 53749, Records removed = 113 (%0.21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53749 entries, 0 to 20
Data columns (total 15 columns):
GENE_ID:ENTREZ                53749 non-null int64
GENE_ID:HGNC                  53749 non-null object
CELL_LINE_ID                  53749 non-null object
SEQUENCING_CENTER             53749 non-null object
MUTATION_TYPE                 53749 non-null object
AMINO_ACID_CHANGE             53749 non-null object
FUNCTIONAL_IMPACT_SCORE       53749 non-null object
CHR                           53749 non-null float64
START_POSITION                53749 non-null float64
END_POSITION                  53749 non-null float64
REFERENCE_ALLELE              53749 non-null object
VARIANT_ALLELE                53749 non-null object
REFERENCE_READ_COUNT_TUMOR    53749 non-null float64
VARIANT_READ_COUNT_TUMOR      53749 non-null float64
GENETIC_PROFILE_ID            53749 non-null object
dtypes

In [6]:
# Show the first few prepared mutation records
d_exp.head()

Unnamed: 0,GENE_ID:ENTREZ,GENE_ID:HGNC,CELL_LINE_ID,SEQUENCING_CENTER,MUTATION_TYPE,AMINO_ACID_CHANGE,FUNCTIONAL_IMPACT_SCORE,CHR,START_POSITION,END_POSITION,REFERENCE_ALLELE,VARIANT_ALLELE,REFERENCE_READ_COUNT_TUMOR,VARIANT_READ_COUNT_TUMOR,GENETIC_PROFILE_ID
0,29974,A1CF,TCGA-D8-A1J8-01,genome.wustl.edu;unc.edu,Missense_Mutation,R244I,M,10.0,52587953.0,52587953.0,C,A,58.0,76.0,brca_tcga_pub2015_mutations
1,29974,A1CF,TCGA-BH-A0HP-01,genome.wustl.edu;unc.edu,Missense_Mutation,A203V,M,10.0,52595854.0,52595854.0,G,A,43.0,22.0,brca_tcga_pub2015_mutations
2,29974,A1CF,TCGA-A8-A09Z-01,genome.wustl.edu;unc.edu,Missense_Mutation,Y127N,L,10.0,52601632.0,52601632.0,A,T,88.0,45.0,brca_tcga_pub2015_mutations
3,29974,A1CF,TCGA-AC-A2FB-01,genome.wustl.edu,Missense_Mutation,G136E,N,10.0,52596055.0,52596055.0,C,T,31.0,5.0,brca_tcga_pub2015_mutations
4,2,A2M,TCGA-D8-A1JK-01,genome.wustl.edu,Nonsense_Mutation,Q1425*,Unknown,12.0,9221429.0,9221429.0,G,A,100.0,13.0,brca_tcga_pub2015_mutations


In [7]:
# Note that records are not necessarily unique per cell line, gene, and amino acid
# change: count the records sharing each such key, then tabulate how often each
# group size occurs (a group size above 1 means the key is repeated)
c_unique = ['CELL_LINE_ID', 'GENE_ID:HGNC', 'AMINO_ACID_CHANGE']
cts = d_exp.groupby(c_unique).size()
cts.value_counts()

1    53741
2        4
dtype: int64

In [10]:
# Validate the prepared frame before persisting it: run the project's object-type
# assertion, then require that no cell in the frame is null
assertion_utils.assert_object_types(d_exp)
assert d_exp.notnull().values.all()

# Persist the result; the call returns the path of the written file (shown below)
db.save(d_exp, src.TCGA_BREAST_v1, db.IMPORT, 'gene-exome-seq')

'/Users/eczech/data/research/mgds/import/tcga-breast_v1_gene-exome-seq.pkl'