# TCGA Breast Exome Sequencing Data Importation
**Local Version**: 1
**Source Version**: NA

This notebook will import raw TCGA exome sequencing data through the [CGDS](http://www.cbioportal.org/cgds_r.jsp) portal for the study named "Breast Invasive Carcinoma (TCGA, Cell 2015)".

This study is preferred over "Breast Invasive Carcinoma (TCGA, Nature 2012)", despite having slightly fewer samples, because it appears to be newer and includes more data types.

In [8]:
%run -m ipy_startup
%run -m ipy_logging
%matplotlib inline
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import api
from mgds.data_aggregation.import_lib import cgds
from mgds.data_aggregation.import_lib import tcga_breast
from py_utils import assertion_utils
from py_utils.collection_utils import subset

In [2]:
# Disabled exploratory snippet: presumably lists the genetic profiles available for the
# 'brca_tcga_pub2015' study (e.g. to look up the mutation profile id) -- TODO confirm or remove.
# pd.set_option('display.max_colwidth', 500)
# dt = cgds.get_genetic_profiles('brca_tcga_pub2015')
# dt.head()

In [3]:
# Identifiers for the case list and mutation profile to request from CGDS
case_list_id = tcga_breast.CASE_LIST_ID
genetic_profile_id = tcga_breast.PROF_MUTATION
batch_size = 50  # forwarded to CGDS as gene_id_batch_size


def op():
    """Request mutation data for every HUGO gene id using the settings above."""
    return cgds.get_mutation_data(
        case_list_id,
        genetic_profile_id,
        api.get_hugo_gene_ids(),
        gene_id_batch_size=batch_size,
    )


# Route the fetch through the raw-data cache helper
# NOTE(review): presumably `op` is only invoked when no cached copy exists -- confirm in db.cache_raw_operation
d = db.cache_raw_operation(op, src.TCGA_BREAST_v1, 'gene-exome-seq')

2016-11-24 08:26:20,589:DEBUG:mgds.data_aggregation.io_utils: Restoring serialized object from "/Users/eczech/data/research/mgds/raw/tcga-breast_v1_gene-exome-seq.pkl"


In [4]:
# Summarize the raw frame (columns, non-null counts, dtypes) to see which fields are sparsely populated
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53862 entries, 0 to 20
Data columns (total 22 columns):
entrez_gene_id                 53862 non-null float64
gene_symbol                    53862 non-null object
case_id                        53862 non-null object
sequencing_center              53848 non-null object
mutation_status                0 non-null object
mutation_type                  53862 non-null object
validation_status              0 non-null object
amino_acid_change              53862 non-null object
functional_impact_score        42349 non-null object
xvar_link                      45568 non-null object
xvar_link_pdb                  16427 non-null object
xvar_link_msa                  42404 non-null object
chr                            53862 non-null float64
start_position                 53862 non-null float64
end_position                   53862 non-null float64
reference_allele               53656 non-null object
variant_allele                 53862 non-null obje

In [5]:
# Prepare the raw mutation frame; c_rm is the project's default list of ignorable mutation columns
c_rm = cgds.DEFAULT_IGNORABLE_MUTATION_COLS
d_exp = cgds.prep_mutation_data(d, c_rm=c_rm)

# Placeholders for commonly missing fields, applied in the order listed.
# REFERENCE_ALLELE is null less than 1% of the time, and "-" appears to be a
# decent placeholder for missing values (based on frequencies of reference and
# variant allele values).
fill_values = [
    ('FUNCTIONAL_IMPACT_SCORE', 'Unknown'),
    ('SEQUENCING_CENTER', 'Unknown'),
    ('REFERENCE_ALLELE', '-'),
]
for column, placeholder in fill_values:
    d_exp[column] = d_exp[column].fillna(placeholder)

# Confirm that no nulls remain after filling
d_exp.info()

[Remove duplicate records] Records before = 53862, Records after = 53749, Records removed = 113 (%0.21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53749 entries, 0 to 20
Data columns (total 15 columns):
GENE_ID:ENTREZ                53749 non-null int64
GENE_ID:HGNC                  53749 non-null object
CELL_LINE_ID                  53749 non-null object
SEQUENCING_CENTER             53749 non-null object
MUTATION_TYPE                 53749 non-null object
AMINO_ACID_CHANGE             53749 non-null object
FUNCTIONAL_IMPACT_SCORE       53749 non-null object
CHR                           53749 non-null float64
START_POSITION                53749 non-null float64
END_POSITION                  53749 non-null float64
REFERENCE_ALLELE              53749 non-null object
VARIANT_ALLELE                53749 non-null object
REFERENCE_READ_COUNT_TUMOR    53749 non-null float64
VARIANT_READ_COUNT_TUMOR      53749 non-null float64
GENETIC_PROFILE_ID            53749 non-null object
dtypes

In [6]:
# Preview the first rows of the prepared frame
d_exp.head()

Unnamed: 0,GENE_ID:ENTREZ,GENE_ID:HGNC,CELL_LINE_ID,SEQUENCING_CENTER,MUTATION_TYPE,AMINO_ACID_CHANGE,FUNCTIONAL_IMPACT_SCORE,CHR,START_POSITION,END_POSITION,REFERENCE_ALLELE,VARIANT_ALLELE,REFERENCE_READ_COUNT_TUMOR,VARIANT_READ_COUNT_TUMOR,GENETIC_PROFILE_ID
0,29974,A1CF,TCGA-D8-A1J8-01,genome.wustl.edu;unc.edu,Missense_Mutation,R244I,M,10.0,52587953.0,52587953.0,C,A,58.0,76.0,brca_tcga_pub2015_mutations
1,29974,A1CF,TCGA-BH-A0HP-01,genome.wustl.edu;unc.edu,Missense_Mutation,A203V,M,10.0,52595854.0,52595854.0,G,A,43.0,22.0,brca_tcga_pub2015_mutations
2,29974,A1CF,TCGA-A8-A09Z-01,genome.wustl.edu;unc.edu,Missense_Mutation,Y127N,L,10.0,52601632.0,52601632.0,A,T,88.0,45.0,brca_tcga_pub2015_mutations
3,29974,A1CF,TCGA-AC-A2FB-01,genome.wustl.edu,Missense_Mutation,G136E,N,10.0,52596055.0,52596055.0,C,T,31.0,5.0,brca_tcga_pub2015_mutations
4,2,A2M,TCGA-D8-A1JK-01,genome.wustl.edu,Nonsense_Mutation,Q1425*,Unknown,12.0,9221429.0,9221429.0,G,A,100.0,13.0,brca_tcga_pub2015_mutations


In [7]:
# Records are not guaranteed to be unique per cell line, gene, and amino acid change:
# count the records in each such combination, then tabulate how often each count occurs
# (any count above 1 marks a repeated combination)
c_unique = ['CELL_LINE_ID', 'GENE_ID:HGNC', 'AMINO_ACID_CHANGE']
grouped = d_exp.groupby(c_unique)
cts = grouped.size()
cts.value_counts()

1    53741
2        4
dtype: int64

In [10]:
# Validate the prepared frame before persisting it
# NOTE(review): assert_object_types is a project helper; presumably it checks the
# contents of object-typed columns -- confirm in py_utils.assertion_utils
assertion_utils.assert_object_types(d_exp)

# Reduce the null mask through the underlying ndarray so the check always yields a
# single boolean. np.all(DataFrame) can return a per-column Series for some
# numpy/pandas version combinations, which makes a bare `assert` raise ValueError
# instead of performing the check. The message is only built if the check fails.
assert d_exp.notnull().values.all(), (
    'Null values found in columns: {}'.format(
        d_exp.columns[d_exp.isnull().any()].tolist()
    )
)

# Persist the validated frame; kept as the last expression so the returned path is displayed
db.save(d_exp, src.TCGA_BREAST_v1, db.IMPORT, 'gene-exome-seq')

'/Users/eczech/data/research/mgds/import/tcga-breast_v1_gene-exome-seq.pkl'