In [1]:
%run -m ipy_startup
%run -m ipy_logging false
%matplotlib inline
from mgds.data_aggregation import database as db
from mgds.data_aggregation import source as src
from mgds.data_aggregation import data_type as dtyp
from mgds.data_aggregation import api
from mgds.data_aggregation.import_lib import cgds
from mgds.data_aggregation.import_lib import tcga

In [3]:
# Cohort identifiers passed to the TCGA loader below; this notebook only processes 'brca'
cohorts = ['brca']

def add_tcga_raw_data(d, data_type, modifier_fn, cohorts):
    """Load one TCGA genetic profile and store it in ``d`` under ``data_type``.

    Args:
        d: dict that receives the loaded data; it is modified in place.
        data_type: key under which the result is stored, and the input to ``modifier_fn``.
        modifier_fn: callable applied to ``data_type`` to produce the profile
            name handed to ``tcga.load_genetic_profile_data``.
        cohorts: forwarded unchanged as the ``cohorts`` keyword of the loader.

    Returns:
        None; the result is only visible through ``d``.
    """
    profile_name = modifier_fn(data_type)
    loaded = tcga.load_genetic_profile_data(profile_name, cohorts=cohorts)
    d[data_type] = loaded
    
def get_all_tcga_raw_data(cohorts):
    """Load every TCGA data type used by this notebook for the given cohorts.

    Each data type is paired with the function that turns it into the profile
    name to load; types listed with ``identity`` are loaded under their own name.

    Args:
        cohorts: forwarded to ``add_tcga_raw_data`` for every data type.

    Returns:
        dict mapping data type -> whatever ``add_tcga_raw_data`` stored for it,
        filled in the order the data types are listed below.
    """
    identity = lambda x: x

    # (data type, name modifier) pairs, in load order
    load_plan = (
        (dtyp.GENE_EXPRESSION, dtyp.add_normalized_modifier),
        (dtyp.GENE_COPY_NUMBER, dtyp.add_putative_modifier),
        (dtyp.GENE_METHYLATION, identity),
        (dtyp.GENE_RNA_SEQ, dtyp.add_normalized_modifier),
        (dtyp.GENE_RPPA, dtyp.add_normalized_modifier),
        (dtyp.GENE_EXOME_SEQ, identity),
    )

    raw = {}
    for data_type, modifier_fn in load_plan:
        add_tcga_raw_data(raw, data_type, modifier_fn, cohorts)
    return raw

# Load all raw TCGA tables for the selected cohorts, keyed by data type.
# NOTE: the recorded log timestamps span over a minute, so this cell is slow to re-run.
d = get_all_tcga_raw_data(cohorts)

2016-12-20 13:13:54,999:DEBUG:mgds.data_aggregation.import_lib.tcga: Dropped 12696 completely duplicated records from table "brca-gene-expression-normalized"
2016-12-20 13:14:10,570:DEBUG:mgds.data_aggregation.import_lib.tcga: Dropped 38880 completely duplicated records from table "brca-gene-copy-number-putative"
2016-12-20 13:14:29,893:DEBUG:mgds.data_aggregation.import_lib.tcga: Dropped 16546 completely duplicated records from table "brca-gene-methylation"
2016-12-20 13:14:46,263:DEBUG:mgds.data_aggregation.import_lib.tcga: Dropped 27500 completely duplicated records from table "brca-gene-rna-seq-normalized"
2016-12-20 13:15:09,663:DEBUG:mgds.data_aggregation.import_lib.tcga: Dropped 105 completely duplicated records from table "brca-gene-exome-seq"


In [4]:
# Replace the exome sequencing VALUE column with a float flag:
# 1.0 where a value is present (a mutation of some kind), 0.0 where it is null.
# NOTE: this overwrites the column in place, so the cell is not idempotent --
# running it a second time finds no nulls and sets every row to 1.0.
d[dtyp.GENE_EXOME_SEQ]['VALUE'] = d[dtyp.GENE_EXOME_SEQ]['VALUE'].notnull().astype(np.float64)

In [5]:
def prep_pivot_tcga_data(d):
    """Pivot each long-format TCGA table into a sample-by-gene matrix.

    Args:
        d: dict mapping data type -> DataFrame containing at least the columns
            'COHORT', 'CELL_LINE_ID', 'GENE_ID:HGNC' and 'VALUE'.

    Returns:
        dict with the same keys as ``d``; each value is a DataFrame indexed by
        ('COHORT', 'CELL_LINE_ID') with one column per 'GENE_ID:HGNC' value,
        holding the corresponding 'VALUE'.

    Raises:
        AssertionError: if a table has more than one record for the same
            (COHORT, CELL_LINE_ID, GENE_ID:HGNC) combination.
    """
    # Row and column keys of the pivoted matrices (identical for every data type)
    c_idx = ['COHORT', 'CELL_LINE_ID']
    c_col = ['GENE_ID:HGNC']

    dp = {}
    for dt, df in d.items():
        # Before pivoting, ensure that there will be only one record per cell
        mask = df[c_idx + c_col].duplicated()
        assert not np.any(mask), 'Found duplicated records for data type "{}"'.format(dt)

        # Uniqueness was verified above, so 'first' just picks the single value per cell
        dp[dt] = df.pivot_table(index=c_idx, columns=c_col, values='VALUE', aggfunc='first')

    return dp

# Pivot every raw table into a matrix; `dp` is keyed by data type like `d`
dp = prep_pivot_tcga_data(d)

In [6]:
# Sanity-check the pivoted gene expression matrix (index size, column count, dtypes, memory).
# Note that this rebinds the global name `df` to the pivoted matrix.
df = dp[dtyp.GENE_EXPRESSION]
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 529 entries, (brca, TCGA-A1-A0SD-01) to (brca, TCGA-E2-A1BD-01)
Columns: 16810 entries, A1BG to ZZZ3
dtypes: float64(16810)
memory usage: 67.9+ MB


In [10]:
# Pairwise gene overlap between data types: the cell at (dt1, dt2) is the
# fraction of dt1's gene columns that also appear among dt2's gene columns.
p = (
    pd.DataFrame(
        [
            (
                dt1,
                dt2,
                len(dp[dt1].columns.intersection(dp[dt2].columns)) / len(dp[dt1].columns),
            )
            for dt1 in dp.keys()
            for dt2 in dp.keys()
        ],
        columns=['dt1', 'dt2', 'pct'],
    )
    .set_index(['dt1', 'dt2'])['pct']
    .unstack()
)
p

dt2,gene-copy-number,gene-exome-seq,gene-expression,gene-methylation,gene-rna-seq,gene-rppa
dt1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gene-copy-number,1.0,0.654626,0.722017,0.674985,0.81432,0.007302
gene-exome-seq,0.986345,1.0,0.891211,0.821188,0.96913,0.009902
gene-expression,1.0,0.819215,1.0,0.853778,0.995122,0.009994
gene-methylation,0.984526,0.794951,0.899135,1.0,0.980767,0.0104
gene-rna-seq,1.0,0.789862,0.882325,0.825729,1.0,0.008967
gene-rppa,0.988372,0.889535,0.976744,0.965116,0.988372,1.0


In [25]:
# For the methylation matrix, count the null gene values in each row (one row per
# cohort/sample pair), then show the most frequent per-row null counts.
dp[dtyp.GENE_METHYLATION].isnull().sum(axis=1).value_counts().head()

0    277
1    181
2    101
4     31
3     30
dtype: int64

In [26]:
# dp[dtyp.GENE_RPPA].isnull().mean(axis=0).sort_values()

In [27]:
# Summarize the pivoted copy number matrix (index size, column count, dtypes, memory)
dp[dtyp.GENE_COPY_NUMBER].info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1080 entries, (brca, TCGA-3C-AAAU-01) to (brca, TCGA-Z7-A8R6-01)
Columns: 23282 entries, A1BG to ZZZ3
dtypes: float64(23282)
memory usage: 191.8+ MB


In [35]:
# Persist the dict of pivoted matrices through the project's db helper.
# The recorded output shows the call returning the path of the written .pkl file.
db.save_obj(dp, src.TCGA_v1, db.PREP, 'raw-data-matrices')

'/Users/eczech/data/research/mgds/prep/tcga_v1_raw-data-matrices.pkl'