# EDA - Pipeline Result Aggregation

In [8]:
import pandas as pd
import seaborn as sns
import numpy as np
import os

# Directory holding the cached pipeline outputs read by this notebook.
# Set the HAMMER_PIPELINE_DIR environment variable to point at a different
# location; when it is unset the original author's local path is used.
PIPELINE_DIR = os.environ.get(
    'HAMMER_PIPELINE_DIR',
    '/Users/eczech/projects/hammer/cache/pipeline'
)

def pipeline_path(filename):
    """Return the path of `filename` inside PIPELINE_DIR.

    Args:
        filename: Name of a pipeline output file, e.g. 'gene_meta.csv'.

    Returns:
        PIPELINE_DIR joined with `filename` via os.path.join.
    """
    return os.path.join(PIPELINE_DIR, filename)

In [41]:
# Annotation columns retained from the gene metadata export.
gene_meta_cols = [
    'Gene', 'Gene synonym', 'Ensembl', 'Chromosome',
    'RNA tissue category', 'RNA TS', 'RNA TS TPM',
    'Protein classes'
]
d_gene = pd.read_csv(pipeline_path('gene_meta.csv'))[gene_meta_cols]
# Normalise headers to title case with spaces removed,
# e.g. 'RNA TS TPM' -> 'RnaTsTpm'.
d_gene = d_gene.rename(
    columns={c: c.title().replace(' ', '') for c in gene_meta_cols}
)
d_gene.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6973 entries, 0 to 6972
Data columns (total 8 columns):
Gene                 6973 non-null object
GeneSynonym          5868 non-null object
Ensembl              6973 non-null object
Chromosome           6973 non-null object
RnaTissueCategory    6973 non-null object
RnaTs                3122 non-null float64
RnaTsTpm             3122 non-null object
ProteinClasses       6973 non-null object
dtypes: float64(1), object(7)
memory usage: 435.9+ KB


In [42]:
# Number of metadata rows per gene symbol.
cts = d_gene['Gene'].value_counts()
# Symbols that occur on more than one row.
dupe_genes = cts.index[cts.values > 1].values
# Display every row belonging to a duplicated symbol.
d_gene.loc[d_gene['Gene'].isin(dupe_genes)]

Unnamed: 0,Gene,GeneSynonym,Ensembl,Chromosome,RnaTissueCategory,RnaTs,RnaTsTpm,ProteinClasses
1809,CRHR1,"CRF-R, CRF1, CRHR",ENSG00000120088,17,Tissue enhanced,0.0,cerebral cortex: 4.0,"('Predicted secreted proteins', 'G-protein cou..."
3549,TMBIM4,"CGI-119, GAAP, LFG4, S1R, ZPRO",ENSG00000155957,12,Expressed in all,,,"('Predicted intracellular proteins', 'Transpor..."
6841,CRHR1,,ENSG00000263715,17,Tissue enriched,9.0,cerebral cortex: 1.1,"('Predicted membrane proteins',)"
6967,TMBIM4,,ENSG00000282031,12,Expressed in all,,,"('Predicted membrane proteins',)"


In [43]:
# Resolve duplicated gene symbols: a row is kept when its symbol is unique or
# when it carries synonym annotations. The duplicate mask is computed here
# rather than taken from an earlier cell, so this cell can be re-run safely.
is_dupe = d_gene['Gene'].duplicated(keep=False)
genes_before = set(d_gene['Gene'])
d_gene = d_gene[~is_dupe | d_gene['GeneSynonym'].notnull()]
# If no row of a duplicated symbol has synonyms, the filter removes all of
# them; fail loudly instead of silently losing the gene.
genes_lost = genes_before - set(d_gene['Gene'])
assert not genes_lost, (
    'Duplicate resolution dropped genes entirely: {}'.format(sorted(genes_lost))
)

In [44]:
# After duplicate resolution each gene symbol must appear on exactly one row.
assert d_gene['Gene'].is_unique

In [18]:
# Load the pipeline's expression export and summarise its columns, dtypes
# and memory footprint.
expression_csv = pipeline_path('expression_data.csv')
d_exp = pd.read_csv(expression_csv)
d_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7031200 entries, 0 to 7031199
Data columns (total 5 columns):
StudyId     object
GeneId      int64
Gene        object
SampleId    object
Value       float64
dtypes: float64(1), int64(1), object(3)
memory usage: 268.2+ MB


In [40]:
# Each (StudyId, Gene, SampleId) combination must occur at most once.
assert not d_exp.duplicated(subset=['StudyId', 'Gene', 'SampleId']).any()

In [52]:
# Deciles from 10% to 90% plus the 95th and 99th percentiles.
stat_percentiles = list(np.arange(.1, 1, .1)) + [.95, .99]
# Summary statistics of Value for every (StudyId, Gene) pair.
d_exp_stat = d_exp.groupby(['StudyId', 'Gene'])['Value'].describe(
    percentiles=stat_percentiles
)
# Tidy the statistic labels: strip any '.0' and title-case them.
# NOTE(review): the saved output of this cell shows lowercase labels
# ('count', 'mean', ...), so it appears to predate the .title() call; re-run
# the cell to refresh it.
d_exp_stat = d_exp_stat.rename(
    columns={c: c.replace('.0', '').title() for c in d_exp_stat.columns}
)
d_exp_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,95%,99%,max
StudyId,Gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
brca_tcga,A2M,1100.0,0.05182,1.363112,-1.0588,-0.7215,-0.59392,-0.48755,-0.36928,-0.23795,-0.0914,0.08639,0.37478,0.79726,1.40161,5.956341,18.1618
brca_tcga,A4GALT,1100.0,-0.023173,1.208878,-1.1802,-0.94597,-0.81674,-0.66042,-0.48544,-0.31505,-0.11122,0.14496,0.48208,1.05783,1.83109,4.053255,13.3504
brca_tcga,AACS,1100.0,0.027011,1.376142,-1.4562,-0.84663,-0.6567,-0.48878,-0.37268,-0.2289,-0.06692,0.14003,0.47028,1.0058,1.552025,3.956352,27.5
brca_tcga,AADACL3,1100.0,0.110695,2.393671,-0.1546,-0.1546,-0.1546,-0.1546,-0.1546,-0.1546,-0.1546,-0.1546,-0.1546,0.15165,0.5327,3.813125,63.2633
brca_tcga,AADACL4,1100.0,-0.011497,0.994809,-0.2338,-0.2338,-0.2338,-0.2338,-0.2338,-0.2338,-0.2338,-0.2338,-0.2338,-0.2338,1.57204,4.897099,12.7107


In [58]:
# Gene annotations: every column except the join key gets a 'Meta:' prefix.
meta_part = d_gene.set_index('Gene').add_prefix('Meta:').reset_index()
# Per-(study, gene) statistics with a 'Stat:' prefix; StudyId and Gene are
# moved out of the index so they become ordinary columns.
stat_part = (
    d_exp_stat.add_prefix('Stat:')
    .reset_index(level='StudyId')
    .reset_index()
)
# Keep only genes present in both tables.
# Note: the following cell assigns `d` again.
d = pd.merge(meta_part, stat_part, on='Gene', how='inner')
d.head()

Unnamed: 0,Gene,Meta:GeneSynonym,Meta:Ensembl,Meta:Chromosome,Meta:RnaTissueCategory,Meta:RnaTs,Meta:RnaTsTpm,Meta:ProteinClasses,StudyId,Stat:count,...,Stat:30%,Stat:40%,Stat:50%,Stat:60%,Stat:70%,Stat:80%,Stat:90%,Stat:95%,Stat:99%,Stat:max
0,TSPAN6,"T245, TM4SF6, TSPAN-6",ENSG00000000003,X,Mixed,,,"('Predicted intracellular proteins', 'Predicte...",brca_tcga,1100.0,...,-0.6069,-0.41922,-0.2053,0.04008,0.32853,0.69582,1.40347,2.02459,3.321086,23.623
1,TNMD,"BRICD4, ChM1L, myodulin, TEM, tendin",ENSG00000000005,X,Tissue enhanced,0.0,adipose tissue: 10.1;seminal vesicle: 32.9,"('Predicted membrane proteins',)",brca_tcga,1100.0,...,-0.0676,-0.0663,-0.0641,-0.0603,-0.0526,-0.03592,0.00056,0.05987,0.313324,26.9921
2,CFH,"ARMD4, ARMS1, FHL1, HF, HF1, HF2, HUS",ENSG00000000971,1,Tissue enhanced,0.0,liver: 838.9,"('Cancer-related genes', 'Predicted secreted p...",brca_tcga,1100.0,...,-0.42449,-0.32032,-0.2354,-0.12146,-0.00405,0.1671,0.44898,0.800265,1.792808,12.5099
3,GCLC,"GCS, GLCL, GLCLC",ENSG00000001084,6,Expressed in all,,,"('Enzymes', 'Predicted membrane proteins', 'Pl...",brca_tcga,1100.0,...,-0.5771,-0.43612,-0.2551,-0.08088,0.1335,0.4834,1.17706,1.92946,3.897708,10.1985
4,NIPAL3,"DJ462O23.2, NPAL3",ENSG00000001461,1,Expressed in all,,,"('Predicted intracellular proteins', 'Transpor...",brca_tcga,1100.0,...,-0.79816,-0.64802,-0.46735,-0.24504,-0.01158,0.31056,0.91338,1.55559,3.842587,7.6737


In [53]:
# Combine gene annotations with the per-study expression statistics.
#
# A key-based merge replaces the earlier index-aligned pd.concat(axis=1):
# concat needs a unique Gene index on both sides, which only holds while the
# data contains a single StudyId. The merge repeats each gene's annotations
# once per study and yields the same column layout:
# Meta:Gene, Meta:<annotation>..., Meta:StudyId, Stat:<statistic>...
gene_annotations = d_gene.add_prefix('Meta:')
study_stats = (
    d_exp_stat.add_prefix('Stat:')
    .reset_index()
    .rename(columns={'Gene': 'Meta:Gene', 'StudyId': 'Meta:StudyId'})
)
# Inner join: keep only genes that have both annotations and statistics.
d = pd.merge(gene_annotations, study_stats, how='inner', on='Meta:Gene')
d.head()

Unnamed: 0,Meta:Gene,Meta:GeneSynonym,Meta:Ensembl,Meta:Chromosome,Meta:RnaTissueCategory,Meta:RnaTs,Meta:RnaTsTpm,Meta:ProteinClasses,Meta:StudyId,Stat:count,...,Stat:30%,Stat:40%,Stat:50%,Stat:60%,Stat:70%,Stat:80%,Stat:90%,Stat:95%,Stat:99%,Stat:max
0,TSPAN6,"T245, TM4SF6, TSPAN-6",ENSG00000000003,X,Mixed,,,"('Predicted intracellular proteins', 'Predicte...",brca_tcga,1100.0,...,-0.6069,-0.41922,-0.2053,0.04008,0.32853,0.69582,1.40347,2.02459,3.321086,23.623
1,TNMD,"BRICD4, ChM1L, myodulin, TEM, tendin",ENSG00000000005,X,Tissue enhanced,0.0,adipose tissue: 10.1;seminal vesicle: 32.9,"('Predicted membrane proteins',)",brca_tcga,1100.0,...,-0.0676,-0.0663,-0.0641,-0.0603,-0.0526,-0.03592,0.00056,0.05987,0.313324,26.9921
2,CFH,"ARMD4, ARMS1, FHL1, HF, HF1, HF2, HUS",ENSG00000000971,1,Tissue enhanced,0.0,liver: 838.9,"('Cancer-related genes', 'Predicted secreted p...",brca_tcga,1100.0,...,-0.42449,-0.32032,-0.2354,-0.12146,-0.00405,0.1671,0.44898,0.800265,1.792808,12.5099
3,GCLC,"GCS, GLCL, GLCLC",ENSG00000001084,6,Expressed in all,,,"('Enzymes', 'Predicted membrane proteins', 'Pl...",brca_tcga,1100.0,...,-0.5771,-0.43612,-0.2551,-0.08088,0.1335,0.4834,1.17706,1.92946,3.897708,10.1985
4,NIPAL3,"DJ462O23.2, NPAL3",ENSG00000001461,1,Expressed in all,,,"('Predicted intracellular proteins', 'Transpor...",brca_tcga,1100.0,...,-0.79816,-0.64802,-0.46735,-0.24504,-0.01158,0.31056,0.91338,1.55559,3.842587,7.6737


In [54]:
# Summarise the aggregated frame: row count, per-column non-null counts and dtypes.
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6325 entries, 0 to 6324
Data columns (total 25 columns):
Meta:Gene                 6325 non-null object
Meta:GeneSynonym          5500 non-null object
Meta:Ensembl              6325 non-null object
Meta:Chromosome           6325 non-null object
Meta:RnaTissueCategory    6325 non-null object
Meta:RnaTs                2844 non-null float64
Meta:RnaTsTpm             2844 non-null object
Meta:ProteinClasses       6325 non-null object
Meta:StudyId              6325 non-null object
Stat:count                6325 non-null float64
Stat:mean                 6325 non-null float64
Stat:std                  6325 non-null float64
Stat:min                  6325 non-null float64
Stat:10%                  6325 non-null float64
Stat:20%                  6325 non-null float64
Stat:30%                  6325 non-null float64
Stat:40%                  6325 non-null float64
Stat:50%                  6325 non-null float64
Stat:60%                  6325 non-nu