In [1]:
import pandas as pd
import numpy as np
import os
import sys
import re

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas deprecation/future
# warnings raised by the cells below — confirm that is intended.
warnings.filterwarnings('ignore')

In [8]:
# Load the COSMIC gene-expression table; the file is tab-separated and its
# first row supplies the column names.
# NOTE(review): a later preview shows a GENE_NAME value of 39340, which looks
# like a spreadsheet-mangled gene symbol — confirm against the raw file.
cosmic_GRCh37 = pd.read_csv("../raw_data/CosmicCompleteGeneExpression_GRCh37.tsv", sep='\t')

In [10]:
# Load the GTF annotation as a tab-separated table without a header row, so the
# columns are labelled with integer positions.
ens69_gtf = pd.read_csv("../raw_data/Homo_sapiens.GRCh37.69.gtf", sep='\t', header=None)

In [11]:
# Median Z_SCORE per gene: the result is indexed by GENE_NAME and has a single
# Z_SCORE column.
# Only the numeric column is selected. Listing the grouping key GENE_NAME in the
# selection as well hands a string column to median(), which older pandas
# silently dropped but current pandas rejects with a TypeError.
cosmic_GRCh37_median = cosmic_GRCh37.groupby('GENE_NAME')[['Z_SCORE']].median()

In [13]:
# Copy the gene names out of the index into a regular column so they are kept
# when the table is written without its index.
cosmic_GRCh37_median = cosmic_GRCh37_median.assign(GENE_NAME=cosmic_GRCh37_median.index)

In [15]:
# Write the per-gene medians as a TSV; the index is omitted because the gene
# name is already stored in the GENE_NAME column.
cosmic_GRCh37_median.to_csv(
    '../tables_output/cosmic_tcga_median_Z_SCORE.tsv',
    sep='\t',
    index=False,
)

In [42]:
# Per-gene quartiles of Z_SCORE. quantile() returns one row per (gene, quantile)
# and unstack() pivots the quantile level into columns, giving the two-level
# columns (Z_SCORE, 0.25), (Z_SCORE, 0.5), (Z_SCORE, 0.75).
# Only the numeric column is selected. Listing the grouping key GENE_NAME in the
# selection as well hands a string column to quantile(), which older pandas
# silently dropped but current pandas rejects with a TypeError.
cosmic_GRCh37_iqr = (
    cosmic_GRCh37
    .groupby('GENE_NAME')[['Z_SCORE']]
    .quantile([.25, .5, .75])
    .unstack()
)

In [44]:
# Append the gene names (currently the index) as a trailing column; it becomes
# the fourth column, after the three quantile columns.
cosmic_GRCh37_iqr = cosmic_GRCh37_iqr.assign(GENE_NAME=cosmic_GRCh37_iqr.index)

In [46]:
# Inspect the column labels: at this point they are a two-level MultiIndex
# (three Z_SCORE quantiles plus GENE_NAME), which is flattened further below.
cosmic_GRCh37_iqr.columns

MultiIndex(levels=[['Z_SCORE', 'GENE_NAME'], [0.25, 0.5, 0.75, '']],
           labels=[[0, 0, 0, 1], [0, 1, 2, 3]])

In [47]:
# Flat replacement labels for the quartile table, listed in the same order as
# its existing columns (0.25, 0.5, 0.75 quantiles, then the gene name).
cols = [
    'Q1',
    'Q2',
    'Q3',
    'gene_name',
]

In [50]:
# Swap the gene-name index for a default integer index (the names live on in
# their own column), then replace the two-level column labels with the flat
# names in `cols`. The rename is positional, so `cols` must follow column order.
cosmic_GRCh37_iqr = cosmic_GRCh37_iqr.reset_index(drop=True)
cosmic_GRCh37_iqr.columns = cols

In [51]:
# Preview the first two rows to check the flattened column names.
cosmic_GRCh37_iqr.head(2)

Unnamed: 0,Q1,Q2,Q3,gene_name
0,-0.781,-0.242,0.414,39340
1,-0.60125,-0.283,0.242,A1BG


In [55]:
# Interquartile range per gene, plus the fences lying 1.5 * IQR below Q1 and
# 1.5 * IQR above Q3.
cosmic_GRCh37_iqr['IQR'] = cosmic_GRCh37_iqr['Q3'].sub(cosmic_GRCh37_iqr['Q1'])
cosmic_GRCh37_iqr['lower_1.5IQR'] = cosmic_GRCh37_iqr['Q1'].sub(cosmic_GRCh37_iqr['IQR'].mul(1.5))
cosmic_GRCh37_iqr['upper_1.5IQR'] = cosmic_GRCh37_iqr['Q3'].add(cosmic_GRCh37_iqr['IQR'].mul(1.5))

In [56]:
# Write the quartile / IQR table as a TSV, omitting the integer index.
cosmic_GRCh37_iqr.to_csv(
    '../tables_output/cosmic_GRCh37_iqr.tsv',
    sep='\t',
    index=False,
)

---

In [16]:
# Descriptive statistics of Z_SCORE per gene (count, mean, std, min, quartiles,
# max), returned with two-level columns (Z_SCORE, <statistic>).
# Only Z_SCORE is selected so the result always has exactly these eight
# statistic columns. The positional rename further below depends on that, and
# the grouping key does not need to be in the selection.
cosmic_GRCh37_summary = cosmic_GRCh37.groupby('GENE_NAME')[['Z_SCORE']].describe()

In [19]:
# Append the gene names (currently the index) as a trailing `gene_name` column.
cosmic_GRCh37_summary = cosmic_GRCh37_summary.assign(gene_name=cosmic_GRCh37_summary.index)

In [22]:
# Replace the gene-name index with a default integer index; the names were
# already copied into the `gene_name` column, so the old index is dropped.
cosmic_GRCh37_summary = cosmic_GRCh37_summary.reset_index(drop = True)

In [35]:
# Display the column labels with the outer level removed. The result is not
# assigned, so the frame's columns are left unchanged by this cell.
cosmic_GRCh37_summary.columns.droplevel()

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', ''], dtype='object')

In [36]:
# Flat replacement labels for the summary table: the eight describe()
# statistics in their existing order, followed by the gene-name column.
cols = [
    'count', 'mean', 'std', 'min',
    '25%', '50%', '75%', 'max',
    'gene_name',
]

In [38]:
# Inspect the column labels: they are still a two-level MultiIndex at this point.
cosmic_GRCh37_summary.columns

MultiIndex(levels=[['Z_SCORE', 'gene_name'], ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', '']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 1, 2, 3, 4, 5, 6, 7, 8]])

In [39]:
# Replace the two-level column labels with the flat names in `cols`. The
# assignment is positional, so `cols` must list the names in column order.
cosmic_GRCh37_summary.columns = cols

In [40]:
# Preview the first two rows to check the flattened column names.
cosmic_GRCh37_summary.head(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,gene_name
0,9144.0,-0.101561,1.197185,-6.089,-0.781,-0.242,0.414,13.453,39340
1,9144.0,0.334597,25.62272,-1.57,-0.60125,-0.283,0.242,2444.419,A1BG


In [41]:
# Write the per-gene summary statistics as a TSV, omitting the integer index.
cosmic_GRCh37_summary.to_csv(
    '../tables_output/cosmic_tcga_GRCh37_summary_stats.tsv',
    sep='\t',
    index=False,
)

In [60]:
# Row/column count of the summary table (one row per gene), for comparison
# with the merged table below.
cosmic_GRCh37_summary.shape

(18236, 9)

In [57]:
# Combine the summary statistics with the quartile / IQR table, one row per gene.
# The join key is named explicitly: without `on`, pandas joins on every column
# name the two frames share, so an extra shared column would silently change
# the key and the set of matched rows.
cosmic_GRCh37_summary_full = pd.merge(
    cosmic_GRCh37_summary,
    cosmic_GRCh37_iqr,
    how='inner',
    on='gene_name',
)

In [59]:
# Shape of the merged table; the row count should equal that of the summary
# table if the inner join matched every gene exactly once.
cosmic_GRCh37_summary_full.shape

(18236, 15)

In [61]:
# Write the merged summary + IQR table as a TSV, omitting the integer index.
cosmic_GRCh37_summary_full.to_csv(
    '../tables_output/cosmic_tcga_GRCh37_summary_stats_full.tsv',
    sep='\t',
    index=False,
)

In [70]:
# Preview the first two rows of the merged table.
cosmic_GRCh37_summary_full.head(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,gene_name,Q1,Q2,Q3,IQR,lower_1.5IQR,upper_1.5IQR
0,9144.0,-0.101561,1.197185,-6.089,-0.781,-0.242,0.414,13.453,39340,-0.781,-0.242,0.414,1.195,-2.5735,2.2065
1,9144.0,0.334597,25.62272,-1.57,-0.60125,-0.283,0.242,2444.419,A1BG,-0.60125,-0.283,0.242,0.84325,-1.866125,1.506875


## Shift the median Z-scores so that they are non-negative

In [71]:
# Smallest per-gene median Z-score; subtracted below so the shifted medians
# start at zero.
# Series.min() is used instead of the builtin min(): it skips missing values,
# whereas the builtin's result depends on where a NaN sits in the column.
min_median_Z_SCORE = cosmic_GRCh37_summary_full['Q2'].min()
min_median_Z_SCORE

-1.1345

In [73]:
# Shift every median by the global minimum so the smallest becomes 0 and all
# others are positive.
cosmic_GRCh37_summary_full['Q2_Z_positive'] = cosmic_GRCh37_summary_full['Q2'].sub(min_median_Z_SCORE)

In [75]:
# Total of the shifted medians; used as the denominator when each gene's
# shifted median is converted into a proportion.
# Series.sum() is used instead of the builtin sum(): it skips missing values,
# whereas a single NaN makes the builtin total (and every proportion) NaN.
total_median_Z_SCORE = cosmic_GRCh37_summary_full['Q2_Z_positive'].sum()
total_median_Z_SCORE

16883.783499999998

In [76]:
# Express each gene's shifted median as a fraction of the total across genes.
cosmic_GRCh37_summary_full['proportion_Z_score'] = cosmic_GRCh37_summary_full['Q2_Z_positive'].div(total_median_Z_SCORE)

In [79]:
# Preview the first ten rows, including the new shifted-median and proportion columns.
cosmic_GRCh37_summary_full.head(10)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,gene_name,Q1,Q2,Q3,IQR,lower_1.5IQR,upper_1.5IQR,Q2_Z_positive,proportion_Z_score
0,9144.0,-0.101561,1.197185,-6.089,-0.781,-0.242,0.414,13.453,39340,-0.781,-0.242,0.414,1.195,-2.5735,2.2065,0.8925,5.3e-05
1,9144.0,0.334597,25.62272,-1.57,-0.60125,-0.283,0.242,2444.419,A1BG,-0.60125,-0.283,0.242,0.84325,-1.866125,1.506875,0.8515,5e-05
2,9144.0,0.201225,6.548947,-2.224,-0.261,-0.203,-0.096,332.616,A1CF,-0.261,-0.203,-0.096,0.165,-0.5085,0.1515,0.9315,5.5e-05
3,9144.0,-0.078867,1.191187,-2.88,-0.762,-0.305,0.34225,33.196,A2LD1,-0.762,-0.305,0.34225,1.10425,-2.418375,1.998625,0.8295,4.9e-05
4,9144.0,-0.065317,1.012096,-2.209,-0.619,-0.3155,0.207,16.411,A2M,-0.619,-0.3155,0.207,0.826,-1.858,1.446,0.819,4.9e-05
5,9144.0,0.187292,13.502958,-0.96,-0.337,-0.205,-0.085,1283.79,A2ML1,-0.337,-0.205,-0.085,0.252,-0.715,0.293,0.9295,5.5e-05
6,9144.0,-0.044697,1.109348,-2.423,-0.692,-0.327,0.276,15.839,A4GALT,-0.692,-0.327,0.276,0.968,-2.144,1.728,0.8075,4.8e-05
7,9144.0,0.104914,5.25241,-0.884,-0.564,-0.2525,0.152,481.559,A4GNT,-0.564,-0.2525,0.152,0.716,-1.638,1.226,0.882,5.2e-05
8,9144.0,0.088571,1.214318,-3.357,-0.69625,-0.1115,0.65225,16.306,AAAS,-0.69625,-0.1115,0.65225,1.3485,-2.719,2.675,1.023,6.1e-05
9,9144.0,0.012819,1.168298,-2.765,-0.67425,-0.228,0.41,25.542,AACS,-0.67425,-0.228,0.41,1.08425,-2.300625,2.036375,0.9065,5.4e-05


In [78]:
# Write the merged table, now including the shifted medians and their
# proportions, as a TSV without the integer index.
cosmic_GRCh37_summary_full.to_csv(
    '../tables_output/cosmic_tcga_GRCh37_summary_full_median_proportion.tsv',
    sep='\t',
    index=False,
)

In [80]:
# Sanity check: the proportions should add up to 1 (up to floating-point error).
# Series.sum() is used instead of the builtin sum() so the check is vectorised
# and is not turned into NaN by a single missing value.
cosmic_GRCh37_summary_full['proportion_Z_score'].sum()

1.0000000000000029