## Compute cancer-type-specific differential expression scores for each gene

Within each cancer type, compares each patient's primary tumor to solid normal tissue from the same patient, using a paired t-test per gene.

In [1]:
import os

import pandas
import numpy
from scipy.stats import ttest_1samp

In [2]:
# Load the expression matrix; the first column of the file becomes the index.
path = os.path.join('data', 'complete', 'expression-matrix.tsv.bz2')
expr_df = pandas.read_csv(path, sep='\t', index_col=0)

In [3]:
path = os.path.join('data', 'complete', 'samples.tsv')
sample_df = pandas.read_table(path)
# Filter for samples with expression
sample_df = sample_df[sample_df['sample_id'].isin(expr_df.index)]
# Unique (patient, cancer type acronym) pairs, merged back onto the pairs later
patient_df = sample_df[['patient_id', 'acronym']].drop_duplicates()
sample_df.head(2)

Unnamed: 0,sample_id,patient_id,sample_type,disease,acronym,organ_of_origin,gender,age_diagnosed,dead,days_survived,recurred,days_recurrence_free
35,TCGA-02-0047-01,TCGA-02-0047,Primary Tumor,glioblastoma multiforme,GBM,Brain,Male,78.0,1.0,448.0,,
40,TCGA-02-0055-01,TCGA-02-0055,Primary Tumor,glioblastoma multiforme,GBM,Brain,Female,62.0,1.0,76.0,,


In [4]:
# One row per patient and one column per sample type; cells hold sample IDs.
# Keyword arguments are used because pandas 2.0 made the arguments of
# DataFrame.pivot keyword-only.
type_df = sample_df.pivot(index='patient_id', columns='sample_type', values='sample_id')
type_df = type_df[['Primary Tumor', 'Solid Tissue Normal']]
# Filter for paired samples: keep patients that have both sample types
type_df = type_df.dropna()
# Attach each patient's cancer type acronym
type_df = type_df.reset_index().merge(patient_df, on='patient_id')
type_df.head(2)

sample_type,patient_id,Primary Tumor,Solid Tissue Normal,acronym
0,TCGA-22-4593,TCGA-22-4593-01,TCGA-22-4593-11,LUSC
1,TCGA-22-4609,TCGA-22-4609-01,TCGA-22-4609-11,LUSC


In [5]:
def get_diffex(subtype_df, expression_df=None):
    """
    For each gene, compute differential expression between paired tumor and
    normal tissue.

    Parameters
    ----------
    subtype_df : pandas.DataFrame
        One row per patient. The 'Primary Tumor' and 'Solid Tissue Normal'
        columns hold the sample identifiers of that patient's pair.
    expression_df : pandas.DataFrame, optional
        Expression matrix indexed by sample identifier with one column per
        gene; column labels must be convertible to int. Defaults to the
        module-level ``expr_df``.

    Returns
    -------
    pandas.DataFrame
        One row per gene with the columns 'entrez_gene_id', 'patients',
        'tumor_mean', 'normal_mean', 'mean_diff', 't_stat' and
        'mlog10_p_value' (negative log10 of the t-test p-value).
    """
    if expression_df is None:
        expression_df = expr_df

    tumor_df = expression_df.loc[list(subtype_df['Primary Tumor']), :]
    normal_df = expression_df.loc[list(subtype_df['Solid Tissue Normal']), :]
    # Relabel both frames with the patient rows so that the subtraction
    # below aligns each tumor sample with the normal sample of the same row.
    for frame in tumor_df, normal_df:
        frame.index = subtype_df.index

    paired_diff = tumor_df - normal_df
    # A one-sample t-test of the per-patient differences against zero is a
    # paired t-test of tumor versus normal.
    ttest = ttest_1samp(paired_diff.values, popmean=0, axis=0)

    items = [
        ('entrez_gene_id', paired_diff.columns.astype(int)),
        ('patients', len(paired_diff)),
        ('tumor_mean', tumor_df.mean()),
        ('normal_mean', normal_df.mean()),
        ('mean_diff', paired_diff.mean()),
        ('t_stat', ttest.statistic),
        ('mlog10_p_value', -numpy.log10(ttest.pvalue)),
    ]
    # DataFrame.from_items was removed in pandas 1.0. The regular constructor
    # with an explicit ``columns`` list preserves the intended column order.
    return pandas.DataFrame(dict(items), columns=[name for name, _ in items])

# Run the paired comparison separately for each cancer type, then move the
# 'acronym' group key out of the index and into a regular column.
diffex_df = (
    type_df
    .groupby('acronym')
    .apply(get_diffex)
    .reset_index('acronym')
)

In [6]:
# Add gene symbols
path = os.path.join('data', 'genes.tsv')
gene_df = pandas.read_table(path).loc[:, ['entrez_gene_id', 'symbol']]
# A right join keeps every differential expression row, including rows
# whose gene has no match in the gene table.
diffex_df = pandas.merge(gene_df, diffex_df, how='right')

In [7]:
# Preview the first rows of the annotated result
diffex_df.head(n=5)

Unnamed: 0,entrez_gene_id,symbol,acronym,patients,tumor_mean,normal_mean,mean_diff,t_stat,mlog10_p_value
0,1.0,A1BG,BLCA,19,5.327895,4.965789,0.362105,1.062114,0.519667
1,1.0,A1BG,BRCA,114,6.743158,5.939912,0.803246,6.667842,9.001291
2,1.0,A1BG,CESC,3,5.583333,7.383333,-1.8,-2.539918,0.898587
3,1.0,A1BG,CHOL,9,7.305556,16.444444,-9.138889,-10.160465,5.122856
4,1.0,A1BG,COAD,26,4.52,5.511538,-0.991538,-4.701665,4.092945


In [8]:
# Write the result as a bz2-compressed TSV without the index, formatting
# floats to four significant digits.
path = os.path.join('data', 'complete', 'differential-expression.tsv.bz2')
diffex_df.to_csv(
    path,
    sep='\t',
    index=False,
    compression='bz2',
    float_format='%.4g',
)