## Genes upregulated in colon cancer

In [11]:
import os

import pandas
import numpy
from scipy.stats import ttest_1samp

In [2]:
# Load the expression matrix. The first column becomes the row index; later
# cells look rows up by sample_id and treat the columns as genes.
path = os.path.join('data', 'complete', 'expression-matrix.tsv.bz2')
expr_df = pandas.read_csv(path, sep='\t', index_col=0)

In [3]:
path = os.path.join('data', 'complete', 'samples.tsv')
# Keep only the samples that have a row in the expression matrix
sample_df = pandas.read_table(path).loc[
    lambda df: df['sample_id'].isin(expr_df.index)
]
# One row per distinct (patient, cancer-type acronym) combination
patient_df = sample_df.loc[:, ['patient_id', 'acronym']].drop_duplicates()
sample_df.head(2)

Unnamed: 0,sample_id,patient_id,sample_type,disease,acronym,organ_of_origin,gender,age_diagnosed,dead,days_survived,recurred,days_recurrence_free
35,TCGA-02-0047-01,TCGA-02-0047,Primary Tumor,glioblastoma multiforme,GBM,Brain,Male,78.0,1.0,448.0,,
40,TCGA-02-0055-01,TCGA-02-0055,Primary Tumor,glioblastoma multiforme,GBM,Brain,Female,62.0,1.0,76.0,,


In [4]:
# Reshape to one row per patient and one column per sample type; each cell
# holds the sample_id of that patient's sample of that type.
# Keyword arguments are required here: pandas >= 2.0 rejects positional
# index/columns arguments to DataFrame.pivot.
type_df = sample_df.pivot(index='patient_id', columns='sample_type', values='sample_id')
type_df = type_df[['Primary Tumor', 'Solid Tissue Normal']]
# Filter for paired samples: keep patients that have both a tumor and a normal sample
type_df = type_df.dropna()
# Attach the cancer-type acronym of each patient
type_df = type_df.reset_index().merge(patient_df, on='patient_id')
type_df.head(2)

sample_type,patient_id,Primary Tumor,Solid Tissue Normal,acronym
0,TCGA-22-4593,TCGA-22-4593-01,TCGA-22-4593-11,LUSC
1,TCGA-22-4609,TCGA-22-4609-01,TCGA-22-4609-11,LUSC


In [8]:
def get_diffex(subtype_df, expression_df=None):
    """
    For each gene, compute differential expression between paired tumor and normal tissue.

    Parameters
    ----------
    subtype_df : pandas.DataFrame
        One row per patient, with 'Primary Tumor' and 'Solid Tissue Normal'
        columns holding the sample IDs of that patient's paired samples.
    expression_df : pandas.DataFrame, optional
        Expression matrix with sample IDs as the index and genes as columns.
        When omitted, the notebook-level `expr_df` is used.

    Returns
    -------
    pandas.DataFrame
        One row per gene, indexed by the gene column labels, with columns
        entrez_gene_id, patients (number of pairs), mean_diff (mean of
        tumor - normal), t_stat and mlog10_p_value (-log10 of the p-value
        of a one-sample t-test of the paired differences against zero).
    """
    if expression_df is None:
        expression_df = expr_df

    tumor_df = expression_df.loc[list(subtype_df['Primary Tumor']), :]
    normal_df = expression_df.loc[list(subtype_df['Solid Tissue Normal']), :]
    # Re-label both frames by patient row so that the subtraction below pairs
    # each tumor sample with the normal sample from the same patient.
    tumor_df.index = subtype_df.index
    normal_df.index = subtype_df.index

    diffex_df = tumor_df - normal_df

    # Per-gene test that the mean paired difference is zero
    ttest = ttest_1samp(diffex_df.values, 0)

    # DataFrame.from_items was removed in pandas 1.0; a dict keeps the same
    # column order, and the row index is taken from the mean_diff Series.
    return pandas.DataFrame({
        'entrez_gene_id': diffex_df.columns,
        'patients': len(diffex_df),
        'mean_diff': diffex_df.mean(),
        't_stat': ttest.statistic,
        'mlog10_p_value': -numpy.log10(ttest.pvalue),
    })

# Run the paired tumor-vs-normal comparison separately for each cancer type,
# then move the acronym group key out of the index into a regular column.
diffex_df = (
    type_df
    .groupby('acronym')
    .apply(get_diffex)
    .reset_index(level='acronym')
)

In [9]:
# Preview the first rows of the per-cancer-type differential expression table
diffex_df.head()

Unnamed: 0,acronym,entrez_gene_id,patients,mean_diff,t_stat,mlog10_p_value
1,BLCA,1,19,0.362105,1.062114,0.519667
2,BLCA,2,19,-2.764737,-10.230441,8.201865
9,BLCA,9,19,0.329474,1.196608,0.607333
10,BLCA,10,19,0.416158,1.472064,0.800594
12,BLCA,12,19,-3.548421,-4.912332,3.949696


In [15]:
# Write the results as a bz2-compressed TSV. The row index is omitted and
# floats are written with four significant digits.
path = os.path.join('data', 'complete', 'differential-expression.tsv.bz2')
diffex_df.to_csv(
    path,
    sep='\t',
    index=False,
    compression='bz2',
    float_format='%.4g',
)