# Enrichment analysis
### Over-representation analysis
* Simplified illustrative example
* Followed by real case
### GSEA
* Practical using python package

# Supervised machine learning
### Linear regression
### Logistic regression
### SVM
### Overfitting and Bias vs Variance -> Cross validation
### Dimensionality of the input -> PCA

# Unsupervised machine learning
### K-means clustering
* Using TCGA data, HR+/-

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import gseapy as gp
import qvalue

from ipywidgets import interact, interact_manual
from ipywidgets import IntSlider, FloatSlider, Dropdown, Text

# Three variants of ``interact_manual`` that differ only in ``manual_name``
# (presumably the caption of the button that triggers the run -- see the
# ipywidgets documentation). One variant per task in this notebook.
interact_enrich=interact_manual.options(manual_name="Enrichment analysis")
interact_plot=interact_manual.options(manual_name="Plot")
interact_calc=interact_manual.options(manual_name="Calculate tests")

In [2]:
# Clinical table: tab-separated, indexed by its third column (index_col=2).
clinical_data = pd.read_csv('data/brca_clin.tsv.gz', sep ='\t', index_col=2)
# Drop the first four rows and the first column
# (presumably non-data header/metadata entries -- TODO confirm against the file).
clinical_data = clinical_data.iloc[4:,1:]
# Expression table: tab-separated, indexed by its second column (index_col=1).
expression_data = pd.read_csv('data/brca.tsv.gz', sep ='\t', index_col=1)
# Drop the first two remaining columns, then transpose so the former columns
# become rows (downstream code iterates over the columns as genes).
expression_data = expression_data.iloc[:,2:].T

In [3]:
def differential_test(clinical_df, expression_df, separator, cond1, cond2):
    """Run a two-sample t-test for every gene between two clinical groups.

    Parameters
    ----------
    clinical_df : pandas.DataFrame
        Clinical annotations, indexed by sample identifier.
    expression_df : pandas.DataFrame
        Expression values with samples as rows and genes as columns.
    separator : str
        Column of ``clinical_df`` used to split the samples into groups.
    cond1, cond2
        Values of ``separator`` that define the first and the second group.

    Returns
    -------
    pandas.DataFrame
        One row per gene (gene name as index) with a single float column
        ``'p'``. Genes whose p-value is NaN (for example because a group is
        empty after dropping missing values) are left out.

    Raises
    ------
    KeyError
        If ``separator`` is not a column of ``clinical_df``.
    """
    # Imported locally so the function does not depend on ``scipy.stats``
    # having been loaded as a side effect of some other import.
    from scipy.stats import ttest_ind

    try:
        labels = clinical_df[separator]
    except KeyError as err:
        raise KeyError('Clinical condition wrong: %r' % (separator,)) from err

    # The group membership does not depend on the gene, so compute it once.
    index1 = labels.index[(labels == cond1).values]
    index2 = labels.index[(labels == cond2).values]

    # Samples without expression data used to surface as NaN (and were then
    # dropped); current pandas raises KeyError instead, so filter them here.
    index1 = index1[index1.isin(expression_df.index)]
    index2 = index2[index2.isin(expression_df.index)]

    genes = []
    p_values = []
    for gene, expression in expression_df.items():
        expression1 = expression.loc[index1].dropna()
        expression2 = expression.loc[index2].dropna()
        p_val = ttest_ind(expression1, expression2).pvalue
        if pd.notna(p_val):
            genes.append(gene)
            p_values.append(p_val)

    # Build the frame in one go instead of growing it row by row.
    return pd.Series(p_values, index=genes, dtype=float, name='p').to_frame()

def plot_hist(stats, bins):
    """Show a histogram of ``stats``.

    ``stats`` is converted to a NumPy array and ``bins`` is forwarded
    unchanged as the ``bins`` argument of ``plt.hist``.
    """
    values = np.array(stats)
    plt.hist(values, bins=bins)
    plt.show()


def interact_multiple_gene_ttest(Criteria, Group_1, Group_2):
    """Widget callback: test all genes between two groups and show the result.

    Runs ``differential_test`` on the module-level ``clinical_data`` and
    ``expression_data``, passes the outcome through ``qvalue.qvalues``, and
    stores it in the global ``BRCA_tests`` so later cells can reuse it.
    It then plots a 20-bin histogram of the ``'p'`` column and displays the
    full table without row truncation.
    """
    global BRCA_tests
    BRCA_tests = differential_test(
        clinical_df=clinical_data,
        expression_df=expression_data,
        separator=Criteria,
        cond1=Group_1,
        cond2=Group_2,
    )
    BRCA_tests = qvalue.qvalues(BRCA_tests)

    n_bins = 20
    plot_hist(BRCA_tests['p'].values, n_bins)

    # Lift the row limit only for this display call.
    with pd.option_context('display.max_rows', None):
        display(BRCA_tests)
        
def ORA(tests, threshold, pathway_db=None, stat='p'):
    """Over-representation analysis of the genes that pass a threshold.

    Parameters
    ----------
    tests : pandas.DataFrame
        Per-gene test results, indexed by gene name; must contain the
        column named by ``stat``.
    threshold : float
        Genes whose ``stat`` value is strictly below this are tested for
        enrichment.
    pathway_db : list of str or str, optional
        Gene-set library name(s) handed to ``gp.enrichr`` as ``gene_sets``.
        Defaults to ``['KEGG_2019_Human']``.
    stat : str
        Column of ``tests`` that is compared against ``threshold``.

    Returns
    -------
    The output of ``qvalue.qvalues`` applied to the ``"P-value"`` (renamed
    to ``"p"``), ``"Overlap"`` and ``"Term"`` columns of the enrichr
    results.
    """
    # ``None`` sentinel instead of a mutable list as the default argument.
    if pathway_db is None:
        pathway_db = ['KEGG_2019_Human']

    # Every tested gene forms the background the selection is compared to.
    background = set(tests.index)
    gene_list = list(tests[tests[stat] < threshold].index)

    enr = gp.enrichr(
        gene_list=gene_list,
        gene_sets=pathway_db,
        background=background,
        outdir=None,
    )
    results = enr.results[["P-value", "Overlap", "Term"]].rename(columns={"P-value": "p"})
    return qvalue.qvalues(results)

# Library names returned by gseapy; offered as the ``pathway_db`` choices
# of the enrichment widget further down.
pathway_db_choice = gp.get_library_name()

        
def interact_ORA(threshold, pathway_db, stat):
    """Widget callback: run ``ORA`` on the global ``BRCA_tests`` and show it.

    ``threshold`` is converted with ``float`` before use (the widget below
    supplies it as text). The result table is displayed without row
    truncation.
    """
    cutoff = float(threshold)
    enrichment = ORA(BRCA_tests, cutoff, pathway_db, stat=stat)

    # Lift the row limit only for this display call.
    with pd.option_context('display.max_rows', None):
        display(enrichment)

In [4]:
# Widget 1: choose the clinical column and the two group values, then run the
# per-gene tests (this is what populates the global ``BRCA_tests``).
interact_calc(interact_multiple_gene_ttest, Criteria=Text('Surgical procedure first'), Group_1 = Text('Simple Mastectomy'), Group_2=Text('Lumpectomy'))
# Widget 2: enrichment analysis on ``BRCA_tests``; run widget 1 first.
interact_enrich(interact_ORA, threshold = '5e-2' , pathway_db = pathway_db_choice, stat=['p','q'])

interactive(children=(Text(value='Surgical procedure first', description='Criteria'), Text(value='Simple Maste…

interactive(children=(Text(value='5e-2', description='threshold'), Dropdown(description='pathway_db', options=…

<function __main__.interact_ORA(threshold, pathway_db, stat)>

In [5]:
def gsea(tests, pathway_db='KEGG_2019_Human', *, stat='p',
         permutation_num=100, processes=4, outdir='test'):
    """Run a pre-ranked GSEA via ``gp.prerank`` and return its result object.

    Parameters
    ----------
    tests : pandas.DataFrame
        Per-gene results indexed by gene name; the column ``stat`` is
        passed to ``gp.prerank`` as the ranking (``rnk``).
    pathway_db : str
        Gene-set library passed to ``gp.prerank`` as ``gene_sets``.
    stat : str, keyword-only
        Column of ``tests`` used for ranking. Defaults to ``'p'``.
        NOTE(review): a raw p-value carries no direction of change;
        confirm that this is the intended ranking metric.
    permutation_num : int, keyword-only
        Number of permutations; the default is kept low to speed up testing.
    processes : int, keyword-only
        Passed through as ``processes``.
    outdir : str or None, keyword-only
        Passed through as ``outdir``. The default is the directory name
        ``'test'``, so output is not suppressed by default.

    Returns
    -------
    The object returned by ``gp.prerank``.
    """
    pre_res = gp.prerank(
        rnk=tests[stat],
        gene_sets=pathway_db,
        processes=processes,
        permutation_num=permutation_num,
        outdir=outdir,
        no_plot=True,  # skip plotting
        format='png',
    )
    return pre_res

In [6]:
# GSEA on the latest differential-test results; ``BRCA_tests`` only exists
# after the "Calculate tests" widget has been run.
a = gsea(BRCA_tests)

In [9]:
# Show the first rows of the GSEA result table.
a.res2d.head()

Unnamed: 0_level_0,es,nes,pval,fdr,geneset_size,matched_size,genes,ledge_genes
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Steroid biosynthesis,0.542969,1.453417,0.03,0.35896,19,19,EBP;SQLE;CYP24A1;DHCR7;TM7SF2;NSDHL;DHCR24;MSM...,EBP;SQLE;CYP24A1;DHCR7;TM7SF2;NSDHL;DHCR24;MSM...
Pyrimidine metabolism,0.460029,1.362757,0.0,0.494567,57,56,DTYMK;UCK2;RRM2B;ENPP1;NT5M;DPYS;DHODH;NT5C3B;...,DTYMK;UCK2;RRM2B;ENPP1;NT5M;DPYS;DHODH;NT5C3B;...
Sphingolipid metabolism,0.475377,1.346579,0.01,0.536779,47,44,SMPD1;GAL3ST1;CERS5;GBA;SMPD3;UGCG;SPTLC3;CERS...,SMPD1;GAL3ST1;CERS5;GBA;SMPD3;UGCG;SPTLC3;CERS...
Endocrine and other factor-regulated calcium reabsorption,0.484059,1.372321,0.0,0.565861,48,48,GNAS;KLK1;VDR;CLTCL1;ATP1B4;BDKRB2;ADCY9;AP2A1...,GNAS;KLK1;VDR;CLTCL1;ATP1B4;BDKRB2;ADCY9;AP2A1...
Glycosylphosphatidylinositol (GPI)-anchor biosynthesis,0.491757,1.322065,0.06,0.639576,25,25,PIGO;PIGQ;GPLD1;PIGF;PIGL;PIGV;PIGH;PIGZ;PIGC;...,PIGO;PIGQ;GPLD1;PIGF;PIGL;PIGV;PIGH;PIGZ;PIGC;...


# References

Weinstein, John N., et al. 'The cancer genome atlas pan-cancer analysis project.' Nature genetics 45.10 (2013): 1113-1120.

Patrício, Miguel, et al. “Using Resistin, Glucose, Age and BMI to Predict the Presence of Breast Cancer.” BMC Cancer, vol. 18, no. 1, Jan. 2018, p. 29. BioMed Central, doi:10.1186/s12885-017-3877-1.