# 2.0 CCLE Gene Expression Signature

In [1]:
from clustergrammer2 import net

clustergrammer2 backend version 0.2.9


In [2]:
import clustergrammer_groupby as cby
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
# Load the gzip-compressed CCLE table; the first CSV column becomes the row index.
df = pd.read_csv('../data/CCLE/CCLE.txt.gz', compression='gzip', index_col=0)

In [53]:
# Load the full matrix, keep the 1000 rows ranked highest by variance, then
# z-score each row.
net.load_df(df)
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
# Export the filtered/normalized matrix, rounded to two decimals.
df_tmp = net.export_df().round(2)
# Row labels of the filtered matrix; used later as the enrichment gene list.
genes_top_var = df_tmp.index.tolist()

In [28]:
# Number of row labels kept by the top-variance filter.
len(genes_top_var)

1000

In [33]:
# Repeat the top-variance filter and row z-scoring on the full matrix.
net.load_df(df)
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df_tmp = net.export_df().round(2)
# NOTE(review): enrichrgram presumably annotates the rows with terms from the
# named Enrichr library -- confirm against the clustergrammer2 documentation.
net.enrichrgram('GO_Biological_Process_2018')
# Export the annotated matrix so the row labels can be inspected in later cells.
df_enr = net.export_df()
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "KRT19", "ini": 1000, "clust": 967, "rank": 241, "rankvar": 258…

In [75]:
# Inspect the first row label of the enrichrgram export.
df_enr.index.tolist()[0]

('KRT19',
 'extracellular matrix organization (GO:0030198): False<p> Pval 9.171026638093131e-29</p>',
 'collagen fibril organization (GO:0030199): False<p> Pval 1.9897240254393085e-12</p>',
 'regulation of cell migration (GO:0030334): False<p> Pval 6.6999547584764606e-15</p>',
 'positive regulation of intracellular signal transduction (GO:1902533): False<p> Pval 5.5852790328680665e-12</p>',
 'negative regulation of cellular process (GO:0048523): False<p> Pval 1.205623445289199e-13</p>',
 'negative regulation of cell proliferation (GO:0008285): False<p> Pval 2.3244136779160615e-10</p>',
 'neutrophil mediated immunity (GO:0002446): False<p> Pval 7.30336780471453e-10</p>',
 'cytokine-mediated signaling pathway (GO:0019221): False<p> Pval 1.2455747717025743e-13</p>',
 'neutrophil degranulation (GO:0043312): False<p> Pval 2.8513592127176324e-09</p>',
 'regulation of angiogenesis (GO:0045765): False<p> Pval 2.3387050175471637e-10</p>')

In [47]:
# Take the first row label and drop its leading element, leaving only the
# per-term strings.
enrichrgram_row = df_enr.index.tolist()[0][1:]

# For each term string keep the text before the first ': ' and the text
# between the first and second 'Pval' markers, joined by a single space.
enrichrgram_row = [
    ' '.join([term_str.split(': ')[0], term_str.split('Pval')[1]])
    for term_str in enrichrgram_row
]
enrichrgram_row

['extracellular matrix organization (GO:0030198)  9.171026638093131e-29</p>',
 'collagen fibril organization (GO:0030199)  1.9897240254393085e-12</p>',
 'regulation of cell migration (GO:0030334)  6.6999547584764606e-15</p>',
 'positive regulation of intracellular signal transduction (GO:1902533)  5.5852790328680665e-12</p>',
 'negative regulation of cellular process (GO:0048523)  1.205623445289199e-13</p>',
 'negative regulation of cell proliferation (GO:0008285)  2.3244136779160615e-10</p>',
 'neutrophil mediated immunity (GO:0002446)  7.30336780471453e-10</p>',
 'cytokine-mediated signaling pathway (GO:0019221)  1.2455747717025743e-13</p>',
 'neutrophil degranulation (GO:0043312)  2.8513592127176324e-09</p>',
 'regulation of angiogenesis (GO:0045765)  2.3387050175471637e-10</p>']

In [5]:
# Dimensions of the full matrix as (rows, columns).
df.shape

(18874, 1037)

In [6]:
from ast import literal_eval as make_tuple

# Parse each column label with literal_eval (presumably the CSV stores the
# string form of a Python tuple -- confirm against the data file). Labels that
# are no longer strings are left untouched, so the cell can be re-run after the
# columns have already been converted without literal_eval raising.
cols = df.columns.tolist()
new_cols = [make_tuple(x) if isinstance(x, str) else x for x in cols]
df.columns = new_cols

In [7]:
# Every row label of the full matrix; passed to the enrichment functions as the
# background gene list.
all_genes = df.index.tolist()
len(all_genes)

18874

# CCLE Gene Expression Data, Z-score Genes

In [14]:
# Snapshot of the current row labels of df.
rows = df.index.tolist()

In [19]:
# Look at the first row label.
rows[0]

'LOC100009676'

In [None]:
# Wrap every row label in a 1-tuple before loading the matrix. Labels that are
# already tuples are kept as they are, so running this cell a second time does
# not nest the tuples another level deep.
rows = df.index.tolist()
new_rows = [x if isinstance(x, tuple) else (x,) for x in rows]
df.index = new_rows

# Keep the 1000 rows ranked highest by variance and z-score each row.
net.load_df(df)
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df_tmp = net.export_df().round(2)

# Reload the rounded matrix and cluster it.
net.load_df(df_tmp)
net.cluster()
# NOTE(review): dendro_cats presumably adds a dendrogram-group category to each
# row label at the given level -- confirm against clustergrammer2.
net.dendro_cats(axis='row', dendro_level=5)
df_dendro = net.export_df()
net.widget()

In [None]:
# Row labels of the dendrogram export; the first one is displayed for inspection.
rows = df_dendro.index.tolist()
rows[0]

In [None]:
# Sorted first elements of the row labels whose second element is the
# 'Group 5: cat-5' category.
gene_list = sorted(row[0] for row in rows if row[1] == 'Group 5: cat-5')
len(gene_list)

In [8]:
# Gene-set libraries loaded from local GMT files, keyed by a short library name.
gmts = {
    'kea': net.load_gmt('../data/Enrichr_Libraries_of_Interest/KEA_2015.txt'),
    'go-process': net.load_gmt('../data/Enrichr_Libraries_of_Interest/GO_Biological_Process_2018.txt'),
}

In [9]:
# Size of the background gene list.
len(all_genes)

18874

In [11]:
# len(set(all_genes).intersection(gene_list))

In [12]:
from scipy.stats import binom_test
def _binomial_pvalue(num_hits, num_trials, p_expect):
    """Return the two-sided binomial test p-value using whichever SciPy API exists."""
    try:
        # binomtest is the replacement for binom_test, which newer SciPy
        # releases no longer provide.
        from scipy.stats import binomtest
    except ImportError:
        from scipy.stats import binom_test
        return binom_test(num_hits, num_trials, p_expect)
    return binomtest(num_hits, num_trials, p_expect).pvalue


def enrich_gene_list_using_lib(lib_json, gene_list, background_list, pval_cutoff=0.05):
    """Test a gene list for over-representation of each term in a gene-set library.

    For every term, the expected hit rate is the fraction of ``background_list``
    that belongs to the term. When the observed hit rate in ``gene_list`` is
    higher than expected, a two-sided binomial test gives the p-value; otherwise
    the term is assigned a p-value of 0.5 without testing.

    Parameters
    ----------
    lib_json : dict
        Maps a term name to an iterable of the genes in that term.
    gene_list : list
        Genes to test. Also used as the row index of the returned matrix.
    background_list : list
        Background genes used to compute the expected hit rate of each term.
    pval_cutoff : float, optional
        Only terms with a p-value strictly below this value are kept.

    Returns
    -------
    ser_pval : pandas.Series
        P-values of the kept terms sorted ascending, indexed by
        ``(term, 'Pval: <p-value>')`` tuples.
    df_term : pandas.DataFrame
        Gene-by-term matrix holding 1 where the gene belongs to the term and 0
        elsewhere, with one column per kept term (same tuple labels). Genes
        that match none of the kept terms are dropped.

    Both results are empty when ``gene_list`` is empty or no term passes the
    cutoff.

    Raises
    ------
    ValueError
        If ``background_list`` is empty.
    """
    num_background = len(background_list)
    if num_background == 0:
        raise ValueError('background_list must contain at least one gene')

    len_gene_list = len(gene_list)
    if len_gene_list == 0:
        # nothing to test; avoids dividing by zero below
        return pd.Series(dtype=float), pd.DataFrame()

    # build the lookup sets once rather than once per term
    background_set = set(background_list)
    gene_set = set(gene_list)

    list_terms = []
    list_pval = []
    # list of series that will be used to make dataframe
    list_term_ser = []
    for inst_term in lib_json:

        term_genes = lib_json[inst_term]

        # expected hit rate comes from the background passed by the caller
        p_expect = len(background_set.intersection(term_genes)) / num_background

        found_genes = list(gene_set.intersection(term_genes))
        actual_k = len(found_genes)

        if actual_k / len_gene_list > p_expect:
            inst_pval = _binomial_pvalue(actual_k, len_gene_list, p_expect)
        else:
            # not over-represented: skip the test and use a fixed p-value
            inst_pval = 0.5

        if inst_pval < pval_cutoff:

            term_name = (inst_term, 'Pval: ' + str(inst_pval))

            # membership vector over gene_list; found genes are set to one
            term_ser = pd.Series(data=np.zeros(len_gene_list), index=gene_list)
            term_ser.loc[found_genes] = 1
            term_ser.name = term_name

            list_terms.append(term_name)
            list_pval.append(inst_pval)
            list_term_ser.append(term_ser)

    if not list_term_ser:
        # pd.concat raises on an empty list, so return empty results instead
        return pd.Series(dtype=float), pd.DataFrame()

    ser_pval = pd.Series(data=list_pval, index=list_terms).sort_values()
    df_term = pd.concat(list_term_ser, axis=1)

    # remove genes (rows) with no matches
    gene_sum = df_term.sum(axis=1)
    keep_genes = gene_sum[gene_sum > 0].index.tolist()
    df_term = df_term.loc[keep_genes]

    return ser_pval, df_term

In [77]:
def enrich_dataframe_using_lib(lib_json, df_ini, background_list, num_top_terms, pval_cutoff=0.05):
    """Enrich the row labels of a dataframe and return the top terms' membership matrix.

    Parameters
    ----------
    lib_json : dict
        Maps a term name to an iterable of the genes in that term.
    df_ini : pandas.DataFrame
        Dataframe whose index supplies the gene list to test.
    background_list : list
        Background genes forwarded to ``enrich_gene_list_using_lib``.
    num_top_terms : int
        Maximum number of terms to keep.
    pval_cutoff : float, optional
        P-value cutoff forwarded to ``enrich_gene_list_using_lib``.

    Returns
    -------
    pandas.DataFrame
        The gene-by-term matrix from ``enrich_gene_list_using_lib`` restricted
        to the first ``num_top_terms`` entries of its p-value series.
    """
    gene_list = df_ini.index.tolist()

    ser_pval, df_term = enrich_gene_list_using_lib(lib_json, gene_list, background_list,
                                                   pval_cutoff=pval_cutoff)

    # the p-value series arrives sorted ascending, so the leading entries are
    # the terms with the smallest p-values
    keep_terms = ser_pval.index.tolist()[:num_top_terms]

    return df_term[keep_terms]

In [None]:
# Enrich the row labels of df_tmp against the GO library, passing all_genes as
# the background and asking for at most 10 terms.
df_term = enrich_dataframe_using_lib(gmts['go-process'], df_tmp, all_genes, 10)

In [76]:
# Show the first enrichrgram row label again for comparison with the local result.
df_enr.index.tolist()[0]

('KRT19',
 'extracellular matrix organization (GO:0030198): False<p> Pval 9.171026638093131e-29</p>',
 'collagen fibril organization (GO:0030199): False<p> Pval 1.9897240254393085e-12</p>',
 'regulation of cell migration (GO:0030334): False<p> Pval 6.6999547584764606e-15</p>',
 'positive regulation of intracellular signal transduction (GO:1902533): False<p> Pval 5.5852790328680665e-12</p>',
 'negative regulation of cellular process (GO:0048523): False<p> Pval 1.205623445289199e-13</p>',
 'negative regulation of cell proliferation (GO:0008285): False<p> Pval 2.3244136779160615e-10</p>',
 'neutrophil mediated immunity (GO:0002446): False<p> Pval 7.30336780471453e-10</p>',
 'cytokine-mediated signaling pathway (GO:0019221): False<p> Pval 1.2455747717025743e-13</p>',
 'neutrophil degranulation (GO:0043312): False<p> Pval 2.8513592127176324e-09</p>',
 'regulation of angiogenesis (GO:0045765): False<p> Pval 2.3387050175471637e-10</p>')

In [64]:
# Dimensions of the term matrix as (genes, terms).
df_term.shape

(700, 10)

In [17]:
# rows may hold plain string labels (taken from df.index) or label tuples
# (taken from df_dendro.index). Take the first tuple element only for tuples:
# indexing a plain string with [0] would keep just its first character.
genes_1k = [x[0] if isinstance(x, tuple) else x for x in rows]

# KEA

In [None]:
# Enrich genes_1k against the KEA library with the default p-value cutoff.
ser_pval, df_term = enrich_gene_list_using_lib(gmts['kea'], genes_1k, all_genes)

In [None]:
# First ten index entries of ser_pval (it is returned sorted by ascending p-value).
keep_top_enr = ser_pval.index.tolist()[:10]

In [None]:
# Load the term matrix restricted to the selected columns and display it.
net.load_df(df_term[keep_top_enr])
net.widget()

# GO Biological Process

In [22]:
# Peek at the first ten entries of rows.
rows[:10]

['LOC100009676',
 'AKT3',
 'MED6',
 'NR2E3',
 'NAALAD2',
 'CDKN2B-AS1',
 'LOC100049716',
 'NAALADL1',
 'ACOT8',
 'ABI1']

In [25]:
# Size of the background gene list used for the enrichment below.
len(all_genes)

18874

In [29]:
# Size of the top-variance gene list used for the enrichment below.
len(genes_top_var)

1000

In [31]:
# Enrich the top-variance genes against the GO library with a p-value cutoff
# of 0.01, then show the size of the resulting gene-by-term matrix.
ser_pval, df_term = enrich_gene_list_using_lib(gmts['go-process'], genes_top_var, all_genes, 0.01)
df_term.shape

(672, 483)

#### Terms from Enrichrgram

In [48]:
# Term strings extracted from the enrichrgram row label.
enrichrgram_row

['extracellular matrix organization (GO:0030198)  9.171026638093131e-29</p>',
 'collagen fibril organization (GO:0030199)  1.9897240254393085e-12</p>',
 'regulation of cell migration (GO:0030334)  6.6999547584764606e-15</p>',
 'positive regulation of intracellular signal transduction (GO:1902533)  5.5852790328680665e-12</p>',
 'negative regulation of cellular process (GO:0048523)  1.205623445289199e-13</p>',
 'negative regulation of cell proliferation (GO:0008285)  2.3244136779160615e-10</p>',
 'neutrophil mediated immunity (GO:0002446)  7.30336780471453e-10</p>',
 'cytokine-mediated signaling pathway (GO:0019221)  1.2455747717025743e-13</p>',
 'neutrophil degranulation (GO:0043312)  2.8513592127176324e-09</p>',
 'regulation of angiogenesis (GO:0045765)  2.3387050175471637e-10</p>']

#### Locally Enriched Terms

In [50]:
# First ten index entries of ser_pval (sorted by ascending p-value). These
# tuples are also the column labels of df_term, so keep_top_enr must keep
# holding them for df_term[keep_top_enr] to work; the joined display strings
# are stored under a separate name instead of overwriting it.
keep_top_enr = ser_pval.index.tolist()[:10]
keep_top_enr_labels = [x[0] + ' ' + x[1] for x in keep_top_enr]
keep_top_enr_labels

['extracellular matrix organization (GO:0030198) Pval: 4.88614872976e-26',
 'cytokine-mediated signaling pathway (GO:0019221) Pval: 3.32810552013e-15',
 'regulation of cell migration (GO:0030334) Pval: 1.5209499587e-14',
 'regulation of cell proliferation (GO:0042127) Pval: 7.23143833268e-14',
 'cellular response to cytokine stimulus (GO:0071345) Pval: 1.19559734474e-13',
 'negative regulation of cellular process (GO:0048523) Pval: 3.10549303206e-13',
 'positive regulation of cell proliferation (GO:0008284) Pval: 3.78251598995e-11',
 'collagen fibril organization (GO:0030199) Pval: 3.95949893332e-11',
 'positive regulation of intracellular signal transduction (GO:1902533) Pval: 6.85898688745e-11',
 'skin development (GO:0043588) Pval: 1.24056102141e-10']

In [None]:
# Load the term matrix restricted to the selected columns and display it.
net.load_df(df_term[keep_top_enr])
net.widget()

In [None]:
# Number of genes in gene_list.
len(gene_list)

In [None]:
# NOTE(review): generate_signatures presumably derives one signature per value
# of the 'tissue' category, keeping 100 top dimensions each -- confirm against
# the clustergrammer_groupby documentation.
df_sig, keep_genes_dict, df_gene_pval, fold_info = cby.generate_signatures(
    df, 'tissue', num_top_dims=100
)

# Load the signature matrix and display it.
net.load_df(df_sig)
net.widget()

### Add tissue category to genes

In [None]:
# For every row of df_sig, find the column label holding the row maximum and
# keep the first element of that label.
gene_sig = df_sig.idxmax(axis=1)
gs_dict = {inst_gene: gene_sig[inst_gene][0] for inst_gene in gene_sig.index.tolist()}

# Relabel each row of df as (label, category): rows found in gs_dict get a
# 'Cell Type: ' category, every other row gets 'N.A.'.
rows = df.index.tolist()
new_rows = []
for row_label in rows:
    if row_label in gs_dict:
        new_rows.append((row_label, 'Cell Type: ' + gs_dict[row_label]))
    else:
        new_rows.append((row_label, 'N.A.'))
df.index = new_rows
net.load_df(df)

In [None]:
# Colour mapping stored on net for column category 'cat-0'; read by
# set_cat_colors below.
ct_color = net.viz['cat_colors']['col']['cat-0']

In [None]:
def set_cat_colors(axis, cat_index, cat_title=False):
    """Copy every colour in the module-level ``ct_color`` mapping onto one category of ``net``.

    Parameters
    ----------
    axis : passed through to ``net.set_cat_color`` as ``axis``.
    cat_index : passed through to ``net.set_cat_color`` as ``cat_index``.
    cat_title : optional
        When it compares unequal to ``False``, each category name is prefixed
        with ``cat_title + ': '``; otherwise the ``ct_color`` key is used as is.
    """
    for inst_ct, inst_color in ct_color.items():
        # explicit comparison with False on purpose: an empty-string title
        # still counts as a title and gets the ': ' prefix
        cat_name = cat_title + ': ' + inst_ct if cat_title != False else inst_ct
        net.set_cat_color(axis=axis, cat_index=cat_index, cat_name=cat_name, inst_color=inst_color)

In [None]:
# Apply the ct_color colours to row category index 1; cat_title is left at its
# default, so the ct_color keys are used as category names without a prefix.
set_cat_colors('row', 1)

### CCLE Data with Gene-Tissue Category

In [None]:
# Load the full matrix with its current row labels and display it.
net.load_df(df)
net.widget()