# Analyze generic genes and pathways

This notebook uses the statistics obtained from the [previous notebook](3_statistical_analyses.ipynb) to:
1. Determine if our simulation approach can identify a set of generic genes and pathways
2. Compare our set of generic genes and pathways with what has been previously reported

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import random
import warnings
import rpy2.robjects

def fxn():
    """Emit a single ``DeprecationWarning`` carrying the message "deprecated"."""
    warnings.warn(message="deprecated", category=DeprecationWarning)


# The "ignore" filter is only in effect inside this ``with`` block: it silences
# the warning raised by ``fxn()`` and the previous filters are restored on
# exit, so warnings raised by later cells are not affected by it.
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    fxn()

# Add the parent directory to the import path so the project-local
# `functions` package can be imported; this must run before the import below.
# The path is relative, so it resolves against the current working directory.
sys.path.append("../")
from functions import utils

# Seed NumPy's global random number generator so numpy.random draws are
# repeatable. The stdlib `random` module imported above is not seeded here.
from numpy.random import seed
randomState = 123
seed(randomState)

In [2]:
# Read in config variables.
# The repository root is taken to be the parent of the current working
# directory; the config file sits in its "Rank_pathways" folder.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

config_file = os.path.abspath(
    os.path.join(base_dir, "Rank_pathways", "init_config.tsv")
)
params = utils.read_config(config_file)

In [3]:
# Load params: pull the "local_dir" entry out of the loaded config. The gene
# and pathway summary tables read later in this notebook live under it.
local_dir = params["local_dir"]

In [4]:
# Input files: per-gene and per-pathway summary tables stored under local_dir
gene_summary_file = os.path.join(local_dir, "gene_summary_table.tsv")
pathway_summary_file = os.path.join(local_dir, "pathway_summary_table.tsv")

## Generic genes

In [8]:
%%R
# Attach biomaRt in the R session; suppressWarnings() hides warnings raised while loading it
suppressWarnings(library("biomaRt"))

In [9]:
%%R -i gene_summary_file -o gene_id_mapping
# Build a mapping from the Ensembl gene ids in our gene summary table to HGNC
# gene symbols, the kind of identifier found in DE_Prior's `Gene_Name` column

# Load the project's R helpers (presumably where get_ensembl_symbol_mapping()
# is defined -- confirm against GSEA_analysis.R)
source('../functions/GSEA_analysis.R')

# The result is handed back to Python as `gene_id_mapping` via the cell's -o flag
gene_id_mapping <- get_ensembl_symbol_mapping(gene_summary_file)


  res = PandasDataFrame.from_items(items)


In [10]:
# Set ensembl id as index.
# The move is guarded and rebinds the name (instead of `inplace=True`) so that
# re-running this cell on its own is a no-op rather than a KeyError once
# "ensembl_gene_id" has already been moved out of the columns into the index.
if "ensembl_gene_id" in gene_id_mapping.columns:
    gene_id_mapping = gene_id_mapping.set_index("ensembl_gene_id")
print(gene_id_mapping.shape)
gene_id_mapping.head()

(57210, 1)


Unnamed: 0_level_0,hgnc_symbol
ensembl_gene_id,Unnamed: 1_level_1
ENSG00000002330,BAD
ENSG00000003137,CYP26B1
ENSG00000003249,DBNDD1
ENSG00000004799,PDK4
ENSG00000006062,MAP3K14


In [12]:
# Replace ensembl ids with gene symbols, using the ensembl-indexed mapping
# built above.
# NOTE(review): the table re-read from this same path in the next cell is
# indexed by gene symbol, so this helper appears to rewrite
# `gene_summary_file` in place -- confirm, since re-running this cell would
# then operate on an already-converted file.
utils.replace_ensembl_ids(gene_summary_file,
                          gene_id_mapping)

In [13]:
# Read the tab-separated gene summary table; the first row is the header and
# the first column becomes the index
gene_stats = pd.read_csv(gene_summary_file, sep="\t", header=0, index_col=0)

gene_stats.head()

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),Test statistic (Real),Median adj p-value (simulated),Median rank (simulated),Mean test statistic (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
S100A9,ENSG00000163220.10,0.0,1.0,13379.93567,0.04697,24.0,484.18776,1022.8298,25,12.60791
S100A7,ENSG00000143556.8,0.0,2.0,6073.87558,0.12384,137.0,30.90258,44.86071,25,134.70525
S100A8,ENSG00000143546.9,0.0,3.0,5372.06645,0.09404,34.0,252.8497,614.24567,25,8.33415
PI3,ENSG00000124102.4,0.00131,4.0,2808.47514,0.07173,159.0,29.4459,27.65667,25,100.48316
KRT6A,ENSG00000205420.10,0.00023,5.0,2673.36594,0.04417,170.0,38.35617,46.3869,25,56.80504


In [14]:
# Define the set of generic genes: the 10 rows with the lowest "Z score"
genes_by_z_score = gene_stats.sort_values(by="Z score", ascending=True)
generic_genes_data = genes_by_z_score.head(10)

generic_genes_data.head()

Unnamed: 0,Gene ID,Adj P-value (Real),Rank (Real),Test statistic (Real),Median adj p-value (simulated),Median rank (simulated),Mean test statistic (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
MT4,ENSG00000102891.3,0.02441,57657.0,-26.42172,0.00711,44018.0,-0.11481,0.11445,25,-229.86077
C5orf46,ENSG00000178776.4,0.0,57758.0,-39.2722,0.00581,43319.5,-0.12854,0.17051,25,-229.56707
IL37,ENSG00000125571.9,0.0,57879.0,-66.0856,0.00282,49098.0,-0.27935,0.34162,25,-192.6315
ELOVL3,ENSG00000119915.4,0.01037,57724.0,-33.87363,0.06511,47300.0,-0.15337,0.18205,25,-185.2256
LCE5A,ENSG00000186207.4,0.0,57933.0,-106.54381,0.00107,52257.0,-0.4833,0.57883,25,-183.23107


In [15]:
# Get list of generic genes (the index labels of the selected rows)
generic_genes = generic_genes_data.index.tolist()

In [30]:
# Get generic genes identified by Crow et al.
# https://www.pnas.org/content/pnas/116/13/6491.full.pdf
DE_prior_file = "https://raw.githubusercontent.com/maggiecrow/DEprior/master/DE_Prior.txt"

# Tab-separated table with a header row, fetched from the URL at run time
DE_prior = pd.read_csv(DE_prior_file, sep="\t", header=0)

DE_prior.head()

Unnamed: 0,Gene_Order,Gene_EntrezID,N_HitLists,DE_Prior_Rank,Gene_Name
0,1,7503,79,1.0,XIST
1,2,8653,64,0.999948,DDX3Y
2,3,9086,62,0.99987,EIF1AY
3,4,8284,52,0.99987,KDM5D
4,5,8287,46,0.999791,USP9Y


In [38]:
# Get list of published generic genes (gene symbols from the DE_Prior table).
# NOTE(review): every Gene_Name is kept, regardless of DE_Prior_Rank. If the
# table ranks all genes rather than listing only generic ones, the overlap
# computed from this list mostly reflects symbol coverage -- confirm whether a
# rank cutoff is intended.
published_generic_genes = DE_prior["Gene_Name"].tolist()

In [45]:
# What fraction of our generic genes was also previously reported?
shared_genes = set(published_generic_genes).intersection(generic_genes)
print(shared_genes)
len(shared_genes) / len(generic_genes)

{'IL37', 'CHP2', 'ELOVL3', 'SERPINA12', 'C5orf46', 'MT4', 'BTC'}


0.7

## Generic pathways

In [46]:
# Read the tab-separated pathway summary table; the first row is the header
# and the first column becomes the index
pathway_stats = pd.read_csv(pathway_summary_file, sep="\t", header=0, index_col=0)

pathway_stats.head()

Unnamed: 0_level_0,Pathway,P-value (Real),Rank (Real),Test statistic (Real),Median p-value (simulated),Median rank (simulated),Mean test statistic (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HALLMARK_ADIPOGENESIS,HALLMARK_ADIPOGENESIS,,,,0.001,8.0,0.62604,0.09741,5,
HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_ALLOGRAFT_REJECTION,,,,0.0045,8.0,0.6415,0.07065,4,
HALLMARK_ANDROGEN_RESPONSE,HALLMARK_ANDROGEN_RESPONSE,,,,0.01399,24.0,0.73211,,1,
HALLMARK_APICAL_JUNCTION,HALLMARK_APICAL_JUNCTION,,,,0.002,25.0,0.72989,,1,
HALLMARK_APICAL_SURFACE,HALLMARK_APICAL_SURFACE,,,,0.02048,2.5,0.69557,0.11269,2,


In [47]:
# Define the set of generic pathways: the first 10 rows after sorting by "Z score".
# NOTE(review): the rows displayed for this table have an empty "Z score", so
# this sort may not be ranking anything yet -- confirm the column is populated.
pathways_by_z_score = pathway_stats.sort_values(by="Z score", ascending=True)
generic_pathway_data = pathways_by_z_score.head(10)

generic_pathway_data.head()

Unnamed: 0_level_0,Pathway,P-value (Real),Rank (Real),Test statistic (Real),Median p-value (simulated),Median rank (simulated),Mean test statistic (simulated),Std deviation (simulated),Number of experiments (simulated),Z score
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HALLMARK_ADIPOGENESIS,HALLMARK_ADIPOGENESIS,,,,0.001,8.0,0.62604,0.09741,5,
HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_ALLOGRAFT_REJECTION,,,,0.0045,8.0,0.6415,0.07065,4,
HALLMARK_ANDROGEN_RESPONSE,HALLMARK_ANDROGEN_RESPONSE,,,,0.01399,24.0,0.73211,,1,
HALLMARK_APICAL_JUNCTION,HALLMARK_APICAL_JUNCTION,,,,0.002,25.0,0.72989,,1,
HALLMARK_APICAL_SURFACE,HALLMARK_APICAL_SURFACE,,,,0.02048,2.5,0.69557,0.11269,2,


In [None]:
# Manually compare against the Powers et al. publication
# https://academic.oup.com/bioinformatics/article/34/13/i555/5045793