In [8]:
import pandas as pd
import mygene

In [15]:
# Load the per-gene risk table. 'ElementName' is read as a string because it is
# used below both as the MyGene query term and as the merge key.
all_names = pd.read_csv(
    '/home/gchan/ermineJ.data/risk_gene_counts.txt',
    sep='\t',
    dtype={'ElementName': str, 'abagen name': str, 'gene name': str, 'counts': int, 'max_corr': float},
)


def _first_ensembl_id(row):
    """Return one Ensembl gene ID for a MyGene result row, or None if it has none.

    A single mapping is read from the flattened 'ensembl.gene' column; when the
    row instead carries a list under 'ensembl', the first entry's 'gene' is used.
    `row.get` is used so that a missing column (e.g. no query had several
    mappings, so no 'ensembl' column was created) yields None instead of KeyError.
    """
    single = row.get('ensembl.gene')
    if pd.notna(single):
        return single
    multiple = row.get('ensembl')
    if isinstance(multiple, list) and len(multiple) > 0:
        return multiple[0].get('gene')
    return None


# Convert the 'ElementName' identifiers (queried with scopes='entrezgene') to Ensembl IDs
mg = mygene.MyGeneInfo()
ensembl_genes = mg.querymany(all_names['ElementName'].values.tolist(), scopes='entrezgene', fields='ensembl.gene', species='human', as_dataframe=True)
ensembl_genes['ensembl'] = ensembl_genes.apply(_first_ensembl_id, axis=1)

# Merge the ensembl_genes dataframe with all_names dataframe.
# The MyGene frame is indexed by query term, so the join is ElementName -> index.
all_names = all_names.merge(ensembl_genes[['ensembl']], left_on='ElementName', right_index=True, how='inner')

# Drop rows without an Ensembl ID *before* de-duplicating: drop_duplicates treats
# all missing values as one duplicate group and would otherwise keep a single
# ID-less row, which would then be counted as a gene downstream.
all_names = (
    all_names
    .dropna(subset=['ensembl'])
    .drop_duplicates(subset=['ensembl'], keep='first')
    .reset_index(drop=True)
)

# Filter risk names: genes with at least one count
risk_names = all_names[all_names['counts'] > 0].reset_index(drop=True)
display(all_names)


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
7 input query terms found dup hits:	[('284697', 2), ('8335', 2), ('8969', 3), ('8346', 5), ('23499', 2), ('10200', 2), ('25937', 2)]


Unnamed: 0,ElementName,abagen name,gene name,counts,max_corr,ensembl
0,1,A1BG,A1BG,3,0.560967,ENSG00000121410
1,2,A2M,A2M,0,0.438845,ENSG00000175899
2,144568,A2ML1,A2ML1,0,0.437742,ENSG00000166535
3,127550,A3GALT2,A3GALT2,0,0.454256,ENSG00000184389
4,53947,A4GALT,A4GALT,0,0.441503,ENSG00000128274
...,...,...,...,...,...,...
9931,79364,ZXDC,ZXDC,1,0.515856,ENSG00000070476
9932,79699,ZYG11B,ZYG11B,0,0.492594,ENSG00000162378
9933,7791,ZYX,ZYX,0,0.470202,ENSG00000285443
9934,23140,ZZEF1,ZZEF1,0,0.390854,ENSG00000074755


In [18]:
# Load the whitespace-delimited eQTL gene table. The separator is a raw string:
# '\s' inside a normal string literal is an invalid escape sequence and triggers
# a DeprecationWarning/SyntaxWarning on recent Python versions.
eqtls = pd.read_csv('/fs03/kg98/gchan/SIR_SCZ/SIR_utils/enrichment/pgcSCZ3_eQTLbrain_genes.txt', sep=r'\s+')
eqtls = eqtls.rename(columns={"GENE": "Ensembl.ID"})
# Bonferroni threshold: 0.05 divided by the number of rows loaded (evaluated before filtering).
eqtls = eqtls[eqtls['P'] < (0.05/eqtls.shape[0])].reset_index(drop=True)
# eqtls = pd.read_csv('/home/gchan/kg98_scratch/gchan/SIR_SCZ/SIR_utils/enrichment/41586_2022_4434_Tab1_ESM.csv')
# Keep only genes that are present in the mapped gene universe.
eqtls = eqtls[eqtls['Ensembl.ID'].isin(all_names['ensembl'])].reset_index(drop=True)
display(eqtls.shape)

(220, 11)

In [19]:
import numpy as np
from tqdm import tqdm

# Step 1: Calculate the Enrichment Score (ES)
def calculate_es(gene_list, gene_set):
    """Compute an unweighted, one-sided enrichment score for a ranked gene list.

    Walking down `gene_list` in order, the running sum is incremented by
    1 / n_hits at every gene that is in `gene_set` and decremented by
    1 / n_misses otherwise; the score is the maximum of that running sum.

    The increments are normalised by the number of hits actually found in
    `gene_list` (not by `len(gene_set)`), so the running sum returns to zero at
    the end of the list even when `gene_set` contains genes that are absent
    from `gene_list` or `gene_list` contains repeated entries.

    Parameters
    ----------
    gene_list : sequence
        Genes in ranked order.
    gene_set : iterable
        Genes whose enrichment towards the top of `gene_list` is scored.

    Returns
    -------
    float
        Maximum of the running sum, or NaN when the score is undefined
        (no gene of `gene_list` is in `gene_set`, or every gene is).
    """
    hits = np.isin(gene_list, list(gene_set)).astype(int)
    n_hits = int(hits.sum())
    n_misses = hits.size - n_hits

    # With no hits or no misses one of the step sizes would be a division by zero.
    if n_hits == 0 or n_misses == 0:
        return np.nan

    # Calculate the running sum
    running_sum = np.cumsum(hits / n_hits - (1 - hits) / n_misses)
    return running_sum.max()

# Step 2: Estimate the Statistical Significance (Nominal P value)
def permute_and_calculate_es(gene_list, gene_set, n_permutations=1000, seed=None):
    """Build a permutation null distribution of enrichment scores.

    `gene_list` is shuffled `n_permutations` times and `calculate_es` is
    evaluated on each shuffled order against the same `gene_set`.

    Parameters
    ----------
    gene_list : sequence
        Genes in ranked order; only the order is randomised.
    gene_set : iterable
        Gene set passed unchanged to `calculate_es`.
    n_permutations : int, default 1000
        Number of shuffles.
    seed : int or None, default None
        When given, the shuffles are drawn from a private legacy
        `RandomState(seed)`, which yields the same stream as calling
        `np.random.seed(seed)` first but leaves the global NumPy RNG untouched.
        When None, the global NumPy RNG is used as-is.

    Returns
    -------
    numpy.ndarray
        Array of length `n_permutations` holding one score per shuffle.
    """
    rng = np.random.RandomState(seed) if seed is not None else np.random
    # Convert once; permutation() would otherwise rebuild the array on every iteration.
    genes = np.asarray(gene_list)
    permuted_es = np.zeros(n_permutations)
    for i in tqdm(range(n_permutations)):
        permuted_es[i] = calculate_es(rng.permutation(genes), gene_set)
    return permuted_es

def run_gsea(gene_list, gene_set, n_permutations=1000, seed=None):
    """Score a ranked gene list against a gene set and print an empirical p-value.

    The observed score comes from `calculate_es`; the null distribution comes
    from `permute_and_calculate_es`. The printed p-value is the fraction of
    permuted scores that are greater than or equal to the observed one.
    NOTE(review): this estimate can be exactly 0 when no permutation reaches the
    observed score; consider (count + 1) / (n_permutations + 1) if that matters.

    Parameters
    ----------
    gene_list : sequence
        Genes in ranked order.
    gene_set : iterable
        Gene set to test.
    n_permutations : int, default 1000
        Number of shuffles used for the null distribution.
    seed : int or None, default None
        Forwarded to `permute_and_calculate_es` so the null distribution (and
        therefore the printed p-value) can be reproduced.

    Returns
    -------
    tuple
        `(observed_es, permuted_es)`; the p-value is printed, not returned.
    """
    observed_es = calculate_es(gene_list, gene_set)
    permuted_es = permute_and_calculate_es(gene_list, gene_set, n_permutations, seed=seed)
    p_value = np.mean(permuted_es >= observed_es)

    print(f"Observed ES: {observed_es}")
    print(f"Empirical P value: {p_value}")

    return observed_es, permuted_es


In [20]:
# Seed for the permutation null so the empirical p-values below are reproducible.
# The global NumPy RNG is seeded before each run, so either run can be repeated
# on its own and give the same result.
PERMUTATION_SEED = 0

# with eQTLs as gene set, risk gene max corr as gene list
all_names = all_names.sort_values(by='max_corr', ascending=False)
gene_list = all_names['ensembl'].tolist()
gene_set = set(eqtls['Ensembl.ID'].tolist())
print("eQTLs as gene set, risk gene max corr as gene list")
np.random.seed(PERMUTATION_SEED)
obs, perm = run_gsea(gene_list, gene_set, n_permutations=1000)

# with eQTLs as gene set, risk counts as gene list
# NOTE(review): 'counts' has many tied values, so the relative order of tied
# genes is decided by the sort implementation — confirm this is acceptable.
all_names = all_names.sort_values(by='counts', ascending=False)
gene_list = all_names['ensembl'].tolist()
print("eQTLs as gene set, risk counts as gene list")
np.random.seed(PERMUTATION_SEED)
obs2, perm2 = run_gsea(gene_list, gene_set, n_permutations=1000)


  0%|          | 0/1000 [00:00<?, ?it/s]

eQTLs as gene set, risk gene max corr as gene list


100%|██████████| 1000/1000 [01:05<00:00, 15.27it/s]


Observed ES: 0.08065795875594185
Empirical P value: 0.061


  0%|          | 0/1000 [00:00<?, ?it/s]

eQTLs as gene set, risk counts as gene list


100%|██████████| 1000/1000 [01:11<00:00, 13.91it/s]

Observed ES: 0.05331973502002058
Empirical P value: 0.266





In [21]:
import scipy.stats as stats

# Gene universes for the 2x2 table:
#   all_genes         - every gene with an Ensembl ID in all_names
#   experimental_hits - genes in risk_names (counts > 0)
#   pathway_genes     - genes retained in the eQTL table
all_genes = set(all_names['ensembl'].values)
pathway_genes = set(eqtls['Ensembl.ID'].values)
experimental_hits = set(risk_names['ensembl'].values)
non_hits = all_genes.difference(experimental_hits)

# Cell counts: rows are hit / non-hit, columns are in / not in the pathway
hits_in_pathway = len(experimental_hits.intersection(pathway_genes))
hits_not_in_pathway = len(experimental_hits) - hits_in_pathway
non_hits_in_pathway = len(non_hits.intersection(pathway_genes))
non_hits_not_in_pathway = len(non_hits) - non_hits_in_pathway

contingency_table = [
    [hits_in_pathway, hits_not_in_pathway],
    [non_hits_in_pathway, non_hits_not_in_pathway],
]

# Fisher's exact test on the table (scipy's default alternative)
odds_ratio, p_value = stats.fisher_exact(contingency_table)

print(contingency_table)
print(f"Odds Ratio: {odds_ratio}")
print(f"P-value: {p_value}")

[[59, 2185], [161, 7531]]
Odds Ratio: 1.263069772730503
P-value: 0.14180477550874823


In [None]:
# Extract max_corr values and eQTL p-values for the genes in the pathway.
# The gene column is 'Ensembl.ID' here: 'GENE' was renamed when eqtls was
# loaded, so indexing by 'GENE' raises KeyError on a fresh run.
pathway_data = eqtls[eqtls['Ensembl.ID'].isin(all_genes)]
# NOTE: despite its name, `counts` holds the 'max_corr' value of each pathway
# gene, looked up in the same row order as pathway_data.
counts = all_names.set_index('ensembl').loc[pathway_data['Ensembl.ID'], 'max_corr']
# Uses the 'P' column that the eQTL filter relies on. Spearman is rank-based, so
# a Bonferroni-scaled version of the same p-values would give the same result.
p_values = pathway_data['P']

# Calculate Spearman's rank correlation
spearman_corr, spearman_p_value = stats.spearmanr(counts, p_values)

print(f"Spearman Correlation: {spearman_corr}")
print(f"Spearman P-value: {spearman_p_value}")

Spearman Correlation: -0.02190030855487883
Spearman P-value: 0.051643007682471195
