In [330]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
from adjustText import adjust_text
from statsmodels.stats.multitest import fdrcorrection, multipletests
import matplotlib.colors as mcolors
from gseapy import barplot, dotplot
import gseapy as gp

In [331]:
# Locations of the three input CSV tables on the local machine.
# The shared directory is spelled once so it can be changed in a single place.
DATA_DIR = '/Users/connormullins/Excel sheets'
model = f'{DATA_DIR}/Model.csv'
osm = f'{DATA_DIR}/OmicsSomaticMutations.csv'
effect = f'{DATA_DIR}/CRISPRGeneEffect.csv'

In [None]:
# Load the three input tables. The model and gene-effect tables use their
# first column as the row index; the mutation table keeps a default index.
effect_df = pd.read_csv(effect, header=0, index_col=0)
model_df = pd.read_csv(model, header=0, index_col=0)
osm_df = pd.read_csv(
    osm,
    header=0,
    low_memory=False,  # parse in one pass to avoid chunk-wise mixed dtype inference
)

In [None]:
# Analysis parameters:
#   cancer_type  - value matched against the 'OncotreePrimaryDisease' column
#   protein      - value matched against the 'HugoSymbol' column
#   gene_change  - value matched against the 'ProteinChange' column
cancer_type = 'Non-Small Cell Lung Cancer'
protein = 'KRAS'
gene_change = 'p.G12V'

In [None]:
# Identify the mutant cell lines: models of the selected cancer type that carry
# the requested protein change in the gene of interest.
cancer_filter = model_df[model_df['OncotreePrimaryDisease'] == cancer_type]

# Attach every somatic-mutation record to the models of that cancer type.
osm_filter_pie = cancer_filter.merge(osm_df, on=['ModelID'], how='inner')

# All mutation records in the gene of interest, whatever the protein change.
protein_filter = osm_filter_pie[osm_filter_pie['HugoSymbol'] == protein]

# Keep only the ModelID of records with the exact protein change. Duplicates
# are dropped so that a model with several matching records contributes a
# single row to the mutant group in the later merge and t-test.
protein_change_filter = (
    protein_filter
    .loc[protein_filter['ProteinChange'] == gene_change, ['ModelID']]
    .drop_duplicates()
)

In [None]:
# Retrieve wild-type cell lines: models of the selected cancer type that have
# no recorded mutation in the gene of interest.
#
# The candidate pool is taken from osm_filter_pie, which is already restricted
# to the selected cancer type. protein_filter only contains models of that
# cancer type, so drawing candidates from every ModelID in osm_df would label
# models of other cancer types -- including ones mutated in the gene -- as
# wild type.
# NOTE(review): assumes the control group is meant to share the mutant group's
# cancer type -- confirm.
cancer_model_ids = osm_filter_pie[['ModelID']].drop_duplicates()
mutated_model_ids = protein_filter['ModelID']
wt_gene = cancer_model_ids[~cancer_model_ids['ModelID'].isin(mutated_model_ids)]

In [None]:
# Pull the gene-effect rows for each group of cell lines and index them by
# ModelID; the inner join keeps only models present in effect_df.
effect_mutant = (
    protein_change_filter
    .merge(effect_df, on=['ModelID'], how='inner')
    .set_index('ModelID')
)

effect_wt = (
    wt_gene
    .merge(effect_df, on=['ModelID'], how='inner')
    .set_index('ModelID')
)

In [None]:
# Two-sample t-test per gene column (axis=0) comparing mutant and wild-type
# gene-effect scores.
# nan_policy='omit' ignores missing scores within a column; with the default
# ('propagate') a single NaN makes that gene's p-value NaN, and the later
# `< 0.05` filter then drops the gene silently.
_, p_value = scipy.stats.ttest_ind(
    effect_mutant,
    effect_wt,
    axis=0,
    nan_policy='omit',
)

In [None]:
# One row per column of effect_df, holding that column's t-test p-value.
pvalue_df = pd.DataFrame({'p-value': p_value}, index=effect_df.columns)
# Mean gene effect across the mutant cell lines; used for ranking below.
pvalue_df['mean gene effect'] = effect_mutant.mean(axis=0)

In [None]:
# Keep rows with p-value below 0.05, move the gene labels into an 'index'
# column, and order from highest to lowest mean gene effect.
# NOTE(review): the p-values are used uncorrected for multiple testing -- confirm intended.
sig_pv = (
    pvalue_df
    .loc[pvalue_df['p-value'] < 0.05]
    .reset_index()
    .sort_values(by='mean gene effect', ascending=False)
)
# The gene name is the part of the label before the first space.
# (To use the gene id instead, extract the digits between parentheses.)
sig_pv['gene name'] = sig_pv['index'].str.split(' ').str[0]


In [None]:
# Gene list for the mutant group: upper-cased names of the significant genes,
# in the order established by sig_pv.
rnk = pd.DataFrame({'gene': sig_pv['gene name'].str.upper()})

# Mean gene effect per column across the wild-type cell lines, relabelled with
# effect_df's column labels and sorted from highest to lowest score.
wt_stuff = (
    effect_wt
    .mean(axis=0, skipna=True, numeric_only=True)
    .to_frame('scores')
    .set_index(effect_df.columns)
    .reset_index()
    .sort_values(by='scores', ascending=False)
)

# Reduce each label to the text before its first space.
wt_stuff['index'] = wt_stuff['index'].str.split(' ').str[0]
# Gene list for the wild-type group (every gene, not upper-cased).
wt_rnk = pd.DataFrame({'gene': wt_stuff['index']})


In [None]:
# Gene-set definitions read from a local GMT file.
gmt_path = '/Users/connormullins/Downloads/c2.cp.pid.v2024.1.Hs.symbols.gmt'
gmt = gp.read_gmt(path=gmt_path)

In [None]:
# Enrichment of the mutant-group gene list against the gene sets loaded from
# the GMT file.
enrichr = gp.enrichr(
    gene_list=rnk,
    gene_sets=gmt,
    outdir=None,
)

In [None]:
# Mutant-group enrichment results, highest combined score first.
enr_res = enrichr.results.sort_values(by='Combined Score', ascending=False)

In [None]:
# Wrap the sorted mutant-group results in a DataFrame under the name used below.
gsea_mut = pd.DataFrame(data=enr_res)

In [None]:
# Pathway names and combined scores from the mutant-group enrichment, renamed
# to 'pathway' and 'MUT Score'.
# Built in a single step: looping over the frame's column names is unnecessary
# because the two column copies do not depend on the loop variable.
mut_pathways_and_scores = gsea_mut[['Term', 'Combined Score']].rename(
    columns={'Term': 'pathway', 'Combined Score': 'MUT Score'}
)

In [None]:
# Enrichment of the wild-type gene list against the same gene sets.
enrichr_wt = gp.enrichr(
    gene_list=wt_rnk,
    gene_sets=gmt,
    outdir=None,
)

In [None]:
# Wild-type enrichment results, highest combined score first.
gsea_wt = enrichr_wt.results.sort_values(by='Combined Score', ascending=False)

In [None]:
# Pathway names and combined scores from the wild-type enrichment, renamed to
# 'pathway' and 'WT Score'.
# Built in a single step: looping over the frame's column names is unnecessary
# because the two column copies do not depend on the loop variable.
wt_pathways_and_scores = gsea_wt[['Term', 'Combined Score']].rename(
    columns={'Term': 'pathway', 'Combined Score': 'WT Score'}
)

In [None]:
# Pathways present in both enrichment tables, with the ratio of the mutant
# score to the wild-type score.
# NOTE(review): a 'WT Score' of 0 yields an infinite (or NaN) ratio -- confirm
# whether such pathways should be kept.
enriched_shared = mut_pathways_and_scores.merge(
    wt_pathways_and_scores,
    on='pathway',
    how='inner',
)
enriched_shared['fold change'] = enriched_shared['MUT Score'].div(enriched_shared['WT Score'])

In [None]:
# Keep pathways whose mutant score exceeds 1, then take the four with the
# largest fold change.
enriched_shared = (
    enriched_shared
    .loc[lambda shared: shared['MUT Score'] > 1]
    .sort_values(by='fold change', ascending=False)
    .iloc[:4]
)
enriched_shared

In [None]:
# Pie chart of the selected pathways, with wedge sizes given by fold change.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    enriched_shared['fold change'],
    labels=enriched_shared['pathway'],
    autopct='%1.1f%%',
    startangle=90,
)
plt.show()