In [1]:
import numpy as np
import pandas as pd
import random
from pathlib import Path
from scipy.stats import zscore
import os
from scipy.stats import ranksums
# Project root: after os.chdir, every relative path used in this notebook
# (input_data/, Figure5/, Remaining_Analyses/) is resolved against it.
# Set the BTC_ATLAS_WORKING_DIR environment variable to run on another machine
# without editing this cell; the original location remains the default.
working_dir = os.environ.get(
    "BTC_ATLAS_WORKING_DIR",
    "/Users/nkarisan/PycharmProjects/BTC_Cell_Line_Atlas_Manuscript",
)
os.chdir(working_dir)

# Kept after os.chdir: presumably the `scripts` package is located relative to
# the project root (TODO confirm).
import scripts.io_library as io_library
# Point the project I/O helpers at the input folder and at this notebook's output folder.
io_library.input_dir = 'input_data/'
io_library.output_dir = 'Remaining_Analyses/output_files/'

from scripts.io_library import MyLib
from scripts.utils import Utils
from scripts.gene_sets_ssgsea import GSEA

In [2]:
import matplotlib
seed = 2023
random.seed(seed)
np.random.seed(seed)

matplotlib.use("nbagg")


%matplotlib inline
%config InlineBackend.figure_format = "retina"


%load_ext autoreload
%autoreload 2

This notebook identifies pathways enriched in RNA cluster R3 relative to R4, and vice versa, by comparing per-sample gene set enrichment (ssGSEA) scores between the two clusters; the results are presented in Supplementary Figure 8.

# Load dataset

In [3]:
# Input tables; the loader messages printed further down show that both are
# read from under input_data/.
upset_file = 'cell_lines_master_upset_2025-2-24.csv'  # per-cell-line annotation table
rna_all_file = 'Main_Data/RNA/rna_cell_lines.csv'     # RNA expression matrix

# Plot colours (hex codes).
base_color = '#d4d4d4'
maroon_color = '#A00000'

In [4]:
# Expression matrix, transposed so that samples are the rows.
exp_df = MyLib.load_csv(rna_all_file, index_col=0).T

# Annotation table keyed by cell-line name.
raw_upset_df = MyLib.load_csv(upset_file).set_index('Cell_line', drop=True)

# Keep only the Subtype column of the lines flagged RNA == 1, then order the
# rows like the expression samples (.loc raises KeyError if an expression
# sample is missing from the filtered table).
upset_df = (
    raw_upset_df
    .loc[raw_upset_df['RNA'] == 1, ["Subtype"]]
    .loc[exp_df.index, :]
)


input_data/Main_Data/RNA/rna_cell_lines.csv  is loaded, shape:  (17649, 79)

input_data/cell_lines_master_upset_2025-2-24.csv  is loaded, shape:  (87, 12)



In [5]:
# Preview the Subtype annotation (one row per cell line, indexed by cell-line name).
upset_df

Unnamed: 0,Subtype
CCLP1,ICC
CCSW1,ICC
AOVC1,AC
ECC3,ECC
EGI1,ECC
...,...
TFK1,ECC
TGBC1TKB,GBC
TGBC52TKB,AC
TKKK,ICC


# Gene Sets ssGSEA: R3 vs R4

In [7]:
# Cluster assignments exported by the Figure 5 analysis; the index holds the cell-line ids.
RNA_upset_file = 'Figure5/output_fig5/RNA_Gene_reordered_upset_df.csv'
RNA_upset_df = pd.read_csv(RNA_upset_file, index_col=0)

# Collect the ids of the cell lines assigned to each of the two clusters being compared.
rna_cluster_s = RNA_upset_df['RNA_Cluster']
r3_ids_l = RNA_upset_df.index[(rna_cluster_s == 'R3').to_numpy()].tolist()
r4_ids_l = RNA_upset_df.index[(rna_cluster_s == 'R4').to_numpy()].tolist()
print(len(r3_ids_l), len(r4_ids_l))

22 23


In [None]:
# Reuse the cached ssGSEA z-score table when it exists; otherwise compute it.
gene_sets_ssgsea_file = 'gene_sets_ssgsea_zscore.csv'
ssgsea_cache_path = Path(io_library.output_dir) / gene_sets_ssgsea_file
if not ssgsea_cache_path.is_file():
    # Cache miss: run the full gene-set scoring here (takes a few hours).
    # Running gene_sets_ssgsea.py beforehand creates 'gene_sets_ssgsea_zscore.csv'
    # and avoids this branch.
    msigdb_dir = 'input_data/Additional_Data/Selected_Lists/msigdb_v2023.2.Hs_GMTs/'
    gene_sets_ssgsea_zscore_df = GSEA.run_gene_sets_ssgsea_analysis(rna_all_file, upset_file, msigdb_dir)
else:
    gene_sets_ssgsea_zscore_df = pd.read_csv(ssgsea_cache_path, index_col=0)
    print('gene_sets_ssgsea_zscore.csv is loaded, shape:', gene_sets_ssgsea_zscore_df.shape)
print(gene_sets_ssgsea_zscore_df.shape)
# Transposed preview: gene sets as rows, first five shown.
gene_sets_ssgsea_zscore_df.T.head()


Program started at 2025/04/22 23:47:01

input_data/cell_lines_master_upset_2025-2-24.csv  is loaded, shape:  (87, 12)

input_data/Main_Data/RNA/rna_cell_lines.csv  is loaded, shape:  (17649, 79)

input_data/Additional_Data/Selected_Lists/msigdb_v2023.2.Hs_GMTs/c2.all.v2023.2.Hs.symbols.gmt
7233
input_data/Additional_Data/Selected_Lists/msigdb_v2023.2.Hs_GMTs/c5.all.v2023.2.Hs.symbols.gmt
16008
input_data/Additional_Data/Selected_Lists/msigdb_v2023.2.Hs_GMTs/c4.all.v2023.2.Hs.symbols.gmt
1007
input_data/Additional_Data/Selected_Lists/msigdb_v2023.2.Hs_GMTs/c8.all.v2023.2.Hs.symbols.gmt
830
input_data/Additional_Data/Selected_Lists/msigdb_v2023.2.Hs_GMTs/c6.all.v2023.2.Hs.symbols.gmt
189

Total gene sets: 25245
Number of cpus: 10
1264
[1] "Calculating ranks..."
5688
[1] "Calculating ranks..."
[1] "Calculating absolute values from ranks..."
[1] "Calculating absolute values from ranks..."
[1] "Normalizing..."
[1] "Normalizing..."
4424
[1] "Calculating ranks..."
[1] "Calculating absolute v

In [15]:
# Dimensions of the ssGSEA z-score table (rows: cell lines, columns: gene sets).
gene_sets_ssgsea_zscore_df.shape

(56, 25245)

In [11]:
# Split the z-score table into the R3 and R4 cell lines (label-based row selection;
# .loc raises KeyError if a listed cell line is missing from the table).
r3_gene_sets_ssgsea_zscore_df, r4_gene_sets_ssgsea_zscore_df = (
    gene_sets_ssgsea_zscore_df.loc[cluster_ids]
    for cluster_ids in (r3_ids_l, r4_ids_l)
)

In [13]:
# Inspect the R4 subset (rows: R4 cell lines, columns: gene sets).
r4_gene_sets_ssgsea_zscore_df

Unnamed: 0_level_0,ABBUD_LIF_SIGNALING_1_DN,ABBUD_LIF_SIGNALING_1_UP,ABBUD_LIF_SIGNALING_2_DN,ABBUD_LIF_SIGNALING_2_UP,ABDELMOHSEN_ELAVL4_TARGETS,ABDULRAHMAN_KIDNEY_CANCER_VHL_DN,ABE_INNER_EAR,ABE_VEGFA_TARGETS,ABE_VEGFA_TARGETS_2HR,ABE_VEGFA_TARGETS_30MIN,...,TBK1.DN.48HRS_DN,TBK1.DN.48HRS_UP,TGFB_UP.V1_DN,TGFB_UP.V1_UP,VEGF_A_UP.V1_DN,VEGF_A_UP.V1_UP,WNT_UP.V1_DN,WNT_UP.V1_UP,YAP1_DN,YAP1_UP
Cell_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ICC9,-1.055671,-0.959971,-0.959715,-1.528777,-1.766995,-1.162723,-0.061337,1.494974,1.072545,0.962177,...,-1.8634,-2.565998,-2.315821,-0.759019,-0.703572,-1.452274,-0.310199,-0.30986,0.108854,-1.45258
HUCCT1,-1.116497,-0.801913,1.227025,-1.513305,-0.380342,-0.445225,-0.765887,1.607359,-0.351352,0.950891,...,0.629244,-1.427698,-0.08725,-1.022558,1.229868,-1.260291,-0.833773,-0.556581,-1.136982,0.178972
ICC8,0.604952,0.174533,0.311275,-0.385488,-1.565966,-0.809576,-0.571195,1.630972,1.439877,-0.316799,...,-0.401127,-1.409628,0.754456,-0.147045,-0.198405,-0.308803,-0.0591,1.06098,0.750303,0.956239
ICC12,-1.043329,-0.250657,0.570472,-0.917022,-0.319883,-2.211107,0.261571,2.063891,1.767242,1.200361,...,0.361632,-1.976693,-1.401867,0.690269,-0.085698,0.234329,0.773834,0.285866,0.871878,0.093963
EGI1,-0.061561,0.279287,-0.126429,0.19785,-2.480126,-1.305544,1.293772,-1.358384,0.21424,-0.821536,...,-0.133383,-2.094759,0.627492,-0.853384,-2.221267,0.509756,0.962311,0.969339,-0.458061,-1.329489
AOVC1,-0.53192,-0.709072,-1.47856,0.216136,-0.964475,-0.458292,0.737621,0.693266,0.79152,0.640656,...,-0.07141,-0.031816,-0.151301,-0.142134,-0.523277,-0.503552,-0.146975,-1.611922,-0.801813,-0.762515
TGBC1TKB,-1.1049,0.883849,0.45023,-0.499096,-0.007753,-0.538559,-0.081908,1.259358,1.507725,0.390145,...,0.849593,-1.063183,-0.168662,-0.217218,0.011223,-0.055325,0.315758,-0.466302,0.403474,1.369719
YSCCC,-0.583315,0.297255,1.012507,0.472771,-0.143076,0.545085,-0.92696,-0.123335,0.231471,-1.407089,...,-0.667453,0.506408,0.060064,-0.836895,-0.91211,-0.626488,-1.066459,-0.485043,0.623464,1.590242
ICC2,-0.434974,0.535755,0.255612,-0.70345,-1.728758,-0.460847,-0.598017,-1.173071,-0.989619,0.716452,...,1.16611,-0.980855,0.202605,-0.171593,-1.026185,0.278242,1.100571,0.8146,-1.294231,-0.310339
ECC3,-0.976794,1.085445,0.212048,-0.669452,-2.379331,-0.862014,0.732841,1.428901,0.995957,1.429371,...,0.382497,0.0051,1.107317,-0.245272,0.041723,-1.073214,-0.301379,-0.032125,0.883965,-1.251157


In [14]:
# Per-gene-set comparison of R3 (first argument) against R4 (second argument).
# No volcano plot is drawn; with save_sig=True the helper writes the
# '<title>_up.csv' and '<title>_down.csv' tables reported in the output.
# test='ranksum' is presumably the Wilcoxon rank-sum test — confirm in scripts/utils.py.
Utils.statistical_test_and_volcano_plot(
    r3_gene_sets_ssgsea_zscore_df,
    r4_gene_sets_ssgsea_zscore_df,
    test='ranksum',
    plot_volcano=False,
    save_sig=True,
    title='R3_vs_R4_gene_sets_ssgsea_zscore',
)


File  Remaining_Analyses/output_files/R3_vs_R4_gene_sets_ssgsea_zscore_down.csv saved, shape: (1230, 7)

File  Remaining_Analyses/output_files/R3_vs_R4_gene_sets_ssgsea_zscore_up.csv saved, shape: (8647, 7)


Unnamed: 0,feature,stat,p-value,effect_size,q-value,in_group_mean,out_group_mean
0,HP_GLUCOCORTOCOID_INSENSITIVE_PRIMARY_HYPERALD...,-4.949860,7.426677e-07,-0.737882,0.000170,-0.638144,0.814816
1,HP_UNDETECTABLE_LIGHT_AND_DARK_ADAPTED_ELECTRO...,-4.859037,1.179580e-06,-0.724342,0.000187,-0.868642,0.592517
2,GOBP_COMMON_MYELOID_PROGENITOR_CELL_PROLIFERATION,-4.722802,2.326167e-06,-0.704034,0.000241,-0.810625,0.660669
3,HAY_BONE_MARROW_CD34_POS_ERP,-4.700097,2.600383e-06,-0.700649,0.000248,-0.603155,0.702208
4,HP_SHOCK,-4.654685,3.244762e-06,-0.693879,0.000269,-0.695386,0.716997
...,...,...,...,...,...,...,...
25240,WP_ANGIOTENSIN_II_RECEPTOR_TYPE_1_PATHWAY,5.472093,4.447506e-08,0.815732,0.000138,0.820023,-0.837814
25241,REACTOME_CHONDROITIN_SULFATE_BIOSYNTHESIS,5.472093,4.447506e-08,0.815732,0.000138,0.823804,-0.808637
25242,REACTOME_RESPONSE_TO_ELEVATED_PLATELET_CYTOSOL...,5.562916,2.653030e-08,0.829271,0.000138,0.887520,-0.797843
25243,HP_HEMATEMESIS,5.562916,2.653030e-08,0.829271,0.000138,1.025298,-0.782906
