In [1]:
import numpy as np
import pandas as pd
import random
import os
# Project root. Defaults to the original author's checkout; set the BTC_ATLAS_DIR
# environment variable to run the notebook from a different location without
# editing this cell.
working_dir = os.environ.get(
    "BTC_ATLAS_DIR",
    "/Users/nkarisan/PycharmProjects/BTC_Cell_Line_Atlas_Manuscript",
)
os.chdir(working_dir)

# Imported only after the chdir above — presumably the `scripts` package is
# resolved relative to the project root, so keep this order.
import scripts.io_library as io_library
# Base directories that the MyLib load/save helpers prepend to the relative
# file names used throughout this notebook.
io_library.input_dir = 'input_data/'
io_library.output_dir = 'data_preprocessing/output_data/'

from scripts.io_library import MyLib
from scripts.utils import Dataset

In [2]:
import matplotlib
# Fix the seed of both Python's `random` module and NumPy's global RNG.
seed = 2023
random.seed(seed)
np.random.seed(seed)

# NOTE(review): the "nbagg" backend is selected here, but `%matplotlib inline`
# below selects a backend again — this call looks redundant; confirm before removing.
matplotlib.use("nbagg")


# Draw figures inline in the notebook, using the "retina" figure format.
%matplotlib inline
%config InlineBackend.figure_format = "retina"


# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Gene exclusion list

The output file "all_common_genes.csv" is written to "data_preprocessing/output_data/" and should then be copied to "input_data/Additional_Data/Selected_Lists".

In [40]:
# Three DepMap gene lists; each file is loaded as a single-column table.
common_essentials_df = MyLib.load_csv('Additional_Data/DepMap/AchillesCommonEssentialControls.csv')
none_essentials_df = MyLib.load_csv('Additional_Data/DepMap/AchillesNonessentialControls.csv')
inferred_common_essentials_df = MyLib.load_csv('Additional_Data/DepMap/CRISPRInferredCommonEssentials.csv')

# Pool the three lists into one de-duplicated set of gene labels.
pooled_genes = set(common_essentials_df.Gene.tolist())
pooled_genes |= set(none_essentials_df.Gene.tolist())
pooled_genes |= set(inferred_common_essentials_df.Essentials.tolist())
# KRAS is deliberately taken out of the exclusion list.
pooled_genes.discard('KRAS (3845)')

# Sorted so the saved file has a stable row order.
genes_exclude = sorted(pooled_genes)
df = pd.DataFrame(genes_exclude, columns=['Gene'])
MyLib.save_csv(df, 'all_common_genes.csv', index=False)

input_data/Additional_Data/DepMap/AchillesCommonEssentialControls.csv  is loaded, shape:  (1247, 1)

input_data/Additional_Data/DepMap/AchillesNonessentialControls.csv  is loaded, shape:  (781, 1)

input_data/Additional_Data/DepMap/CRISPRInferredCommonEssentials.csv  is loaded, shape:  (1537, 1)


File  data_preprocessing/output_data/all_common_genes.csv saved, shape: (2577, 1)


# Extract BTC and HCC lines from DepMap dataset

At the time of this analysis, not all RNA, mutation, and proteomics data had been made publicly available on the DepMap portal. Therefore, we extracted only the publicly available datasets and used them in conjunction with our own to-be-published datasets.

Note that all output files along with newly published data are available in the Main_Data/ folder on Figshare.

In [3]:
# Folder holding the DepMap downloads (relative to io_library.input_dir).
# Named `depmap_dir` rather than `dir` so the builtin `dir()` is not shadowed
# for the rest of the kernel session.
depmap_dir = 'Additional_Data/DepMap/'

# Dataset key -> (file name, extra keyword arguments for MyLib.load_csv).
# Fusion is the only table loaded without `index_col=0`.
depmap_sources = {
    'CRISPR': ('CRISPRGeneEffect.csv', {'index_col': 0}),
    'Mut_Hot': ('OmicsSomaticMutationsMatrixHotspot.csv', {'index_col': 0}),
    'Mut_Dam': ('OmicsSomaticMutationsMatrixDamaging.csv', {'index_col': 0}),
    'CNV': ('OmicsCNGene.csv', {'index_col': 0}),
    'Fusion': ('OmicsFusionFiltered.csv', {}),
}

# Load in the order listed above so the dictionary keeps the same key order.
raw_datasets_dic = dict()
for dataset_key, (file_name, load_kwargs) in depmap_sources.items():
    raw_datasets_dic[dataset_key] = MyLib.load_csv(os.path.join(depmap_dir, file_name), **load_kwargs)

input_data/Additional_Data/DepMap/CRISPRGeneEffect.csv  is loaded, shape:  (1100, 18443)

input_data/Additional_Data/DepMap/OmicsSomaticMutationsMatrixHotspot.csv  is loaded, shape:  (1111, 467)

input_data/Additional_Data/DepMap/OmicsSomaticMutationsMatrixDamaging.csv  is loaded, shape:  (1111, 17390)

input_data/Additional_Data/DepMap/OmicsCNGene.csv  is loaded, shape:  (1111, 25368)

input_data/Additional_Data/DepMap/OmicsFusionFiltered.csv  is loaded, shape:  (33328, 17)



In [4]:
# MAF file containing mutations of the following types:
#   NONSENSE, FRAME_SHIFT_DEL, FRAME_SHIFT_INS, MISSENSE, SILENT,
#   SPLICE_SITE, IN_FRAME_INS, NONSTOP, IN_FRAME_DEL
mut_maf_file = 'Main_Data/Mutations/all_cell_lines_mut_type.csv'
mut_maf_df = MyLib.load_csv(mut_maf_file)

input_data/Main_Data/Mutations/all_cell_lines_mut_type.csv  is loaded, shape:  (127291, 7)



In [5]:
# Master list of cell lines; the first CSV column is used as the index.
upset_file = 'cell_lines_master_upset_2025-2-24.csv'
upset_df = MyLib.load_csv(upset_file, index_col=0)

input_data/cell_lines_master_upset_2025-2-24.csv  is loaded, shape:  (87, 11)



In [16]:
# Build the analysis datasets from the cell-line master table, the raw DepMap
# tables and the mutation MAF table.
# NOTE(review): presumably this subsets every table to the cell lines listed in
# `upset_df` and derives the extra 'Mut_other' entry from the MAF table — verify
# against scripts.utils.Dataset.create_dataset.
datasets_dic = Dataset.create_dataset(upset_df, raw_datasets_dic, mut_maf_df)
# Display which datasets were produced.
datasets_dic.keys()

dict_keys(['CRISPR', 'Mut_Hot', 'Mut_Dam', 'CNV', 'Fusion', 'Mut_other'])

In [20]:
# (dataset key, output file name) pairs, in the order the files are written.
export_plan = (
    ('CRISPR', 'CRISPR.csv'),
    ('CNV', 'CNV.csv'),
    ('Fusion', 'Fusion.csv'),
    ('Mut_Dam', 'Mut_Dam.csv'),
    ('Mut_Hot', 'Mut_Hot.csv'),
    ('Mut_other', 'Mut_other.csv'),
)

# Each table is transposed before it is written to the output directory.
for dataset_key, csv_name in export_plan:
    MyLib.save_csv(datasets_dic[dataset_key].T, csv_name)


File  data_preprocessing/output_data/CRISPR.csv saved, shape: (18443, 60)

File  data_preprocessing/output_data/CNV.csv saved, shape: (25368, 71)

File  data_preprocessing/output_data/Fusion.csv saved, shape: (1231, 68)

File  data_preprocessing/output_data/Mut_Dam.csv saved, shape: (17390, 71)

File  data_preprocessing/output_data/Mut_Hot.csv saved, shape: (467, 71)

File  data_preprocessing/output_data/Mut_other.csv saved, shape: (13578, 71)


# CNV: genes of interest

The output file "goi_cnv.csv" is written to "data_preprocessing/output_data/" and should then be copied to "input_data/Additional_Data/Selected_Lists".

In [5]:
# Genes of interest for the copy-number summary.
goi = [
    'FGFR2', 'KRAS', 'BRAF', 'EGFR', 'ERBB2', 'PTEN', 'PIK3CA',
    'AKT1', 'ARID1A', 'PBRM1', 'ARID2', 'BAP1', 'KMT2A', 'KMT2D',
    'TET2', 'IDH1', 'IDH2', 'SMAD4', 'TGFBR1', 'TGFBR2', 'ELF3',
    'CTNNB1', 'APC', 'AXIN1', 'TP53', 'CDKN2A', 'MYC', 'MTAP',
]

# Copy-number table: genes on the rows as loaded from disk.
cnv_file = 'Main_Data/CN/CNV.csv'
cnv_tb = MyLib.load_csv(cnv_file, index_col=0)

# Row labels are expected to look like "SYMBOL (id)"; keep only the symbol.
# Index.map is used on purpose: it keeps the index name, which later becomes
# the name of the column axis after the transpose.
cnv_tb.index = cnv_tb.index.map(lambda gene_label: gene_label.split(' (')[0])

# Flip so that genes become columns and can be selected by name.
cnv_tb = cnv_tb.T

# Restrict to the genes of interest and expose the row labels as a 'Cell' column.
CNV_4 = cnv_tb.loc[:, goi].copy()
CNV_4['Cell'] = CNV_4.index.values

def cnv_prepare(value, homo_loss_max=0.25, het_loss_max=0.5, gain_min=2):
    """Classify a single copy-number value into a categorical call.

    Parameters
    ----------
    value : float
        Copy-number value for one gene in one cell line (same scale as the
        CNV table this notebook loads).
    homo_loss_max : float, default 0.25
        Values strictly below this are called 'Homo_loss'.
    het_loss_max : float, default 0.5
        Values from `homo_loss_max` up to (but excluding) this are 'Het_loss'.
    gain_min : float, default 2
        Values strictly above this are called 'Gain'.

    Returns
    -------
    str
        One of 'Homo_loss', 'Het_loss', 'Gain' or 'Neutral'. NaN fails every
        comparison and is therefore reported as 'Neutral'.
    """
    if value < homo_loss_max:
        return 'Homo_loss'
    # Reaching this point already implies value >= homo_loss_max (or NaN),
    # so only the upper bound needs checking.
    if value < het_loss_max:
        return "Het_loss"
    if value > gain_min:
        return "Gain"
    return 'Neutral'

# Reshape to long format: one row per (cell line, gene) pair.
# `var_name` is given explicitly: without it pd.melt names the gene column after
# the column-axis name of CNV_4 (falling back to 'variable'), so the 'Gene'
# selection below would only work when the source CSV happened to carry that name.
CNV_5 = pd.melt(CNV_4, id_vars='Cell', var_name='Gene', value_name='value')
CNV_5['Copy_number'] = CNV_5['value'].apply(cnv_prepare)

# Keep only altered genes, then order and rename the columns for the output file.
CNV_5 = (
    CNV_5.loc[CNV_5['Copy_number'] != 'Neutral', ['Gene', 'Cell', 'Copy_number', 'value']]
    .rename(columns={'Cell': 'Sample_name', 'Copy_number': 'CN'})
    .sort_values(by='Gene')
)

MyLib.save_csv(CNV_5, 'goi_cnv.csv', index=False)

input_data/Main_Data/CN/CNV.csv  is loaded, shape:  (25368, 71)


File  data_preprocessing/output_data/goi_cnv.csv saved, shape: (74, 4)
