In [None]:
import random as rd
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
# Tab-separated tables whose first line is a header row.
ccle_map = pd.read_csv("../data/CCLE/ccle_maf.txt", sep='\t')
cell_annot = pd.read_csv("../data/CCLE/cell_line_annotations.txt", sep="\t")
gene_panels = pd.read_csv("../data/ClinicalGenePanels.txt", sep='\t')

# Tab-separated tables without a header row; column names are supplied here.
cell_index = pd.read_csv("../data/cell2ind.txt", sep="\t", header=None, names=['I', 'C'])
drug_index = pd.read_csv("../data/drug2ind.txt", sep="\t", header=None, names=['I', 'D'])
all_df = pd.read_csv("../data/drugcell_all.txt", sep='\t', header=None, names=['C', 'D', 'AUC'])
ctDNA_genes_df = pd.read_csv("../data/gene_list_ctDNA.txt", sep="\t", header=None, names=['G'])

# Plain-text gene lists: the whole file is read and split on any whitespace.
with open('../data/gene_list_NeST.txt', 'r') as nest_file:
    nest_gene_list = nest_file.read().split()

with open('../data/gene_list_clinical_trial.txt', 'r') as clinical_trial_file:
    clinical_trial_gene_list = clinical_trial_file.read().split()

with open('../data/DDRAM/gene_list_DDRAM.txt', 'r') as ddram_file:
    ddram_gene_list = ddram_file.read().split()

In [None]:
#Create genotype (cell2mutation) data

def create_genotype_data(gene_list, cell_list, cell_gene_map):
    """Build a binary cell-by-gene mutation matrix.

    Parameters
    ----------
    gene_list : sequence of str
        Genes that become the columns, in the given order.
    cell_list : sequence
        Cell identifiers; each one yields a row, in the given order.
    cell_gene_map : mapping
        Maps every cell in ``cell_list`` to a collection of its mutated
        genes (duplicates are allowed and have no effect).

    Returns
    -------
    pandas.DataFrame
        One row per cell (index 0 .. len(cell_list) - 1) and one column per
        gene; an entry is 1 if the gene is among the cell's mutated genes,
        otherwise 0.

    Raises
    ------
    KeyError
        If a cell in ``cell_list`` is missing from ``cell_gene_map``.
    """
    rows = []
    for cell in cell_list:
        # A set makes each per-gene membership check a constant-time lookup.
        mutated_genes = set(cell_gene_map[cell])
        # Iterate the gene_list argument (not a notebook-level variable) so
        # the row values always line up with the frame's columns.
        rows.append([1 if gene in mutated_genes else 0 for gene in gene_list])

    # Build the frame once instead of appending to it row by row.
    return pd.DataFrame(rows, columns=gene_list)

In [None]:
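# Creating the clinical trial gene list (the code below is commented out; the list itself is read from gene_list_clinical_trial.txt in the data-loading cell)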

#ctDNA_genes = list(ctDNA_genes_df['G'])
#fm_tempus_genes = list(gene_panels.query('`FM One` == 1 & `Tempus xT` == 1')['Gene'])
#clinical_trial_genes = []
#clinical_trial_genes.extend(ctDNA_genes)
#clinical_trial_genes.extend(fm_tempus_genes)
#clinical_trial_genes = sorted(set(clinical_trial_genes))

In [None]:
# Map each cell line in cell2ind (column 'C') that also appears as a CCLE_ID in
# the annotation table to its depMapID. Cell lines without an annotation row
# get no entry here; cell_list itself is not filtered.

cell_list = list(cell_index['C'])

cell_id_map = {
    ccle_id: depmap_id
    for ccle_id, depmap_id in zip(cell_annot['CCLE_ID'], cell_annot['depMapID'])
    if ccle_id in cell_list
}


In [None]:
# Build {cell: [mutated gene, ...]} for every cell in cell_list, restricted to
# genes in the NeST list and to the variant classes below.

mut_list = [
    'Missense_Mutation', 'Nonsense_Mutation', 'Frame_Shift_Del',
    'Frame_Shift_Ins', 'Splice_Site', 'Splice_Region', 'In_Frame_Del',
    'In_Frame_Ins', 'Nonstop_Mutation',
]

# Keep only rows for mapped cell lines, accepted variant classes and NeST genes.
keep_rows = (
    ccle_map['Broad_ID'].isin(list(cell_id_map.values()))
    & ccle_map['Variant_Classification'].isin(mut_list)
    & ccle_map['Hugo_Symbol'].isin(nest_gene_list)
)
filtered_ccle_map_df = ccle_map[keep_rows]

# A gene appears once per matching row, so a cell's list may contain repeats.
# cell_id_map[cell] raises KeyError for a cell that has no annotation entry.
cell_gene_map = {
    cell: list(
        filtered_ccle_map_df.loc[
            filtered_ccle_map_df['Broad_ID'] == cell_id_map[cell], 'Hugo_Symbol'
        ]
    )
    for cell in cell_list
}

In [None]:
# Size of the clinical trial gene list; the most-frequently-mutated and random
# gene-set cells below use it as the number of genes to select.
n = len(clinical_trial_gene_list)

In [None]:
# Gene-set option: the clinical trial gene list, used as loaded.
# NOTE(review): the two cells that follow assign these same three names, so on
# a top-to-bottom run the last one executed decides what gets exported --
# presumably only one of the three is meant to be run; confirm.
filtered_gene_list = clinical_trial_gene_list

gene2ind_file, cell2mut_file = (
    "../data/gene2ind_clinical_trial.txt",
    "../data/cell2mutation_clinical_trial.txt",
)

In [None]:
# Gene-set option: the n genes with the most rows in filtered_ccle_map_df
# (every mutation row counts, so a gene hit several times in one cell line is
# counted several times), stored in alphabetical order.

gene_mutation_freq = Counter(filtered_ccle_map_df['Hugo_Symbol'].tolist())
top_genes = [gene for gene, _count in gene_mutation_freq.most_common(n)]
filtered_gene_list = sorted(top_genes)

gene2ind_file = "../data/gene2ind_mf_{}.txt".format(n)
cell2mut_file = "../data/cell2mutation_mf_{}.txt".format(n)

In [None]:
# Gene-set option: n genes drawn at random, without replacement, from the genes
# present in filtered_ccle_map_df, stored in alphabetical order.

# Fixed seed so the exported gene set is the same on every run; change the
# value to draw a different set.
RANDOM_GENE_SEED = 42

# Sort the candidates so the draw depends only on which genes are present,
# not on their row order in the mutation table.
candidate_genes = sorted(filtered_ccle_map_df['Hugo_Symbol'].unique())

# A private Random instance leaves the global `random` state untouched.
# sample() raises ValueError if n exceeds the number of candidate genes.
filtered_gene_list = sorted(rd.Random(RANDOM_GENE_SEED).sample(candidate_genes, n))

gene2ind_file = "../data/gene2ind_random_" + str(n) + ".txt"
cell2mut_file = "../data/cell2mutation_random_" + str(n) + ".txt"

In [None]:
# Gene index file: one "<row number><TAB><gene>" line per selected gene, no header.
gene_index = pd.DataFrame({'G': filtered_gene_list})
gene_index.to_csv(gene2ind_file, sep='\t', header=False, index=True)

# Genotype file: the 0/1 cell-by-gene matrix, written with to_csv's default
# comma separator and without header or index.
cell_gene_df = create_genotype_data(filtered_gene_list, cell_list, cell_gene_map)
cell_gene_df.to_csv(cell2mut_file, header=False, index=False)

In [None]:
# Display the genotype matrix produced by the export cell.
cell_gene_df

In [None]:
# Generate DDRAM data: rebuild {cell: [mutated gene, ...]} restricted to the
# DDRAM gene list. This overwrites mut_list, filtered_ccle_map_df and
# cell_gene_map from the earlier cells.

mut_list = [
    'Missense_Mutation', 'Nonsense_Mutation', 'Frame_Shift_Del',
    'Frame_Shift_Ins', 'Splice_Site', 'Splice_Region', 'In_Frame_Del',
    'In_Frame_Ins', 'Nonstop_Mutation',
]

# Keep only rows for mapped cell lines, accepted variant classes and DDRAM genes.
ddram_rows = (
    ccle_map['Broad_ID'].isin(list(cell_id_map.values()))
    & ccle_map['Variant_Classification'].isin(mut_list)
    & ccle_map['Hugo_Symbol'].isin(ddram_gene_list)
)
filtered_ccle_map_df = ccle_map[ddram_rows]

# A gene appears once per matching row, so a cell's list may contain repeats.
# cell_id_map[cell] raises KeyError for a cell that has no annotation entry.
cell_gene_map = {
    cell: list(
        filtered_ccle_map_df.loc[
            filtered_ccle_map_df['Broad_ID'] == cell_id_map[cell], 'Hugo_Symbol'
        ]
    )
    for cell in cell_list
}

In [None]:
# Export the DDRAM gene index and genotype matrix, using the full DDRAM gene
# list as the column set.
filtered_gene_list = ddram_gene_list

gene2ind_file, cell2mut_file = (
    "../data/gene2ind_ddram.txt",
    "../data/cell2mutation_ddram.txt",
)

# Gene index file: one "<row number><TAB><gene>" line per gene, no header.
gene_index = pd.DataFrame({'G': filtered_gene_list})
gene_index.to_csv(gene2ind_file, sep='\t', header=False, index=True)

# Genotype file: 0/1 matrix, default comma separator, no header or index.
cell_gene_df = create_genotype_data(filtered_gene_list, cell_list, cell_gene_map)
cell_gene_df.to_csv(cell2mut_file, header=False, index=False)

# Last expression of the cell: shows the matrix in the notebook output.
cell_gene_df

In [None]:
# Load the OncoKB table and the compound-name table; both are tab-separated
# and take their column names from the file's first line.
onco_kb_data = pd.read_csv("../data/ONCO_KB/oncokb.tsv", sep='\t')
drug_names_df = pd.read_csv("../data/compound_names.txt", sep='\t')

In [None]:
# Distinct values of the OncoKB 'Drugs' column.
onco_kb_drugs = set(onco_kb_data['Drugs'])

# Lookup from the 'Name' column to the 'SMILE' column of the compound table;
# if a name occurs more than once, its last row wins.
drug_name_map = {
    name: smile
    for name, smile in zip(drug_names_df['Name'], drug_names_df['SMILE'])
}

In [None]:
# SMILE values of the OncoKB drugs whose name matches a compound name exactly;
# drugs without a match are skipped.
onco_kb_drugs_smile = [
    drug_name_map[drug] for drug in onco_kb_drugs if drug in drug_name_map
]

# Response rows whose drug column 'D' is one of those SMILE values. The
# original row labels of all_df are kept.
onco_kb_all_df = all_df[all_df['D'].isin(onco_kb_drugs_smile)]

In [None]:
# 80/20 train/test split of the OncoKB response rows.

# Fixed seed so the exported split is identical on every run; change the value
# to draw a different split.
SPLIT_SEED = 42

train_df = onco_kb_all_df.sample(frac=0.8, random_state=SPLIT_SEED)

# Test set: every row not drawn for training, removed by index label. The
# labels come from all_df's default integer index, so each identifies one row.
test_df = onco_kb_all_df.drop(train_df.index)

In [None]:
# Write the full OncoKB subset and its test/train parts as tab-separated files
# without header or index, in this order.
oncokb_outputs = (
    ("../data/drugcell_oncokb_all.txt", onco_kb_all_df),
    ("../data/drugcell_oncokb_test.txt", test_df),
    ("../data/drugcell_oncokb_train.txt", train_df),
)
for out_path, frame in oncokb_outputs:
    frame.to_csv(out_path, sep='\t', header=False, index=False)

In [None]:
# Rows of the drug index whose 'D' value is one of the OncoKB SMILE values.
onco_kb_drug_df = drug_index[drug_index['D'].isin(onco_kb_drugs_smile)]

In [None]:
# Write the filtered drug index as a tab-separated file without header or
# DataFrame index; the existing 'I' column values are written unchanged.
onco_kb_drug_df.to_csv("../data/drug2ind_oncokb.txt", sep='\t', header=False, index=False)