In [1]:
import pandas as pd
import numpy as np

from collections import Counter

In [14]:
# Load every input table used by this notebook. All paths are relative to the
# notebook's working directory.

# NeST gene file: read with the default comma separator into a single column 'G'.
nest_gene_list = pd.read_csv("../data/all_genes/gene_list.txt", names=['G'], header=None)

# Header-less, tab-separated two-column lookup tables: 'I' is the first column,
# and the second is named 'G' (gene), 'C' (cell line) or 'D' (drug).
gene_index = pd.read_csv("../data/gene2ind.txt", names=['I', 'G'], header=None, sep="\t")
cell_index = pd.read_csv("../data/cell2ind.txt", names=['I', 'C'], header=None, sep="\t")
orig_cell_index = pd.read_csv("../../DrugCell/data/cell2ind.txt", names=['I', 'C'], header=None, sep="\t")
drug_index = pd.read_csv("../data/drug2ind.txt", names=['I', 'D'], header=None, sep="\t")

# Tab-separated tables whose first line supplies the column names.
ccle_map = pd.read_csv("../data/all_genes/ccle_maf.txt", sep="\t")
cell_annot = pd.read_csv("../data/all_genes/cell_line_annotations.txt", sep="\t")

# Header-less, tab-separated (cell line, drug, AUC) triples.
all_df = pd.read_csv("../data/drugcell_all.txt", names=['C', 'D', 'AUC'], header=None, sep="\t")

In [72]:
#Get all genes from NeST file

# gene_map: gene name -> number of times it occurs across all rows.
# Each value of column 'G' is split on tabs into individual gene names.
gene_map = {}
for gene_line in nest_gene_list['G']:
    for gene_name in gene_line.split("\t"):
        gene_map[gene_name] = gene_map.get(gene_name, 0) + 1

# Distinct gene names, in order of first appearance.
nest_gl = list(gene_map)

In [99]:
#Remove cell lines from DrugCell data not present in CCLE
#
# Keeps the annotation rows whose CCLE_ID occurs in cell_index['C'] and builds:
#   cell_id_df  - DataFrame with columns 'C' (CCLE_ID) and 'ID' (depMapID),
#                 re-indexed from 0
#   cell_id_map - dict mapping depMapID -> CCLE_ID (for a repeated depMapID the
#                 last matching row wins)
#
# The frame is built with one vectorised filter instead of DataFrame.append in
# a loop: append was removed in pandas 2.0 and copies the whole frame for
# every row it adds.

cell_list = list(cell_index['C'])

matched_annot = cell_annot[cell_annot['CCLE_ID'].isin(cell_list)]

cell_id_df = (
    matched_annot[['CCLE_ID', 'depMapID']]
    .rename(columns={'CCLE_ID': 'C', 'depMapID': 'ID'})
    .reset_index(drop=True)
)
cell_id_map = dict(zip(cell_id_df['ID'], cell_id_df['C']))

cell_id_df

Unnamed: 0,C,ID
0,DMS53_LUNG,ACH-000698
1,SW1116_LARGE_INTESTINE,ACH-000489
2,NCIH1694_LUNG,ACH-000431
3,P3HR1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000707
4,HUT78_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000509
...,...,...
1209,SARC9371_BONE,ACH-002396
1210,MCC26_SKIN,ACH-001552
1211,NBSUSSR_AUTONOMIC_GANGLIA,ACH-002340
1212,GEO_LARGE_INTESTINE,ACH-002394


In [134]:
#Get all valid cell-gene pairs in form of dict {cell : genes[]}
#
# A MAF row is kept when its Broad_ID is a key of cell_id_map, its Hugo_Symbol
# is one of the NeST genes (nest_gl) and its Variant_Classification is listed
# in mut_list. Results:
#   gene_list     - gene of every kept row, in file order (duplicates kept)
#   cell_gene_map - cell name -> list of genes of its kept rows (duplicates kept)

mut_list = ['Missense_Mutation', 'Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins',
            'Splice_Site', 'Splice_Region', 'In_Frame_Del', 'In_Frame_Ins', 'Nonstop_Mutation']

cell_gene_map = dict()
gene_list = []

# Set copies give constant-time membership tests; scanning the nest_gl list
# for every MAF row made the loop cost rows x genes.
nest_gene_set = set(nest_gl)
mut_set = set(mut_list)

# Iterating the three columns directly avoids building a Series per row,
# which is what iterrows() does.
for gene, cell_id, variant in zip(ccle_map['Hugo_Symbol'],
                                  ccle_map['Broad_ID'],
                                  ccle_map['Variant_Classification']):
    if cell_id in cell_id_map and gene in nest_gene_set and variant in mut_set:
        gene_list.append(gene)
        cell_gene_map.setdefault(cell_id_map[cell_id], []).append(gene)


In [135]:
#Getting top 30% genes

# (gene, count) pairs for the 3600 most frequent entries of gene_list
# (top 30% of 11992 common genes).
gene_counter = Counter(gene_list).most_common(3600)

# From here on gene_list holds the selected gene names in sorted order.
gene_list = [gene_name for gene_name, _ in gene_counter]
gene_list.sort()

# Cell lines that have at least one entry in cell_gene_map, sorted by name.
cell_list = list(cell_gene_map)
cell_list.sort()

In [136]:
# Persist the gene list as tab-separated "<row index>\t<gene>" lines with no header row.
nest_gene_index = pd.DataFrame({'G': gene_list})
nest_gene_index.to_csv("../data/gene2ind.txt", index=True, header=False, sep='\t')

In [113]:
# Persist the cell-line list as tab-separated "<row index>\t<cell>" lines with no header row.
# NOTE: this rebinds cell_index and overwrites the cell2ind.txt file it was loaded from.
cell_index = pd.DataFrame({'C': cell_list})
cell_index.to_csv("../data/cell2ind.txt", index=True, header=False, sep='\t')

In [137]:
#Converting cell_gene_map into boolean form
#
# cell_gene_df has one row per entry of cell_list (same order, index 0..n-1)
# and one column per entry of gene_list; a value is 1 when the gene occurs in
# cell_gene_map[cell] and 0 otherwise.
#
# All rows are collected first and the frame is built once: assigning
# cell_gene_df.loc[i] row by row re-allocates the frame on every insert.
# The per-cell genes are put in a set so each membership test is constant time,
# and they are kept in a local name so the mut_list of variant classifications
# is no longer overwritten.

ko_rows = []
for cell in cell_list:
    mutated_genes = set(cell_gene_map[cell])
    ko_rows.append([1 if gene in mutated_genes else 0 for gene in gene_list])

cell_gene_df = pd.DataFrame(ko_rows, columns=gene_list)

cell_gene_df

Unnamed: 0,A1CF,A2M,A2ML1,AARS,AARS2,AASDH,AASS,ABCA1,ABCA13,ABCA2,...,ZNF831,ZNF835,ZNF845,ZNF865,ZNFX1,ZRANB3,ZSWIM6,ZXDC,ZZEF1,ZZZ3
0,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,0,1,1,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1211,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
# Write the 0/1 matrix as comma-separated values, without column names or row index.
cell_gene_df.to_csv("../data/cell2mutation.txt", index=False, header=False)

In [15]:
#Removing cell lines from train data not in the newly filtered cell-line list
#
# Keeps only the rows of all_df whose cell line 'C' is in cell_list; the
# surviving rows keep their original index labels.
# NOTE: cell_list is defined by earlier cells, so they must have been run in
# this kernel before this cell.
#
# One boolean mask replaces dropping rows one at a time with inplace=True
# while iterating over the same frame, which re-built the frame for every
# removed row.

all_df = all_df[all_df['C'].isin(cell_list)].copy()

all_df

NameError: name 'cell_list' is not defined

In [141]:
# Save the filtered (cell, drug, AUC) table as header-less TSV.
# NOTE: this overwrites the drugcell_all.txt file that all_df was loaded from.
all_df.to_csv("../data/drugcell_all.txt", index=False, header=False, sep='\t')

In [17]:
#Removing 100 random drugs to create train/test drug lists
#
# The draw uses a dedicated, seeded RandomState so that re-running the
# notebook holds out the same drugs; the unseeded global generator produced a
# different train/test split on every run.
# choice(..., replace=False) raises ValueError when drug_index has fewer than
# TEST_DRUG_COUNT entries.

TEST_DRUG_COUNT = 100
TEST_DRUG_SEED = 0

drugs = np.array(drug_index['D'])
test_drug_rng = np.random.RandomState(TEST_DRUG_SEED)
test_drugs = drugs[test_drug_rng.choice(len(drugs), size=TEST_DRUG_COUNT, replace=False)]

In [21]:
#Filter out test drugs from drugcell_all file
#
# Re-reads the saved (cell, drug, AUC) table and splits its rows into
#   test_list  - rows whose drug 'D' is one of test_drugs
#   train_list - all remaining rows
# Both are lists of {'C', 'D', 'AUC'} dicts in file order.
#
# A single vectorised isin() mask replaces the per-row iterrows() loop, which
# ran a NumPy membership test for every row.

all_df = pd.read_csv("../data/drugcell_all.txt", sep='\t', header=None, names=['C', 'D', 'AUC'])

is_test_row = all_df['D'].isin(test_drugs)

test_list = all_df[is_test_row].to_dict('records')
train_list = all_df[~is_test_row].to_dict('records')


In [22]:
# Turn the two record lists into DataFrames and display the test frame.
train_df, test_df = (pd.DataFrame(records) for records in (train_list, test_list))
test_df

Unnamed: 0,C,D,AUC
0,697_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,C1OC2=C(O1)C(=C3C(=C2)C4=C[C@@H]([C@H]([C@H]([...,0.266457
1,RPMI6666_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,C[C@H]1COCCN1C2=NC(=NC3=C2C=CC(=N3)C4=CC(=C(C=...,0.561190
2,CAL148_BREAST,CN(C)CC[C@H](CSC1=CC=CC=C1)NC2=C(C=C(C=C2)S(=O...,0.860451
3,CHSA0011_BONE,C1=CC=C2C(=C1)C=CC3=C2C=CC(=C3)C4=CC(=NN4C5=CC...,0.965073
4,CMK_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,CC(C)N1CCC(CC1)NC2=NC(=NC3=CC(=C(C=C32)OC)OCCC...,0.945657
...,...,...,...
72501,HPAFII_PANCREAS,C[C@@H]1CC[C@H]2C[C@@H](C(=CC=C/C=C/[C@H](C[C@...,0.803420
72502,SNU878_LIVER,CC(C)C[C@H]([C@@H](C(=O)NO)O)C(=O)N[C@@H](C1=C...,0.716823
72503,CAL72_BONE,CC1=CC(=CC=C1)NC2=NC(=CS2)C3=CC=NC=C3,0.956428
72504,ISTMEL1_SKIN,CC1=CN2C(=O)C=C(N=C2C(=C1)[C@@H](C)NC3=CC=CC=C...,0.643823


In [23]:
# Save both splits as header-less, index-less TSV files.
for split_df, split_path in (
    (train_df, "../data/drugcell_train_compare.txt"),
    (test_df, "../data/drugcell_test_compare.txt"),
):
    split_df.to_csv(split_path, sep='\t', header=False, index=False)

In [None]:
# Reload the gene index and the cell-by-gene mutation matrix from disk.
gene_index = pd.read_csv("../data/gene2ind.txt", sep="\t", header=None, names=(['I', 'G']))
# header must be None, not False: read_csv rejects a boolean header argument.
# The file is read as comma-separated with no header row, so the columns are
# numbered 0..n-1.
cell_gene_df = pd.read_csv("../data/cell2mutation.txt", header=None)