In [2]:
import pandas as pd

from collections import Counter

In [3]:
# Gene list: read with the default separator into a single column 'G'
# (a later cell splits each value on tabs).
nest_gene_list = pd.read_csv(
    "../data/nest/gene_list.txt",
    header=None,
    names=['G'],
)

# Tab-separated, headerless index/gene table.
gene_index = pd.read_csv(
    "../data/gene2ind.txt",
    sep="\t",
    header=None,
    names=['I', 'G'],
)

# Tab-separated tables that carry their own header row.
ccle_map = pd.read_csv("../data/nest/ccle_maf.txt", sep='\t')

cell_annot = pd.read_csv("../data/nest/cell_line_annotations.txt", sep="\t")

# NOTE(review): "../data/nest/cell2ind.txt" is also written by a later cell of
# this notebook (cell_index.to_csv), so a fresh run needs the file to exist
# already -- confirm which file is meant to seed the cell list.
cell_index = pd.read_csv(
    "../data/nest/cell2ind.txt",
    sep="\t",
    header=None,
    names=['I', 'C'],
)

# NOTE(review): orig_cell_index is not referenced by any cell visible here.
orig_cell_index = pd.read_csv(
    "../data/cell2ind.txt",
    sep="\t",
    header=None,
    names=['I', 'C'],
)

In [139]:
# Load the three tab-separated, headerless response tables with columns
# C, D and AUC. The files are read in the order train, test, val.
# NOTE(review): train_df is read from "drugcell_all.txt" -- confirm that this
# file is the intended training split.
train_df, test_df, val_df = (
    pd.read_csv(split_path, sep='\t', header=None, names=['C', 'D', 'AUC'])
    for split_path in (
        "../data/drugcell_all.txt",
        "../data/drugcell_test.txt",
        "../data/drugcell_val.txt",
    )
)

In [72]:
# Count how often each gene symbol occurs in the gene list. Every value of
# column 'G' is a tab-separated string of symbols, so each one is split and
# all resulting symbols are tallied with Counter instead of a hand-written
# "if key in dict" loop over iterrows().
# The Counter is converted back to a plain dict so lookups of unknown genes
# still raise KeyError, as they did before.
gene_map = dict(
    Counter(
        gene
        for gene_row in nest_gene_list['G']
        for gene in gene_row.split("\t")
    )
)

# Distinct gene symbols in order of first appearance.
nest_gl = list(gene_map)

In [84]:
#print(ccle_map.head())
#print(cell_annot.head())

In [99]:
# Build the CCLE name <-> DepMap ID lookup for the cell lines in cell_index.
#
# The previous implementation grew cell_id_df with DataFrame.append inside an
# iterrows() loop. That copies the frame on every row and DataFrame.append no
# longer exists in pandas >= 2.0. Filtering the annotation table once with
# isin() yields the same rows in the same order.
cell_list = list(cell_index['C'])

# Columns: 'C' holds the CCLE_ID, 'ID' holds the depMapID; the index is
# renumbered from 0 to match the frame that was previously built row by row.
cell_id_df = (
    cell_annot.loc[cell_annot['CCLE_ID'].isin(cell_list), ['CCLE_ID', 'depMapID']]
    .rename(columns={'CCLE_ID': 'C', 'depMapID': 'ID'})
    .reset_index(drop=True)
)

# depMapID -> CCLE_ID. If a depMapID occurs more than once, the last matching
# annotation row wins, exactly as with the original per-row assignment.
cell_id_map = dict(zip(cell_id_df['ID'], cell_id_df['C']))

cell_id_df

Unnamed: 0,C,ID
0,DMS53_LUNG,ACH-000698
1,SW1116_LARGE_INTESTINE,ACH-000489
2,NCIH1694_LUNG,ACH-000431
3,P3HR1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000707
4,HUT78_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000509
...,...,...
1209,SARC9371_BONE,ACH-002396
1210,MCC26_SKIN,ACH-001552
1211,NBSUSSR_AUTONOMIC_GANGLIA,ACH-002340
1212,GEO_LARGE_INTESTINE,ACH-002394


In [134]:
# Variant classifications that are kept when scanning the mutation table.
mut_list = ['Missense_Mutation', 'Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins',
            'Splice_Site', 'Splice_Region', 'In_Frame_Del', 'In_Frame_Ins', 'Nonstop_Mutation']

# Set copies make each membership test O(1); testing against the lists
# directly scanned them once per row of ccle_map.
nest_gene_set = set(nest_gl)
mut_set = set(mut_list)

# cell_gene_map: cell-line name -> genes with a kept mutation (row order,
# duplicates preserved).
# gene_list: every kept gene occurrence in row order (duplicates preserved, so
# it can be frequency-counted later).
cell_gene_map = dict()
gene_list = []

# Iterate over the three needed columns directly instead of iterrows(), which
# builds a full Series for every row.
for gene, cell_id, variant in zip(ccle_map['Hugo_Symbol'],
                                  ccle_map['Broad_ID'],
                                  ccle_map['Variant_Classification']):
    if cell_id in cell_id_map and gene in nest_gene_set and variant in mut_set:
        gene_list.append(gene)
        # Broad_ID is translated to the cell-line name through cell_id_map.
        cell_gene_map.setdefault(cell_id_map[cell_id], []).append(gene)

In [135]:
# Keep the 3600 most frequent entries of gene_list as (gene, count) pairs
# (original note: top 30% of 11992 common genes).
gene_counter = Counter(gene_list).most_common(3600)

# From here on gene_list holds only the selected gene names, sorted.
gene_list = [gene_name for gene_name, _ in gene_counter]
gene_list.sort()

# Sorted names of all cell lines that have an entry in cell_gene_map.
cell_list = list(cell_gene_map)
cell_list.sort()

In [136]:
# One-column frame of the selected genes; the default 0..n-1 row index is
# written as the first column of the tab-separated file, with no header row.
nest_gene_index = pd.DataFrame({'G': gene_list})
nest_gene_index.to_csv(
    "../data/nest/gene2ind.txt",
    sep='\t',
    index=True,
    header=False,
)

In [113]:
# Rebinds cell_index to a one-column frame built from cell_list and writes it
# tab-separated, with the 0..n-1 row index as the first column and no header.
cell_index = pd.DataFrame({'C': cell_list})
cell_index.to_csv(
    "../data/nest/cell2ind.txt",
    sep='\t',
    index=True,
    header=False,
)

In [137]:
# Build the cell-by-gene 0/1 matrix: one row per entry of cell_list (same
# order), one column per entry of gene_list; a cell is 1 when the gene appears
# in cell_gene_map for that cell line and 0 otherwise.
#
# Rows are collected in a plain list and the DataFrame is created once;
# assigning cell_gene_df.loc[i] row by row reallocates the frame on every
# insert. The per-cell genes are held in a set under a new local name so the
# global mut_list (the variant-classification filter) is no longer overwritten.
ko_rows = []
for cell in cell_list:
    mutated_genes = set(cell_gene_map[cell])
    ko_rows.append([1 if gene in mutated_genes else 0 for gene in gene_list])

cell_gene_df = pd.DataFrame(ko_rows, columns=gene_list)

cell_gene_df

Unnamed: 0,A1CF,A2M,A2ML1,AARS,AARS2,AASDH,AASS,ABCA1,ABCA13,ABCA2,...,ZNF831,ZNF835,ZNF845,ZNF865,ZNFX1,ZRANB3,ZSWIM6,ZXDC,ZZEF1,ZZZ3
0,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,0,1,1,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1211,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
# Write the 0/1 matrix with the default (comma) separator, omitting both the
# header row of gene names and the row index.
cell_gene_df.to_csv(
    "../data/nest/cell2mutation.txt",
    index=False,
    header=False,
)

In [140]:
# Keep only the response rows whose cell line ('C') is in cell_list.
#
# The previous version iterated each frame with iterrows() and dropped rows
# one at a time with inplace=True, testing membership against a Python list.
# A boolean isin() mask selects the same rows in a single pass and leaves the
# surviving rows' original index labels untouched, as drop() did.
known_cells = set(cell_list)

train_df = train_df[train_df['C'].isin(known_cells)].copy()
test_df = test_df[test_df['C'].isin(known_cells)].copy()
val_df = val_df[val_df['C'].isin(known_cells)].copy()

train_df

Unnamed: 0,C,D,AUC
0,NCIH196_LUNG,C1=CC(=CNNC(=O)COC2=C(C=C(C=C2Cl)Cl)Cl)N=C1,0.909415
1,T24_URINARY_TRACT,CC1=C(C=C(C=C1)NC(=O)C2=CC=C(C=C2)CN3CCN(CC3)C...,0.930472
2,WIL2NS_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,C1/C(=C\C2=CC=CS2)/C(=O)/C(=C/C3=CC=CS3)/C1,0.790564
3,COV434_OVARY,C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC...,0.408251
4,MHHCALL4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,CC(C)(C)OC(=O)NC1=CC=C(C=C1)C2=CC(=NO2)C(=O)NC...,0.692236
...,...,...,...
509289,JK1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,CCC1=CC=CC=C1NC(=O)CSC(=O)NNC(=O)[C@@H](CC2=CN...,0.829568
509290,A427_LUNG,CCN1C=C(C(=N1)C2=CC=C(C=C2)NC(=O)N(C)C)C3=C4C=...,0.860422
509291,ETK1_BILIARY_TRACT,CC1=C(C=C(C=C1)C(=O)NC2=CC(=CC(=C2)N3C=C(N=C3)...,0.951400
509292,NCIH2172_LUNG,C1=CC=C(C=C1)S(=O)(=O)N(CC(F)(F)F)C2=CC=C(C=C2...,0.953532


In [141]:
# Write the filtered train and test tables as tab-separated files with no
# header row and no index column.
for split_df, out_path in (
    (train_df, "../data/nest/drugcell_train.txt"),
    (test_df, "../data/nest/drugcell_test.txt"),
):
    split_df.to_csv(out_path, sep='\t', header=False, index=False)

In [129]:
# Write the filtered validation table as a tab-separated file with no header
# row and no index column.
val_df.to_csv(
    "../data/nest/drugcell_val.txt",
    sep='\t',
    index=False,
    header=False,
)