In [None]:
# import numpy and pandas
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math

%config Completer.use_jedi=False
import os


In [None]:
# Cell line feature generation

In [None]:
# Read the cell line expression table.
# The file is 'RMA normalised expression data for cell-lines'
# from the GDSC data portal (https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Home.html)
# The first column of the file becomes the index; the first remaining column is
# then discarded and every other column is kept.
expression_df = pd.read_csv(
    'RawFile/Cell_line_RMA_proc_basalExp.txt',
    sep='\t',
    index_col=0,
).iloc[:, 1:]

In [None]:
# Read the cell line annotation sheet.
# The file is 'Annotated list of cell-lines'
# from the GDSC data portal (https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Home.html)
# Rows 0-2 and the last row are skipped, and only the first two columns are kept
# (presumably header/footer rows of the spreadsheet - confirm against the file).
cellline_information = (
    pd.read_excel('RawFile/TableS1E.xlsx', index_col=0)
    .iloc[3:-1, 0:2]
)
cellline_information.columns = ['Cell line name', 'COSMIC identifier']

In [None]:
# Keep only the expression columns whose COSMIC identifier appears in the
# cell line annotation table.
cellline_list = list(map(str, cellline_information['COSMIC identifier']))
# The first five characters of every column label are dropped before matching
# (presumably a fixed prefix such as 'DATA.' - confirm against the raw file).
cosmic_list = [column[5:] for column in expression_df.columns]
known_cosmic_ids = set(cellline_list)
isin_list = [cosmic in known_cosmic_ids for cosmic in cosmic_list]
expression_df = expression_df.loc[:, isin_list]

In [None]:
# Drop rows whose index label is missing (its string form is 'nan'),
# i.e. expression rows that are not attached to a gene symbol.
expression_list = list(map(str, expression_df.index))
isin_list = [label != 'nan' for label in expression_list]
expression_df = expression_df.loc[isin_list, :]

In [None]:
# Relabel the expression columns: COSMIC identifier -> cell line name.
cellline_name_dic = {
    str(row['COSMIC identifier']): row['Cell line name']
    for _, row in cellline_information.iterrows()
}
# Same five-character trim as used when the columns were filtered.
cosmic_list = [column[5:] for column in expression_df.columns]
cellline_new_col = [cellline_name_dic[cosmic_id] for cosmic_id in cosmic_list]
expression_df.columns = cellline_new_col

In [None]:
# Convert expression values to z-scores.
# apply() runs along axis 0, so zscore is evaluated once per column.
from scipy.stats import zscore
expression_df = expression_df.apply(zscore, axis=0)

In [None]:
# Label the row index so it is written as the 'Gene_Symbol' column header.
expression_df.index = expression_df.index.set_names('Gene_Symbol')

In [None]:
# Persist the processed expression matrix.
expression_df.to_csv(path_or_buf='ProcessedFile/expression.csv')

In [None]:
# Group gene expression values by gene set

In [None]:
#Loading Gene Set
#Gene Set File (gmt) is 'KEGG subset of CP' from MSigDB (http://www.gsea-msigdb.org/gsea/msigdb/collections.jsp)
#Each row is tab separated; the first field is used as the gene set name and
#everything from the third field onwards as its member genes.
GeneSetFile='RawFile/c2.cp.kegg.v6.2.symbols.gmt'
with open(GeneSetFile) as f:
    #Blank rows are skipped so that GeneSet[0] below cannot raise IndexError
    GeneSet_List=[row for row in csv.reader(f, delimiter='\t') if row]

GeneSet_Dic={}
for GeneSet in GeneSet_List:
    GeneSet_Dic[GeneSet[0]]=GeneSet[2:]

#Delete genes that are not valid
#A gene is valid when it is present in the expression index and has a non-missing
#value for the reference cell line 'ES3' (just the name of one valid cell line).
#Membership is tested explicitly because selecting a list that contains missing
#labels raises KeyError in pandas >= 1.0 instead of returning NaN.
valid_genes=set(expression_df['ES3'].dropna().index)
GeneSet_Dic_withoutNA={}
for GeneSet in GeneSet_Dic:
    GeneSet_Dic_withoutNA[GeneSet]=np.array(
        [gene for gene in GeneSet_Dic[GeneSet] if gene in valid_genes],
        dtype=object,
    )


In [None]:
# Swap rows and columns for the per-cell-line extraction that follows.
# Note: re-running this cell on its own flips the orientation again.
expression_df = expression_df.T

In [None]:
def CelllineFeatureExtract(ExpressionMatrix, GeneSetDic, CellLine):
    """Slice one row of an expression matrix into one frame per gene set.

    Parameters
    ----------
    ExpressionMatrix : pandas.DataFrame
        Indexed by row label, with genes as column labels.
    GeneSetDic : dict
        Maps a gene set name to an iterable of gene (column) labels.
    CellLine : hashable
        Row label to extract.

    Returns
    -------
    list of pandas.DataFrame
        One frame per gene set, in the iteration order of ``GeneSetDic``.
        Each frame holds the rows labelled ``CellLine`` and that gene set's columns.
    """
    return [
        ExpressionMatrix[list(genes)].loc[[CellLine]]
        for genes in GeneSetDic.values()
    ]

In [None]:
#Build one DataFrame per gene set by stacking the single-cell-line slices
#returned by CelllineFeatureExtract, in the order of expression_df.index.
#The slices are collected first and concatenated once per gene set:
#DataFrame.append was removed in pandas 2.0, and growing a frame row by row
#copies it on every iteration.
n_genesets=len(GeneSet_Dic_withoutNA)
rows_per_geneset=[[] for _ in range(n_genesets)]
for cellline in expression_df.index:
    x=CelllineFeatureExtract(expression_df,GeneSet_Dic_withoutNA,cellline)
    for j in range(n_genesets):
        rows_per_geneset[j].append(x[j])
#pd.concat rejects an empty list, so fall back to an empty frame when there are no rows
cellline_input=[pd.concat(rows) if rows else pd.DataFrame() for rows in rows_per_geneset]

In [None]:
#Write one CSV per gene set; the file name is the position of the frame in cellline_input.
cellline_output_dir='ProcessedFile/CellLine/'
#to_csv does not create missing directories, so make sure the target exists first
os.makedirs(cellline_output_dir,exist_ok=True)
for idx,df in enumerate(cellline_input):
    df.to_csv(cellline_output_dir+str(idx)+'.csv')

In [None]:
# Swap rows and columns back after the per-cell-line extraction.
# Note: re-running this cell on its own flips the orientation again.
expression_df = expression_df.T

In [None]:
# Load the drug information table.
# The file ships with the source code; the SMILES string of each drug was collected
# manually from DrugBank and PubChem, and the Morgan fingerprint was computed with RDKit.
# The table, including the Morgan fingerprint, can be used directly as the input feature.
drug_df = pd.read_csv(filepath_or_buffer='ProcessedFile/Drug.csv', index_col=0)

In [None]:
# The row labels of the drug table; used later to test whether a drug is known.
drug_list = drug_df.index

In [None]:
#GDSC response data preprocessing

In [None]:
# Load the GDSC response data.
# The file is 'GDSC1-dataset'
# from the GDSC downloads page (https://www.cancerrxgene.org/downloads/bulk_download)
# Only the drug name, cell line name and LN_IC50 columns are kept; reset_index()
# then moves the original row index into a leading column, renamed 'Origin_idx' below.
GDSC_response = (
    pd.read_excel('RawFile/v17.3_fitted_dose_response.xlsx')
    .loc[:, ['DRUG_NAME', 'CELL_LINE_NAME', 'LN_IC50']]
    .reset_index()
)
GDSC_response.columns = ['Origin_idx', 'Drug name', 'Cell line name', 'IC50']

In [None]:
# Keep only the cell line-drug pairs for which both the cell line (a column of
# expression_df) and the drug (a label in drug_list) are known.
cellline_in_GDSC = GDSC_response['Cell line name']
drug_in_GDSC = GDSC_response['Drug name']
is_valid_cellline = [name in expression_df.columns for name in cellline_in_GDSC]
is_valid_drug = [name in drug_list for name in drug_in_GDSC]
is_valid_all = [
    cellline_ok and drug_ok
    for cellline_ok, drug_ok in zip(is_valid_cellline, is_valid_drug)
]
GDSC_response = GDSC_response.loc[is_valid_all]


In [None]:
# Write the filtered response table without the DataFrame index.
GDSC_response.to_csv(path_or_buf='GDSC_response.csv', index=False)