##### This jupyter notebooks is analyze which genes are associated with drug sensitivity using a Lasso regression model
##### Date: Feb 22, 2023


In [3]:
#import libraries
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import linear_model 

In [5]:
#Load data
# Gene expression data from BeatAML wave 1/2
GE = pd.read_csv("../../../Data_resource/RPKM.csv",index_col = "Unnamed: 0")
GE.index = GE['Gene']

# Drug response data from BeatAML wave 1/2
Drug_data = pd.read_csv("../../../Data_resource/Drug_Response.csv",index_col = "Unnamed: 0")

In [3]:
# gene id mapping
gene_map = GE.loc[:,['Gene','Symbol']]
dic_gene_map = {}
for i in range(0,gene_map.shape[0]):
    dic_gene_map[gene_map.iloc[i,0]] = gene_map.iloc[i,1]


# full gene list and sample list
gene_list = (list(GE.index))
sample_list = list(set(GE.columns) - set(['Gene','Symbol']))

In [4]:
# Formated input gene expression matrix
input_data = GE.loc[list(gene_list), sample_list]

In [5]:
def Filter_low_Expr(arr):
    '''
    Usage: This function is uded to filter out genes which has expression value logRPKM smaller than 0 in more than 50% of the samples.
    Parameter: arr, expression of one gene across different samples
    '''
    total_len = len(arr)
    count = 0
    for i in arr:
        if i > 0:
            count = count + 1
    if count > 0.5*total_len: #If 
        return(True)
    else:
        return(False)

In [6]:
# Filter genes and scale the data
scaled_df = pd.DataFrame()
array_list = list()
length = input_data.shape[0]
gene_list_remain = []
for i in range(0,length):
    gene = gene_list[i]
    arr = input_data.iloc[i,:]
    if Filter_low_Expr(arr):
        array_mean = np.mean(arr)
        array_std = np.std(arr)
        array_scale = (arr - array_mean)/array_std
        array_list.append(array_scale.values)
        gene_list_remain.append(gene)
    
scaled_df = pd.DataFrame(array_list)
scaled_df.columns = sample_list
scaled_df.index = gene_list_remain
scaled_df_t = scaled_df.transpose()

In [7]:
# Run Lasso regression model
result_list = []
Drug_list = list(set(Drug_data['inhibitor']))

for drug in Drug_list:
    print(drug)
    new_coef = []
    Drug_sele_df = Drug_data.loc[Drug_data['inhibitor'] == drug]
    Drug_sele_df.index = Drug_sele_df['lab_id']
    Drug_sele_df =Drug_sele_df['auc']
    #Drug_sele_df

    df_sele = pd.concat([scaled_df_t, Drug_sele_df],axis = 1)
    df_sele = df_sele.dropna()

    Y = df_sele['auc'].values
    
    Y = stats.zscore(Y)
    X = df_sele.loc[:,gene_list_remain]
    #Fit the model
    linreg = linear_model.Lasso(alpha=0.1) 
    linreg.fit(X,Y)
    y_pred = linreg.predict(X)

    coef = list(linreg.coef_)

    new_coef.extend(coef)
    rss = sum((y_pred-Y)**2)
    new_coef.extend([rss])
    new_coef.extend([linreg.intercept_])
    result_list.append(new_coef)


CYT387
PP242
Masitinib (AB-1010)
Erlotinib
Motesanib (AMG-706)
Sorafenib
Neratinib (HKI-272)
BI-2536
BMS-345541
Linifanib (ABT-869)
Pazopanib (GW786034)
Lapatinib
PHA-665752
GSK-1904529A
Tozasertib (VX-680)
Regorafenib (BAY 73-4506)
Selinexor
Barasertib (AZD1152-HQPA)
TG100-115
Foretinib (XL880)
Cabozantinib
Crenolanib
S31-201
GSK690693
Idelalisib
NVP-TAE684
SNS-032 (BMS-387032)
Entrectinib
JNJ-38877605
Axitinib (AG-013736)
Vatalanib (PTK787)
CI-1040 (PD184352)
Palbociclib
A-674563
PRT062607
Midostaurin
STO609
Crizotinib (PF-2341066)
RAF265 (CHIR-265)
Roscovitine (CYC-202)
GSK-1838705A
Lenalidomide
KI20227
Vemurafenib (PLX-4032)
Nutlin 3a
Ponatinib (AP24534)
Tivozanib (AV-951)
Volasertib (BI-6727)
Dovitinib (CHIR-258)
JNJ-28312141
LY-333531
Ibrutinib (PCI-32765)
Tandutinib (MLN518)
GDC-0941
XAV-939
MGCD-265
NF-kB Activation Inhibitor
SGX-523
Doramapimod (BIRB 796)
PHT-427
BEZ235
Quizartinib (AC220)
Lenvatinib
Saracatinib (AZD0530)
Panobinostat
SR9011
Vandetanib (ZD6474)
JQ1
TG101348
SB

In [8]:
result_df = pd.DataFrame(result_list)
result_df.columns = gene_list_remain +  ["RSS", "intercept"]
result_df.index = Drug_list


In [9]:
result_df.to_csv("S2_Expr_drug_lasso_Coef_features_over0_gt50p.csv")

In [10]:
x = result_df.transpose()


In [11]:
# KG formating
Drug_list_all =[]
Gene_list_all = []
coef_list_all = []


for Drug in list(x.columns):
    Drug_cur_list =[]
    Features_sele = list(x.loc[x[Drug]!=0].index)
    Features_sele[-2] = Drug + '_' +Features_sele[-2]
    Features_sele[-1] = Drug + '_' +Features_sele[-1]
    coef = list(x.loc[x[Drug]!=0][Drug].values)
    Drug_cur_list = [Drug] * len(Features_sele)
    
    Drug_list_all = Drug_list_all + Drug_cur_list
    Gene_list_all = Gene_list_all + Features_sele
    coef_list_all = coef_list_all + coef
    
gene_symbol_list = []
for gene in Gene_list_all:
    if gene in dic_gene_map:
        gene_symbol_list.append(dic_gene_map[gene])
    else:
        gene_symbol_list.append(gene)
result_format = pd.DataFrame({"Drug":Drug_list_all,
                              "Gene":Gene_list_all, 
                              "Symbol":gene_symbol_list,
                              "coef":coef_list_all, 
                              "Method":['Lasso'] *len(coef_list_all), 
                              "Dataset":['BeatAML']*len(coef_list_all)})
#

In [12]:
result_format.to_csv("S2_KG_Expr_drug_lasso_Coef_features_over0_gt50p.csv") # Supplemental table 2

# End