In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier

In [2]:
import tensorflow as tf
import xgboost
from tensorflow import keras
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import callbacks
from sklearn.metrics import log_loss
from keras.regularizers import L1, L2

In [5]:
import pandas as pd
import numpy as np
# Load the training features and the scored targets, each indexed by sig_id.
drug = pd.read_csv('C:/Users/adwai/Desktop/EAI 6000/lish-moa/train_features.csv').set_index('sig_id')
target = pd.read_csv('C:/Users/adwai/Desktop/EAI 6000/lish-moa/train_targets_scored.csv').set_index('sig_id')

# Treatment-only subset (cp_type == "trt_cp") and the target rows with the same sig_ids.
treat_drug = drug.loc[drug['cp_type'] == "trt_cp"]
treat_target = target.loc[treat_drug.index]

In [6]:
# Column-name lists: categorical features, numerical features, gene expression
# features ('g-' prefix) and cell viability features ('c-' prefix).
cat_cols = list(drug.select_dtypes(include = 'O').columns)
num_cols = list(drug.select_dtypes(exclude = 'O').columns)
# cp_time has a non-object dtype but is handled as a category, so take it out
# of the numerical list and add it to the extended categorical list instead.
num_cols.remove('cp_time')
gene_features = [name for name in num_cols if name.startswith('g-')]
cell_viability = [name for name in num_cols if name.startswith('c-')]
cat_cols2 = [*cat_cols, 'cp_time']
# num_cols2 is the same list object as num_cols (an alias, not a copy).
num_cols2 = num_cols

In [7]:
# Data preprocessing: label-encode 'cp_dose', 'cp_time' and 'cp_type' (or drop the
# 'cp_type' column), with optional quantile transformation of the numerical features.

# Single module-level transformer: data_preprocessing fits it or reuses it,
# so every call shares the same fitted state.
qt = QuantileTransformer()

def data_preprocessing(dataframe, only_treatment = True, fit = False, transform = False):
    """Return an encoded (and optionally quantile-transformed) copy of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``cp_type``, ``cp_time`` and ``cp_dose`` columns. When ``fit``
        or ``transform`` is set it must also hold every column in ``num_cols``.
    only_treatment : bool
        If True the ``cp_type`` column is dropped (rows are NOT filtered);
        otherwise ``cp_type`` is encoded as trt_cp -> 1, ctl_vehicle -> 0.
    fit : bool
        Fit the shared ``qt`` transformer on ``num_cols`` and transform them.
    transform : bool
        Transform ``num_cols`` with the already fitted ``qt``. Ignored when
        ``fit`` is True so the data is never transformed twice.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unmodified.
    """
    df = dataframe.copy()
    if fit:
        df[num_cols] = qt.fit_transform(df[num_cols])
    elif transform:
        df[num_cols] = qt.transform(df[num_cols])
    # Values outside these mappings become NaN.
    df["cp_dose"] = df["cp_dose"].map({"D1": 0, "D2": 1})
    df["cp_time"] = df["cp_time"].map({24: 0, 48: 1, 72: 2})
    if only_treatment:
        # Keyword form: the positional ``axis`` argument is rejected by pandas 2.x.
        df = df.drop(columns = "cp_type")
    else:
        df["cp_type"] = df["cp_type"].map({"trt_cp": 1, "ctl_vehicle": 0})
    return df


# Fit the shared quantile transformer once, on the full training features.
drug_cleaned = data_preprocessing(dataframe= drug, only_treatment= False, fit= True, transform= False)
# Reuse the transformer fitted above instead of refitting it on the same frame.
# NOTE(review): only_treatment=True drops the cp_type column but keeps the
# ctl_vehicle rows, because `drug` (not `treat_drug`) is passed in - confirm
# that this is intended.
drug_treatment = data_preprocessing(dataframe= drug, only_treatment= True, fit= False, transform= True)

In [8]:
# Defining NN model to be optimized using Optuna hyperparameter optimization:

def for_bayes_optimization2(dimension):
    """Build and train the four-hidden-layer dense network for one hyperparameter set.

    The training and validation data are NOT arguments: the function reads the
    module-level names ``x_train``, ``y_train``, ``x_val`` and ``y_val``, which
    must be assigned before it is called.

    Parameters
    ----------
    dimension : sequence of 12 values
        ``[dl1, dl2, dl3, dl4, dp1, dp2, dp3, dp4, regu, regu_val, activation,
        learning_rate]``: the four hidden-layer widths, the dropout rate after
        each hidden layer, the activity regularizer kind for the first layer
        (``'l1'``, ``'l2'`` or ``None``) and its strength, the hidden activation
        and the Adam learning rate.

    Returns
    -------
    The fitted Keras model. The validation log loss is also computed so the
    return statement can be switched to it when the function is used as an
    Optuna objective.

    Raises
    ------
    ValueError
        If ``regu`` is not ``'l1'``, ``'l2'`` or ``None``.
    """
    [dl1, dl2, dl3, dl4, dp1, dp2, dp3, dp4, regu, regu_val, activation, learning_rate] = dimension

    # Resolve the regularizer up front so an unknown value fails with a clear
    # message instead of an unbound-variable error further down.
    if regu is None:
        act_reg = None
    elif regu == 'l1':
        act_reg = keras.regularizers.l1(regu_val)
    elif regu == 'l2':
        act_reg = keras.regularizers.l2(regu_val)
    else:
        raise ValueError(f"regu must be 'l1', 'l2' or None, got {regu!r}")

    es = callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 1e-4, mode = 'min', baseline = 0.3,
                                 restore_best_weights = False, patience = 30, verbose = 0)

    adam = keras.optimizers.Adam(learning_rate = learning_rate)

    model = Sequential()
    # Only the first hidden layer carries the activity regularizer.
    model.add(Dense(dl1, input_dim = x_train.shape[1], activation = activation, activity_regularizer = act_reg))
    model.add(Dropout(dp1))
    model.add(Dense(dl2, activation = activation))
    model.add(Dropout(dp2))
    model.add(Dense(dl3, activation = activation))
    model.add(Dropout(dp3))
    model.add(Dense(dl4, activation = activation))
    model.add(Dropout(dp4))
    # One sigmoid unit per target column (multi-label output).
    model.add(Dense(y_train.shape[1], activation = 'sigmoid'))

    model.compile(optimizer = adam, loss = 'binary_crossentropy', metrics = ['AUC'])

    model.fit(x = x_train, y = y_train, validation_data = (x_val, y_val), epochs = 200, batch_size = 128, callbacks = [es], verbose = 0)

    # predict_proba is deprecated for Keras models; predict gives the sigmoid outputs.
    log_loss_data = log_loss(np.ravel(y_val), np.ravel(model.predict(x_val)), eps = 1e-7)

    return model # or return log_loss_data (for optuna optimization)

In [9]:
# Best parameters obtained from Optuna, in the order unpacked by for_bayes_optimization2:
# [dl1, dl2, dl3, dl4, dp1, dp2, dp3, dp4, regu, regu_val, activation, learning_rate]
best_set_from_baysian_optimization = [2048, 1982, 708, 470, 0.6067766671093088, 0.1, 0.4973213653064633, 0.5950996340056243, 'l1', 1e-05, 'swish', 0.0001]

In [10]:
# Preparation of the submission inputs: raw test features, the sample
# submission used as the prediction template, and the preprocessed test features.
# submission_test itself is left with its default integer index.
submission_test = pd.read_csv('C:/Users/adwai/Desktop/EAI 6000/lish-moa/test_features.csv')
submission_test_prob = pd.read_csv(
    'C:/Users/adwai/Desktop/EAI 6000/lish-moa/sample_submission.csv'
).set_index('sig_id')
# transform=True reuses the already fitted quantile transformer on the test rows.
submission_test_cleaned = data_preprocessing(
    dataframe = submission_test,
    only_treatment = False,
    fit = False,
    transform = True,
).set_index('sig_id')
submission_test_cleaned

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,1,0,0,0.217396,0.603225,0.217863,0.738676,0.937020,0.424264,0.385859,...,0.557112,0.884263,0.429596,0.401706,0.386858,0.440853,0.512012,0.475851,0.578194,0.189724
id_001897cda,1,2,0,0.403428,0.661388,0.878769,0.239158,0.302797,0.348047,0.012098,...,0.442686,0.401863,0.149864,0.106197,0.323323,0.260273,0.519339,0.280718,0.132533,0.201157
id_002429b5b,0,0,0,0.605850,0.449502,0.270791,0.577034,0.043516,0.638351,0.324853,...,0.389058,0.686103,0.113394,0.909830,0.083190,0.142849,0.898008,0.945361,0.766178,0.407472
id_00276f245,1,0,1,0.735265,0.640710,0.642988,0.731286,0.200509,0.125268,0.777114,...,0.572455,0.586182,0.412316,0.131242,0.296482,0.163220,0.883970,0.436042,0.801291,0.244133
id_0027f1083,1,1,0,0.286286,0.073566,0.951422,0.619119,0.200035,0.499416,0.744451,...,0.768100,0.871304,0.422532,0.950751,0.796134,0.956814,0.352019,0.260275,0.944870,0.998389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_ff7004b87,1,0,0,0.725433,0.244736,0.992029,0.166878,0.858366,0.963123,0.537096,...,0.123297,0.225675,0.302398,0.517580,0.222196,0.281031,0.113952,0.122501,0.302659,0.220102
id_ff925dd0d,1,0,0,0.199220,0.388811,0.978225,0.693815,0.739598,0.668148,0.156614,...,0.516651,0.802818,0.250421,0.895478,0.915573,0.976929,0.642878,0.363325,0.485690,0.461021
id_ffb710450,1,2,0,0.285970,0.441744,0.592303,0.146216,0.526360,0.737286,0.492133,...,0.742385,0.916616,0.408548,0.726392,0.921892,0.760442,0.524992,0.815182,0.301679,0.430848
id_ffbb869f2,1,1,1,0.054888,0.026675,0.268389,0.906907,0.052459,0.395992,0.312918,...,0.672005,0.293344,0.189398,0.887304,0.669002,0.580393,0.367479,0.779530,0.875460,0.483364


In [11]:
# Start every prediction in the template at zero.
zero_predictions = np.zeros(submission_test_prob.shape)
submission_test_prob.iloc[:] = zero_predictions
submission_test_prob

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_001897cda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_00276f245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_0027f1083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_ff7004b87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_ff925dd0d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_ffb710450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_ffbb869f2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Reset every prediction in the template to zero (same operation as the
# preceding cell; running it again changes nothing).
submission_test_prob.iloc[:] = np.full(submission_test_prob.shape, 0.0)
submission_test_prob

Unnamed: 0_level_0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_0004d9e33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_001897cda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_00276f245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_0027f1083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
id_ff7004b87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_ff925dd0d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_ffb710450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
id_ffbb869f2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Cross-validated NN + XGBoost ensemble. Every fold's predictions on the test
# features are accumulated in sub_file_all_predict for the submission file.
n_splits = 5
seeds = [10, 20, 30]  # three different seeds
sub_file = submission_test_cleaned
sub_file_all_predict = np.zeros(submission_test_prob.shape)
nn_loss = [] # neural network loss
xgb_loss = [] # xgb loss
combined_loss = [] # loss of ensemble of NN and XGB
for seed in seeds:
    # Seed TensorFlow too, so the seed drives the network's random draws and
    # not just the fold shuffle (GPU kernels may still add small run-to-run noise).
    tf.random.set_seed(seed)

    folds = KFold(n_splits = n_splits, shuffle = True, random_state = seed)
    for e, (train, val) in enumerate(folds.split(drug_cleaned, target)):
        # for_bayes_optimization2 reads x_train, y_train, x_val and y_val as
        # globals, so these exact names must be kept.
        x_train, y_train = drug_cleaned.iloc[train], target.iloc[train]
        x_val, y_val = drug_cleaned.iloc[val], target.iloc[val]

        model = for_bayes_optimization2(best_set_from_baysian_optimization)

        # predict_proba is deprecated for Keras models; predict gives the sigmoid outputs.
        nn_predict = model.predict(x_val)
        sub_file_nn_predict = model.predict(sub_file)

        nn_loss_temp = log_loss(np.ravel(y_val), np.ravel(nn_predict), eps = 1e-7)
        nn_loss.append(nn_loss_temp)
        print(f"NN_log_loss fold {e}, seed {seed}: ", nn_loss_temp)

        # Parameters obtained after optimization with Optuna
        xgb = MultiOutputClassifier(XGBClassifier(tree_method = 'gpu_hist', n_estimators = 130, max_depth = 3, reg_alpha = 2, min_child_weight = 2,
                                                  gamma = 3, learning_rate = 0.0580666601841646, colsample_bytree = 0.58))
        xgb.fit(x_train, y_train)
        # predict_proba yields one (n_samples, 2) array per target; keep the
        # positive-class column and transpose to (n_samples, n_targets).
        xgb_predict = np.array(xgb.predict_proba(x_val))[:, :, 1].T
        xgb_loss_temp = log_loss(np.ravel(y_val), np.ravel(xgb_predict), eps = 1e-7)
        xgb_loss.append(xgb_loss_temp)

        sub_file_xgb_predict = np.array(xgb.predict_proba(sub_file))[:, :, 1].T
        # Equal-weight blend of the two models on the test features.
        avg_sub_file_predict = (sub_file_nn_predict + sub_file_xgb_predict)/2

        sub_file_all_predict = sub_file_all_predict + avg_sub_file_predict

        combined_loss_temp  = log_loss(np.ravel(y_val), np.ravel((nn_predict + xgb_predict)/2), eps = 1e-7)
        combined_loss.append(combined_loss_temp)

        print(f"xgb_log_loss fold {e}, seed {seed}: ", xgb_loss_temp)
        print(f"combined_loss fold {e}, seed {seed}: ", combined_loss_temp)

Instructions for updating:
Please use `model.predict()` instead.
NN_log_loss fold 0, seed 10:  0.015020919174596525
xgb_log_loss fold 0, seed 10:  0.01566260838977044
combined_loss fold 0, seed 10:  0.014741510938374815
NN_log_loss fold 1, seed 10:  0.015257024776988173
xgb_log_loss fold 1, seed 10:  0.016033999369712403
combined_loss fold 1, seed 10:  0.015177180927144618
NN_log_loss fold 2, seed 10:  0.015525626963964036
xgb_log_loss fold 2, seed 10:  0.016044754093472193
combined_loss fold 2, seed 10:  0.015094921314254404
NN_log_loss fold 3, seed 10:  0.015710833127656896
xgb_log_loss fold 3, seed 10:  0.016146240835265935
combined_loss fold 3, seed 10:  0.015310830377399264
NN_log_loss fold 4, seed 10:  0.01567757226200526
xgb_log_loss fold 4, seed 10:  0.016137118041586546
combined_loss fold 4, seed 10:  0.015269911834963203
NN_log_loss fold 0, seed 20:  0.015447582953109853
xgb_log_loss fold 0, seed 20:  0.016059235614934674
combined_loss fold 0, seed 20:  0.015198326566748419
N

In [None]:
# Mean and standard deviation of the per-fold log losses for each model.
for label, losses in (
    ("Average log loss of NN is :", nn_loss),
    ("Average log loss of Xgboost is :", xgb_loss),
    ("Combined log loss is :", combined_loss),
):
    print(label, np.mean(losses), " and standard deviation: ", np.std(losses))

In [None]:
# sub_file_all_predict is a running sum with one term per fold per seed;
# the 3 is the number of seeds used by the training loop.
total_fits = n_splits * 3
final_predictions = sub_file_all_predict / total_fits

In [None]:
# Overwrite every cell of the template with the averaged ensemble predictions.
submission_test_prob.iloc[:] = final_predictions

In [None]:
# Keep every probability inside [0.0005, 0.99].
submission_test_prob = submission_test_prob.clip(lower = 0.0005, upper = 0.99)
# Control (ctl_vehicle) rows are set to 0 for every target. A positional boolean
# mask is used so this does not rely on submission_test having a default
# integer index; it assumes submission_test and submission_test_prob list the
# same samples in the same row order.
is_control = (submission_test['cp_type'] == "ctl_vehicle").to_numpy()
submission_test_prob.loc[is_control] = 0.0

In [None]:
# Write the ensemble predictions, with the sig_id index, to the submission file.
submission_path = 'C:/Users/adwai/Desktop/EAI 6000/submission.csv'
submission_test_prob.to_csv(submission_path)

In [None]:
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
SEED = 255
MOA_DIR = "C:/Users/adwai/Desktop/EAI 6000/lish-moa"


def _encode_treatment_columns(frame):
    """Return a copy of ``frame`` with cp_type, cp_time and cp_dose mapped to numbers."""
    return frame.assign(
        cp_type = frame['cp_type'].map({"trt_cp" : 1, "ctl_vehicle" : 0}),
        cp_time = frame['cp_time'].map({24 : 0, 48 : 0.5, 72 : 1}),
        cp_dose = frame['cp_dose'].map({'D1' : 0, 'D2' : 1}),
    )


# --- Training data -----------------------------------------------------------
moa_df     = pd.read_csv(f"{MOA_DIR}/train_features.csv", index_col = 0)
moa_labels = pd.read_csv(f"{MOA_DIR}/train_targets_scored.csv", index_col = 0)

moa_df = _encode_treatment_columns(moa_df)

# Columns 0-2 are cp_type, cp_time and cp_dose (already in [0, 1]); only the
# remaining feature columns are min-max scaled.
scaler = MinMaxScaler()
moa_df.iloc[:,3:] = scaler.fit_transform(moa_df.iloc[:,3:])

X = moa_df.values
y = moa_labels.values

# --- Model -------------------------------------------------------------------
PARAMS = {
    "n_estimators": 767,
    "gamma": 1,
    "alpha": 0.5723441911553176,
    "lambda": 2,
    "max_depth": 8,
    "subsample": 0.8072479047241428,
    "learning_rate": 0.009879006755753497,
    "min_child_weight": 7,
    "max_delta_step": 10
}

# One binary XGBoost classifier per target column.
clf = OneVsRestClassifier(XGBClassifier(tree_method = 'gpu_hist',
                                        gpu_id = 0,
                                        objective = "binary:logistic",
                                        random_state = SEED,
                                        **PARAMS))

clf.fit(X, y)

# --- Test data and submission --------------------------------------------------
test = pd.read_csv(f"{MOA_DIR}/test_features.csv", index_col = 0)

test = _encode_treatment_columns(test)
# Reuse the scaler fitted on the training features.
test.iloc[:,3:] = scaler.transform(test.iloc[:,3:])

X_test = test.values

y_pred = clf.predict_proba(X_test)

# NOTE(review): unlike the NN + XGBoost ensemble cells earlier in this notebook,
# these predictions are neither clipped nor zeroed for ctl_vehicle rows - confirm
# whether that is intended.
submission = pd.DataFrame(y_pred, index = test.index, columns = moa_labels.columns)

submission.to_csv(f"{MOA_DIR}/submission.csv")

# Last expression of the cell, so the preview is actually rendered.
submission.head()