In [1]:
# example of a super learner model for binary classification
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# create a list of base models
def get_models():
  """Build the unfitted base learners used by the super learner.

  Returns:
    list: two estimators, in this order --
      * an RBF-kernel ``SVC`` with ``probability=True`` so that it exposes
        ``predict_proba``;
      * an ``XGBClassifier`` with depth 9, learning rate 0.4, gamma 3 and
        100 boosting rounds.
  """
  models = []
  models.append(SVC(kernel='rbf', probability=True, random_state=0))
  # `n_rounds` is not a parameter of the scikit-learn wrapper (it was passed
  # through and ignored); the number of boosting rounds is `n_estimators`.
  # `learning_rate` is the wrapper's name for the native `eta` alias.
  models.append(XGBClassifier(max_depth=9, learning_rate=0.4, gamma=3,
                              n_estimators=100, random_state=0))
  return models

In [3]:
# collect out-of-fold predictions from k-fold cross validation
def get_out_of_fold_predictions(X, y, models, n_splits=10, random_state=0):
	"""Collect out-of-fold class probabilities to train the meta model.

	The rows are split with a shuffled ``KFold``. For each fold, every model
	in ``models`` is fitted on the training part and its ``predict_proba``
	output for the held-out part is recorded. Within a fold the per-model
	outputs are placed side by side (one group of columns per model, in list
	order); the folds are then stacked vertically in the order they are
	visited, and ``meta_y`` follows that same row order.

	Args:
		X: feature matrix with one row per sample; anything
			``numpy.asarray`` accepts.
		y: labels aligned with the rows of ``X``.
		models: estimators exposing ``fit`` and ``predict_proba``. They are
			fitted in place and are left fitted on the last fold's
			training part.
		n_splits: number of folds (10 was previously hard-coded).
		random_state: seed for the fold shuffle. The fixed default makes
			reruns produce the same folds; pass ``None`` for an unseeded
			shuffle.

	Returns:
		tuple: ``(meta_X, meta_y)`` where ``meta_X`` is the stacked
		probability matrix and ``meta_y`` the matching labels as an array.
	"""
	# The fold indices are positional, so index plain arrays; a list would
	# raise and a DataFrame would be indexed by label instead.
	X, y = asarray(X), asarray(y)
	meta_X, meta_y = list(), list()
	# define split of data
	kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
	# enumerate splits
	for train_ix, test_ix in kfold.split(X):
		# get data
		train_X, test_X = X[train_ix], X[test_ix]
		train_y, test_y = y[train_ix], y[test_ix]
		meta_y.extend(test_y)
		# fit and make predictions with each sub-model
		fold_yhats = list()
		for model in models:
			model.fit(train_X, train_y)
			fold_yhats.append(model.predict_proba(test_X))
		# store fold yhats as columns
		meta_X.append(hstack(fold_yhats))
	return vstack(meta_X), asarray(meta_y)

In [4]:
# train every base learner on the complete training set
def fit_base_models(X, y, models):
	"""Fit each estimator in ``models`` on ``(X, y)``.

	The estimators are fitted in place, in iteration order, by calling
	their ``fit`` method; nothing is returned.
	"""
	for estimator in models:
		estimator.fit(X, y)

In [5]:
# train the meta learner on the stacked base-model outputs
def fit_meta_model(X, y):
	"""Create a liblinear ``LogisticRegression``, fit it on ``(X, y)`` and return it.

	Args:
		X: training inputs for the meta learner.
		y: training targets aligned with ``X``.

	Returns:
		The fitted ``LogisticRegression`` instance.
	"""
	meta_learner = LogisticRegression(solver='liblinear')
	meta_learner.fit(X, y)
	return meta_learner

In [6]:
# run the stacked model: base-model probabilities feed the meta model
def super_learner_predictions(X, models, meta_model):
	"""Return the meta model's ``predict_proba`` output for ``X``.

	Each estimator in ``models`` produces ``predict_proba(X)``; the outputs
	are joined column-wise in list order and passed to
	``meta_model.predict_proba``.

	Args:
		X: inputs handed unchanged to every base model.
		models: fitted estimators exposing ``predict_proba``.
		meta_model: fitted estimator exposing ``predict_proba``.

	Returns:
		Whatever ``meta_model.predict_proba`` returns for the stacked inputs.
	"""
	base_probabilities = [estimator.predict_proba(X) for estimator in models]
	stacked_features = hstack(base_probabilities)
	return meta_model.predict_proba(stacked_features)

In [7]:
"""#Upload the dataset here
from google.colab import files
uploaded = files.upload()"""

'#Upload the dataset here\nfrom google.colab import files\nuploaded = files.upload()'

In [8]:
import io  # NOTE(review): `io` is not referenced by any cell visible in this notebook
import pandas as pd
# Load the training table; in the displayed frame the last column is `PZA`.
# NOTE(review): the frame also has a leading "Unnamed: 0" column -- presumably
# a row index that was saved into the CSV -- and the later `df.iloc[:, :-1]`
# keeps it as a feature. Confirm that is intended; if not, it has to be dropped
# consistently for both the training and the test file.
df = pd.read_csv('X_trainData_column_modified_PZA.csv') #Enter the file name here.
# Dataset is now stored in a Pandas Dataframe

In [9]:
df

Unnamed: 0,DEL_CF_410280_d918T_307_iniB,DEL_CF_4408101_d102C_34_gid,DEL_F_4408101_d101C_gid_G34G,INS_CF_4242820_i2957G_986_embC,INS_F_409772_i410ATCT_iniB_G137G,INS_F_4247020_i506CC_embB_G169G,INS_F_4247970_i1456GT_embB_G486G,INS_N_4243642_i409GTCCCGGGGCGCCAC_embA_S137S,INS_P_3074519_G.117_thyA,INS_P_3074521_G.115_thyA,SNP_CN_1673449_A10C_T4P_fabG1,SNP_CN_1674263_T62C_I21T_inhA,SNP_CN_1674481_T280G_S94A_inhA,SNP_CN_1674952_C751G_P251A_inhA,SNP_CN_2102891_A152G_F51S_ndh,SNP_CN_2102990_A53G_V18A_ndh,SNP_CN_2154613_TC_katG_Q500R,SNP_CN_2154724_C1388A_R463L_katG,SNP_CN_2155167_GT_katG_S315R,SNP_CN_2155168_C944G_S315T_katG,SNP_CN_2155168_C944T_S315N_katG,SNP_CN_2155276_CT_katG_G279D,SNP_CN_2289040_AC_pncA_W68G,SNP_CN_2289090_TC_pncA_H51R,SNP_CN_2289099_TG_pncA_K48T,SNP_CN_2289180_A62C_V21G_pncA,SNP_CN_2289202_A40G_C14R_pncA,SNP_CN_2518836_AC_kasA_E241A,SNP_CN_2518839_CT_kasA_A242V,SNP_CN_2518919_G805A_G269S_kasA,SNP_CN_2519048_G934A_G312S_kasA,SNP_CN_2726051_G37A_L13F_oxyR',SNP_CN_2726338_T146G_V49G_ahpC,SNP_CN_2726338_TGGT146-149GGGG_VV49-50GG_ahpC,SNP_CN_2726350_G158T_W53L_ahpC,SNP_CN_2726409_GC_ahpC_D73H,SNP_CN_2726737_A545C_D182A_ahpC,SNP_CN_3073805_CA_thyA_E223*,SNP_CN_3073868_T604C_T202A_thyA,SNP_CN_3074465_GC_thyA_P3A,...,pncA_C_snp,eis_NC_indel,ethA_C_snp,katG_F_indel,iniB_F_indel,gyrA-gyrB_NC_snp,eis_C_snp,iniA-iniB-iniC_NC_snp,embA-embB_NC_snp,rpoB_F_indel,alr_C_snp,rpsA_NF_indel,ddl_C_snp,embB_C_snp,rpsL_NC_snp,embA_C_snp,kasA_C_snp,rpoB_C_snp,iniC_C_snp,gyrB_C_snp,iniA_F_indel,eis_NF_indel,kasA_NF_indel,ndh_NC_snp,rrl_F_indel,pncA_F_indel,fabG1-inhA_NC_snp,rpsA_NC_snp,embA_F_indel,thyA_C_snp,ahpC_NC_snp,ahpC_C_snp,gid_C_snp,iniA_C_snp,gyrB_F_indel,embC_C_snp,eis_NC_snp,rpsA_C_snp,rrl-rrs_NC_snp,PZA
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,1,0,0,1,1,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2936,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2938,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2939,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [10]:
# Feature matrix: every column of `df` except the last, as a NumPy array.
X = df.iloc[:, :-1].to_numpy()
X.shape

(2941, 219)

In [11]:
# Target vector: the last column of `df`, as a NumPy array.
y = df.iloc[:, -1].to_numpy()
y.shape

(2941,)

In [12]:
# build the (unfitted) base models
models = get_models()
# Collect out-of-fold probabilities to train the meta model on.
# Note: this fits the estimators in `models` in place, once per fold.
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
# Sanity check: one row per training sample in both arrays.
print('Meta ', meta_X.shape, meta_y.shape)

Meta  (2941, 4) (2941,)


In [13]:
# Refit the base models on the full training set: the out-of-fold pass left
# them fitted on a single fold's training part only.
fit_base_models(X, y, models)

In [14]:
# Fit the meta model on the out-of-fold base-model probabilities (`meta_X`)
# and the matching labels (`meta_y`).
meta_model = fit_meta_model(meta_X, meta_y)

In [15]:
"""#Upload the dataset here
from google.colab import files
uploaded = files.upload()"""

'#Upload the dataset here\nfrom google.colab import files\nuploaded = files.upload()'

In [16]:
# Load the test table. Despite the name, `X_test` is the whole DataFrame: the
# displayed frame still carries a trailing `PZA` column (shown as "?") and the
# same leading "Unnamed: 0" column as the training frame.
X_test = pd.read_csv('final_X_testData_column_modified_PZA.csv') #Enter the file name here.
# Dataset is now stored in a Pandas Dataframe

In [17]:
import numpy as np

In [18]:
X_test

Unnamed: 0,DEL_CF_410280_d918T_307_iniB,DEL_CF_4408101_d102C_34_gid,DEL_F_4408101_d101C_gid_G34G,INS_CF_4242820_i2957G_986_embC,INS_F_409772_i410ATCT_iniB_G137G,INS_F_4247020_i506CC_embB_G169G,INS_F_4247970_i1456GT_embB_G486G,INS_N_4243642_i409GTCCCGGGGCGCCAC_embA_S137S,INS_P_3074519_G.117_thyA,INS_P_3074521_G.115_thyA,SNP_CN_1673449_A10C_T4P_fabG1,SNP_CN_1674263_T62C_I21T_inhA,SNP_CN_1674481_T280G_S94A_inhA,SNP_CN_1674952_C751G_P251A_inhA,SNP_CN_2102891_A152G_F51S_ndh,SNP_CN_2102990_A53G_V18A_ndh,SNP_CN_2154613_TC_katG_Q500R,SNP_CN_2154724_C1388A_R463L_katG,SNP_CN_2155167_GT_katG_S315R,SNP_CN_2155168_C944G_S315T_katG,SNP_CN_2155168_C944T_S315N_katG,SNP_CN_2155276_CT_katG_G279D,SNP_CN_2289040_AC_pncA_W68G,SNP_CN_2289090_TC_pncA_H51R,SNP_CN_2289099_TG_pncA_K48T,SNP_CN_2289180_A62C_V21G_pncA,SNP_CN_2289202_A40G_C14R_pncA,SNP_CN_2518836_AC_kasA_E241A,SNP_CN_2518839_CT_kasA_A242V,SNP_CN_2518919_G805A_G269S_kasA,SNP_CN_2519048_G934A_G312S_kasA,SNP_CN_2726051_G37A_L13F_oxyR',SNP_CN_2726338_T146G_V49G_ahpC,SNP_CN_2726338_TGGT146-149GGGG_VV49-50GG_ahpC,SNP_CN_2726350_G158T_W53L_ahpC,SNP_CN_2726409_GC_ahpC_D73H,SNP_CN_2726737_A545C_D182A_ahpC,SNP_CN_3073805_CA_thyA_E223*,SNP_CN_3073868_T604C_T202A_thyA,SNP_CN_3074465_GC_thyA_P3A,...,pncA_C_snp,eis_NC_indel,ethA_C_snp,katG_F_indel,iniB_F_indel,gyrA-gyrB_NC_snp,eis_C_snp,iniA-iniB-iniC_NC_snp,embA-embB_NC_snp,rpoB_F_indel,alr_C_snp,rpsA_NF_indel,ddl_C_snp,embB_C_snp,rpsL_NC_snp,embA_C_snp,kasA_C_snp,rpoB_C_snp,iniC_C_snp,gyrB_C_snp,iniA_F_indel,eis_NF_indel,kasA_NF_indel,ndh_NC_snp,rrl_F_indel,pncA_F_indel,fabG1-inhA_NC_snp,rpsA_NC_snp,embA_F_indel,thyA_C_snp,ahpC_NC_snp,ahpC_C_snp,gid_C_snp,iniA_C_snp,gyrB_F_indel,embC_C_snp,eis_NC_snp,rpsA_C_snp,rrl-rrs_NC_snp,PZA
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?
769,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?
770,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,?
771,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,?


In [19]:
# Test feature matrix: every column of `X_test` except the last, as a NumPy
# array (same column selection as used for the training matrix `X`).
X_val = X_test.iloc[:, :-1].to_numpy()
X_val.shape

(773, 219)

In [20]:
# Stacked prediction for the test rows: the result is the meta model's
# `predict_proba` output (the displayed array has two columns).
yhat = super_learner_predictions(X_val, models, meta_model)


In [21]:
yhat

array([[0.03191274, 0.96808726],
       [0.88544434, 0.11455566],
       [0.21912904, 0.78087096],
       ...,
       [0.02991633, 0.97008367],
       [0.02781514, 0.97218486],
       [0.02909031, 0.97090969]])

In [22]:
# Second column of the probability array (index 1).
# NOTE(review): not used by the later cells shown here, which rebuild the same
# column through a DataFrame.
y_hat_class_1 = yhat[:,1]

In [23]:
y_hat_class_1.shape

(773,)

In [24]:
# Column 1 of `yhat` as a pandas Series, ready to be written to disk.
result = pd.DataFrame(yhat).iloc[:, 1]

In [25]:
# Write the selected probability column to CSV, without the row index.
result.to_csv('PZA_results.csv',index = False)