In [1]:
# importing libraries
import numpy as np												# to deal with numpy arrays
import pandas as pd												# to work with dataframes
import math														# if needed
import statistics												# to get mean and stdev
import matplotlib.pyplot as plt									# used for plots
import seaborn as sns											# for violin curves
from sklearn.preprocessing import LabelEncoder					# encoding
from sklearn.preprocessing import MinMaxScaler, StandardScaler  # scaling
from ucimlrepo import fetch_ucirepo 							# to import 2nd dataset


In [2]:
# Takes the inputs X and the weights W, computes their dot product and passes it through a sigmoid function
def hypothesis(train_X, W):
	"""Return the sigmoid of the linear scores ``train_X . W^T``.

	Parameters:
		train_X: input matrix, one record per row.
		W: weight matrix; it is transposed before the product.

	Returns:
		Array of values in (0, 1) with the shape of ``np.dot(train_X, W.T)``.
	"""
	linear_scores = np.dot(train_X, W.T)
	# Logistic (sigmoid) squashing of the raw scores.
	return 1 / (1 + np.exp(-linear_scores))

In [3]:
# The function converts required columns to Label encoded columns and gets the columns for one-hot encoding
def Labeling(Features):
	"""Encode the non-numeric columns of a feature frame.

	Non-numeric columns with fewer than 3 distinct values are label-encoded;
	those with 3 or more are cast to ``category`` and one-hot encoded by
	``pd.get_dummies`` (integer 0/1 columns named ``<column>_<value>``).

	The input frame is left untouched: all work happens on a copy.

	Parameters:
		Features: pandas DataFrame of raw features.

	Returns:
		(encoded_frame, categorical_columns) where ``categorical_columns`` lists
		the original names of the columns that were one-hot encoded.
	"""
	# Work on a copy so the caller's DataFrame is not silently modified.
	Features = Features.copy()

	categorical_columns = []
	for col in Features.columns:
		if pd.api.types.is_numeric_dtype(Features[col]):
			continue
		if Features[col].nunique() >= 3:
			categorical_columns.append(col)
		else:
			# At most two distinct values: a single integer code is enough.
			Features[col] = LabelEncoder().fit_transform(Features[col])
			print(col)

	print(categorical_columns)
	# Convert each multi-valued column to categorical type
	for col in categorical_columns:
		Features[col] = Features[col].astype('category')
	# One-hot encode the data using pandas get_dummies
	Features = pd.get_dummies(Features, dtype=int)
	return Features, categorical_columns
	

In [4]:
# The function does scaling based on preference (1: MinMax Scaling, 2: Standard Scaling)
def scaling(dataframe, preference, categorical_columns):
	"""Scale every column except the one-hot columns, in place.

	A column counts as one-hot when its name starts with ``"<name>_"`` for some
	``name`` in ``categorical_columns`` (the naming used by ``pd.get_dummies``).
	A prefix test is used rather than a substring test so that an unrelated
	column such as ``marriage_x`` is not mistaken for a dummy of ``age``.

	Parameters:
		dataframe: pandas DataFrame; it is modified in place and also returned.
		preference: 1 selects MinMaxScaler, any other value StandardScaler.
		categorical_columns: original names of the one-hot encoded columns.

	Returns:
		The same ``dataframe`` object with the remaining columns scaled.
	"""
	one_hot_prefixes = tuple(str(name) + "_" for name in categorical_columns)
	# str.startswith with an empty tuple is always False, so no special case is needed.
	other_columns = [
		col for col in dataframe.columns
		if not str(col).startswith(one_hot_prefixes)
	]

	# Nothing to scale: return as-is instead of fitting a scaler on zero features.
	if not other_columns:
		return dataframe

	scaler = MinMaxScaler() if preference == 1 else StandardScaler()
	dataframe[other_columns] = scaler.fit_transform(dataframe[other_columns])
	return dataframe

In [5]:
# The function takes a W vector (LR model) and test data and returns, in this order: accuracy, binary predictions and probabilistic predictions
def get_accuracy_results(test_X, test_Y, W):
	"""Score a logistic-regression weight vector on labelled data.

	Parameters:
		test_X: numpy feature matrix, one record per row.
		test_Y: ground-truth 0/1 labels; either a flat vector or a single column.
		W: weight matrix handed to ``hypothesis``.

	Returns:
		(accuracy, predictions, results) where ``predictions`` is an (n, 1) float
		array of 0/1 decisions and ``results`` holds the raw probabilities.
	"""
	results = hypothesis(test_X, W)  # floating point probabilities
	# Threshold is taken as 0.5
	predictions = (results >= 0.5).astype(float).reshape(test_X.shape[0], 1)

	# Force a column vector: comparing (n, 1) with a flat (n,) array would
	# broadcast to an (n, n) matrix and inflate the count of correct predictions.
	labels = np.asarray(test_Y).reshape(-1, 1)

	correct_predictions = np.sum(predictions == labels)
	accuracy = correct_predictions / len(labels)
	return accuracy, predictions, results

In [6]:
# For cross validation, we need to calculate loss for validation set
def get_loss(test_Y, Predictions):
	"""Return the mean squared difference between labels and predictions."""
	residual = test_Y - Predictions
	return np.mean(np.multiply(residual, residual))

In [7]:
# Base learner - Logistic Regression classifier trained with mini-batch gradient updates
# Parameters:
#   alpha      - learning rate
#   iterations - number of mini-batch update steps
#   batch_size - mini-batch size (1000 by default)
#   model_seed - seed for the random initial W, so that experiments are reproducible

def logistic_regression(train_X, train_Y, model_seed, alpha=0.0005, iterations=300, batch_size=1000):
	"""Fit logistic-regression weights with mini-batch gradient updates.

	Each iteration takes the next contiguous slice of at most ``batch_size``
	records (wrapping to the start once the end is reached) and moves the
	weights by ``alpha * sum_i (y_i - p_i) * x_i``.

	Parameters:
		train_X: numpy feature matrix of shape (records, features).
		train_Y: 0/1 labels, one per record (flat vector or single column).
		model_seed: seed for the random initial weights. Note that this reseeds
			numpy's global random generator.
		alpha: learning rate.
		iterations: number of mini-batch update steps.
		batch_size: maximum number of records per step.

	Returns:
		Weight array of shape (1, features).

	Raises:
		ValueError: if ``train_X`` has no records.
	"""
	number_of_records, number_of_features = train_X.shape
	if number_of_records == 0:
		raise ValueError("train_X must contain at least one record")

	# Column vector, so that (labels - probabilities) stays (batch, 1).
	labels = np.asarray(train_Y).reshape(-1, 1)

	np.random.seed(seed=model_seed)
	init_W = np.random.rand(1, number_of_features)  # Initializing model weights

	start = 0
	# Clip the very first batch as well; otherwise a dataset smaller than
	# batch_size produced mismatched shapes on the first update.
	endIdx = min(start + batch_size, number_of_records)

	for _ in range(iterations):
		batch_X = train_X[start:endIdx]
		batch_Y = labels[start:endIdx]

		now = hypothesis(batch_X, init_W)  # here we get the probabilities

		# (1, batch) . (batch, features): every feature's update in one product.
		init_W = init_W + alpha * np.dot((batch_Y - now).T, batch_X)

		# Advance the window; wrap to 0 once the end of the data is reached.
		start = endIdx % number_of_records
		endIdx = min(start + batch_size, number_of_records)

	return init_W


In [8]:
# Two hyperparameters are tuned by grid search against a held-out validation set:
# alpha - learning rate
# itr   - number of iterations for which the model is fitted
def cross_validation(train_X, train_Y, validation_X, validation_Y, model_seed):
	"""Grid-search the learning rate and iteration count on a validation set.

	Every (alpha, iterations) pair is trained from scratch with batch size 1000
	and scored by ``get_loss`` on the validation probabilities; the first pair
	reaching the lowest loss wins.

	Returns:
		(optimal_W, optimal_alpha, optimal_itr). If no candidate ever produces a
		loss below infinity the fallbacks ``([], 0.0001, 50)`` are returned.
	"""
	learning_rates = [0.0001, 0.0005, 0.0007, 0.001, 0.005, 0.007]
	iteration_counts = [10, 50, 70, 100, 150, 200, 300, 450]

	lowest_loss = np.inf
	best = ([], 0.0001, 50)

	grid = [(rate, steps) for rate in learning_rates for steps in iteration_counts]
	for alpha, itr in grid:
		W = logistic_regression(train_X, train_Y, model_seed, alpha, itr, 1000)
		loss_on_validation = get_loss(validation_Y, hypothesis(validation_X, W))
		# Only a strictly smaller loss replaces the current best.
		if not (loss_on_validation < lowest_loss):
			continue
		lowest_loss = loss_on_validation
		best = (W, alpha, itr)

	return best

In [9]:
# Custom dataset splitting: the second returned pair (test_X, test_Y) holds the `ratio` fraction of the rows
def dataset_splitting(dataset_X, dataset_Y, ratio, splitting_seed=90):
	"""Shuffle row positions and split both frames into two parts.

	Parameters:
		dataset_X, dataset_Y: pandas objects with the same number of rows.
		ratio: fraction of rows (rounded down) placed in the second part.
		splitting_seed: seed for the shuffle. Note that this reseeds numpy's
			global random generator.

	Returns:
		(train_X, train_Y, test_X, test_Y) as numpy arrays.
	"""
	row_count = dataset_X.shape[0]
	held_out_count = int(row_count * ratio)

	np.random.seed(splitting_seed)
	shuffled_positions = np.arange(row_count)
	np.random.shuffle(shuffled_positions)

	# The first `held_out_count` shuffled positions form the held-out part.
	held_out = shuffled_positions[:held_out_count]
	kept = shuffled_positions[held_out_count:]

	def take(frame, positions):
		return frame.iloc[positions].to_numpy()

	return take(dataset_X, kept), take(dataset_Y, kept), take(dataset_X, held_out), take(dataset_Y, held_out)

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# to get different performance parameters 
# for auroc and aupr, predictions are needed rather than binary_predictions

def getParameters(truth_labels, binary_predictions, predictions):
	"""Compute the classification metrics for one set of predictions.

	Parameters:
		truth_labels: ground-truth labels (assumed to be encoded as 0/1).
		binary_predictions: hard 0/1 decisions, used for the confusion matrix.
		predictions: continuous scores, used for AUROC and AUPR.

	Returns:
		(recall, specificity, precision, f1_score, auroc, aupr). A ratio whose
		denominator is zero is reported as 0.
	"""
	# Pin the label set so the matrix is always 2x2, even when only one class
	# occurs in the data; otherwise the 4-way unpack below fails.
	tn, fp, fn, tp = confusion_matrix(truth_labels, binary_predictions, labels=[0, 1]).ravel()

	precision = tp / (tp + fp) if (tp + fp) != 0 else 0
	recall = tp / (tp + fn) if (tp + fn) != 0 else 0
	specificity = tn / (tn + fp) if (tn + fp) != 0 else 0

	f1_score = 0
	if (precision + recall) != 0:
		f1_score = 2 * (precision * recall) / (precision + recall)

	auroc = roc_auc_score(truth_labels, predictions)
	aupr = average_precision_score(truth_labels, predictions)

	return recall, specificity, precision, f1_score, auroc, aupr

In [11]:
# to get the mean of each performance metric across the 9 LR learners
def get_mean_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list):
	"""Return the arithmetic mean of each metric list, in argument order."""
	metric_lists = (
		accuracy_list,
		sensitivity_list,
		specificity_list,
		precision_list,
		f1_score_list,
		auroc_list,
		aupr_list,
	)
	return tuple(statistics.mean(values) for values in metric_lists)


In [12]:
# to get the standard deviation of each performance metric across the 9 LR learners
def get_stdev_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list):
	"""Return the sample standard deviation of each metric list, in argument order."""
	metric_lists = (
		accuracy_list,
		sensitivity_list,
		specificity_list,
		precision_list,
		f1_score_list,
		auroc_list,
		aupr_list,
	)
	return tuple(statistics.stdev(values) for values in metric_lists)


In [13]:
# to draw violin plots using sns library
def plot_violins(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list, number):
	"""Draw one violin per metric, save the figure and show it.

	The figure is written to ``violin_plot_dataset_<number>.png``.
	"""
	metric_names = ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'F1 Score', 'AUROC', 'AUPR']
	metric_values = [
		accuracy_list,
		sensitivity_list,
		specificity_list,
		precision_list,
		f1_score_list,
		auroc_list,
		aupr_list,
	]

	plt.figure(figsize=(12, 8), dpi=300)  # High resolution plot
	sns.violinplot(data=metric_values, inner="quart", palette="Set3")
	# Label each violin with the name of its metric.
	plt.xticks(range(len(metric_names)), metric_names, fontsize=12)
	plt.xlabel("Metrics", fontsize=14)
	plt.ylabel("Values", fontsize=14)
	plt.title("Violin Plots for Each Metric", fontsize=16)
	plt.savefig(f'violin_plot_dataset_{number}.png')
	plt.show()


In [14]:
# Results table: the first row is the header, later cells append one row of metrics each.
main_list = [['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'F1 Score', 'AUROC', 'AUPR']]

In [None]:
# ---------------- 	PREPROCESSING FOR DATASET-1 -----------------------
dataframe = pd.read_csv("dataset_1.csv")
dataframe.drop('customerID', axis=1, inplace=True)
# Entries of TotalCharges that are not numbers become NaN and are mean-imputed
# by the next statement.
dataframe['TotalCharges'] = pd.to_numeric(dataframe['TotalCharges'], errors='coerce')
# Replacing any null numeric attributes with the mean of that attribute column
dataframe.fillna(dataframe.mean(numeric_only=True), inplace=True)
dataframe.drop_duplicates(inplace=True)
dataframe.dropna(subset=['Churn'], inplace=True)
# Whatever is still missing (non-numeric columns) is filled with the column mode
for column in dataframe.columns:
	if dataframe[column].isnull().any():
		dataframe[column] = dataframe[column].fillna(dataframe[column].mode()[0])
# Dropping rows leaves gaps in the index. Labels_df below gets a fresh 0..n-1
# index and corrwith() aligns on index labels, so renumber the rows here to
# keep every feature row paired with its own label.
dataframe.reset_index(drop=True, inplace=True)


Features = dataframe.drop('Churn', axis=1)
Labels = dataframe['Churn']

Features, categorical_columns = Labeling(Features)

encoder = LabelEncoder()
labels_col = encoder.fit_transform(Labels)
Labels_df = pd.DataFrame(labels_col, columns=['Churn'])

# NOTE(review): scaling and feature selection are fitted on the full dataset,
# i.e. before the train/test split done in later cells.
Features_1 = scaling(Features,1, categorical_columns)
# correlation analysis of features with target
zero_var_columns = [col for col in Features_1.columns if Features_1[col].var() == 0]

target_labels = Labels_df['Churn']
Features_1_cleaned = Features_1.drop(columns=zero_var_columns)
correlations = Features_1_cleaned.corrwith(target_labels)

sorted_correlations = correlations.abs().sort_values(ascending=False)
top_20_columns = sorted_correlations.head(20).index	 # 20 features show good amount of correlations
X = Features_1_cleaned[top_20_columns]
y = Labels_df['Churn']
X.to_csv("Dataset_X_1.csv")
y.to_csv("Dataset_Y_1.csv")

In [None]:
# to fetch the adult dataset from the link
adult = fetch_ucirepo(id=2)
# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets
df = pd.DataFrame(X)
df2 = pd.DataFrame(y)
# Binary target: 1 when the raw label text contains '>50K', otherwise 0.
# Done column-wise instead of one iloc lookup per row.
income_list = (
	df2.iloc[:, 0]
	.astype(str)
	.str.contains('>50K', regex=False)
	.astype(int)
	.tolist()
)
df3 = pd.DataFrame(income_list, columns=['income'])
# concat aligns on the index, so make sure the features are numbered 0..n-1 like df3.
df_combined = pd.concat([df.reset_index(drop=True), df3], axis=1)
print(df_combined.shape)
df_combined.to_csv("dataset_2.csv")

In [None]:
# ----------------------- Dataset-2 PREPROCESSING ----------------------
# The first column of dataset_2.csv is the saved index; drop it.
dataframe = pd.read_csv("dataset_2.csv").iloc[:, 1:]
dataframe.drop_duplicates(inplace=True)
dataframe.dropna(subset=['income'], inplace=True)
# Fill every column that still has gaps with its mode
for column in dataframe.columns:
	if dataframe[column].isnull().any():
		dataframe[column] = dataframe[column].fillna(dataframe[column].mode()[0])
# Dropping rows leaves gaps in the index. Labels_df below gets a fresh 0..n-1
# index and corrwith() aligns on index labels, so renumber the rows here to
# keep every feature row paired with its own label.
dataframe.reset_index(drop=True, inplace=True)

Features = dataframe.drop('income', axis=1)
Labels = dataframe['income']
Features, categorical_columns = Labeling(Features)
# Use an encoder created in this cell rather than one left over from another cell.
encoder = LabelEncoder()
labels_col = encoder.fit_transform(Labels)
Labels_df = pd.DataFrame(labels_col, columns=['income'])

# NOTE(review): scaling and feature selection are fitted on the full dataset,
# i.e. before the train/test split done in later cells.
Features_1 = scaling(Features,1, categorical_columns)
# correlation analysis of features with target
zero_var_columns=[]
for col in Features_1.columns:
	variance = Features_1[col].var()
	if variance == 0:
		print(col)
		zero_var_columns.append(col)


target_labels = Labels_df['income']
Features_1_cleaned = Features_1.drop(columns=zero_var_columns)
correlations = Features_1_cleaned.corrwith(target_labels)

sorted_correlations = correlations.abs().sort_values(ascending=False)
top_20_columns = sorted_correlations.head(20).index
X = Features_1_cleaned[top_20_columns]
y = Labels_df['income']

X.to_csv("Dataset_X_2.csv")
y.to_csv("Dataset_Y_2.csv")

In [None]:
dataframe = pd.read_csv("creditcard.csv")
dataframe.drop_duplicates(inplace=True)
dataframe.dropna(subset=['Class'], inplace=True)
# Dropping rows leaves gaps in the index. Labels_df below gets a fresh 0..n-1
# index and corrwith() aligns on index labels, so renumber the rows here to
# keep every feature row paired with its own label.
dataframe.reset_index(drop=True, inplace=True)

Features = dataframe.drop('Class', axis=1)
Labels = dataframe['Class']

Features, categorical_columns = Labeling(Features)

encoder = LabelEncoder()
labels_col = encoder.fit_transform(Labels)
Labels_df = pd.DataFrame(labels_col, columns=['Class'])

# NOTE(review): scaling and feature selection are fitted on the full dataset,
# i.e. before the train/test split done in later cells.
Features_1 = scaling(Features,1,categorical_columns)
# correlation analysis of features with target
zero_var_columns = [col for col in Features_1.columns if Features_1[col].var() == 0]

target_labels = Labels_df['Class']
Features_1_cleaned = Features_1.drop(columns=zero_var_columns)
correlations = Features_1_cleaned.corrwith(target_labels)

sorted_correlations = correlations.abs().sort_values(ascending=False)
top_20_columns = sorted_correlations.head(20).index
X = Features_1_cleaned[top_20_columns]
y = Labels_df['Class']

X.to_csv("Dataset_X_3.csv")
y.to_csv("Dataset_Y_3.csv")



In [None]:
# skiprows=1 drops the header line; header=None keeps pandas from then using
# the first data row as column names, which would silently lose one sample.
# iloc[:, 1:] removes the saved index column.
temp_dataset_X = pd.read_csv("Dataset_X_1.csv", skiprows=1, header=None).iloc[:, 1:]
dataset_Y = pd.read_csv("Dataset_Y_1.csv", skiprows=1, header=None).iloc[:, 1:]
dummy_ones_column = np.ones((temp_dataset_X.shape[0],1)) # this creates a n*1 matrix of 1's
dataset_X = np.hstack((dummy_ones_column, temp_dataset_X)) # (n, m+1), the column of ones acts as the bias input
dataset_X=pd.DataFrame(dataset_X)
# 80/20 train/test split, then 80/20 train/validation split of the training part
init_train_X_1, init_train_Y_1, test_X_1, test_Y_1  = dataset_splitting(dataset_X, dataset_Y, 0.2, 96)
train_X_1, train_Y_1, validation_X_1, validation_Y_1 = dataset_splitting(pd.DataFrame(init_train_X_1), pd.DataFrame(init_train_Y_1), 0.2, 85)
W_1, alpha_1, itr_1 = cross_validation(train_X_1, train_Y_1, validation_X_1, validation_Y_1, 48)
accu, binary_predictions, predictions= get_accuracy_results(test_X_1, test_Y_1, W_1)
print("accuracy: ", accu, "optimal_alpha: ", alpha_1, "optimal_iteration_num: ", itr_1)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(test_Y_1, binary_predictions, predictions)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accu, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# ------------------------------- 9 LR MODELS FOR BAGGING -----------------------
from sklearn.metrics import recall_score

all_indices = np.arange(dataset_X.shape[0])
training_size = (int)(0.80 * dataset_X.shape[0])		# 80% training set
np.random.seed(48)
training_indices = np.random.choice(all_indices, size=training_size, replace=False)
# Held-out rows are the positions never drawn for training. setdiff1d returns
# them in ascending order and avoids a linear array scan per row.
test_indices = np.setdiff1d(all_indices, training_indices).tolist()

test_labels = dataset_Y.iloc[test_indices].to_numpy()
W_list = []
validation_X_list =[]
validation_Y_list =[]
# NOTE(review): seeds[4] and seeds[6] are both 56, so the 5th and 7th learners
# draw the same bootstrap sample and initial weights; only their
# train/validation split differs. Left unchanged so results stay comparable.
seeds = [27,20,30,74,56,76,56,78,90,10,23,49,56]
splitting_seeds = [96, 98, 32, 54, 10, 15, 28, 95, 12, 39, 48, 68, 71]
n_learners = 9

Testset_X = dataset_X.iloc[test_indices]
Testset_Y = dataset_Y.iloc[test_indices]

for i in range(n_learners):

	# Bootstrap sample (with replacement) of the training rows
	np.random.seed(seeds[i])
	random_indices = np.random.choice(training_indices, size=(training_size), replace=True)

	# Training + Validation sets ----------------
	dataset_X_i = dataset_X.iloc[random_indices]
	dataset_Y_i = dataset_Y.iloc[random_indices]
	# Training + Validation sets ----------------

	train_X, train_Y, validation_X, validation_Y = dataset_splitting(dataset_X_i, dataset_Y_i, 0.2, splitting_seeds[i])
	validation_X_list.append(validation_X)
	validation_Y_list.append(validation_Y)
	W, alpha, itr = cross_validation(train_X, train_Y, validation_X, validation_Y, seeds[i])
	W_list.append(W)


# ----------- We have 9 different LR models (W's) -------------
Binary_Prediction_list =[]
Prediction_list =[]
accuracy_list = []
sensitivity_list=[]
specificity_list=[]
precision_list=[]
f1_score_list=[]
auroc_list=[]
aupr_list=[]

Testset_X_values = Testset_X.to_numpy()
Testset_Y_values = Testset_Y.to_numpy()

# Running count of positive votes per test record
final_predictions = np.zeros((Testset_Y.shape[0],1))
for i in range(n_learners):
	accuracy, binary_predictions, predictions = get_accuracy_results(Testset_X_values, Testset_Y_values, W_list[i])

	print(accuracy)

	Binary_Prediction_list.append(binary_predictions)
	Prediction_list.append(predictions)

	final_predictions=final_predictions+binary_predictions

	accuracy_list.append(accuracy)
	sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, binary_predictions, predictions)
	print(sensitivity, specificity, precision, f1_score, auroc, aupr)
	sensitivity_list.append(sensitivity)
	specificity_list.append(specificity)
	precision_list.append(precision)
	f1_score_list.append(f1_score)
	auroc_list.append(auroc)
	aupr_list.append(aupr)


# Majority vote: positive when more than half of the learners (5 of 9) say so
majority = n_learners // 2 + 1
final_predictions = np.where(final_predictions >= majority, 1.0, 0.0)


correct_predictions = np.sum(final_predictions == Testset_Y_values)
accuracy = correct_predictions / Testset_Y.shape[0]
sensitivity = recall_score(Testset_Y, final_predictions)
print("Final Accuracy:", accuracy)
print("final Sensitivity: ", sensitivity)
print(sensitivity_list)

# Ensemble score per record = mean of the learners' probabilities (used for AUROC/AUPR)
stacked_predictions_1 = np.hstack(Prediction_list)
mean_predictions = np.mean(stacked_predictions_1, axis=1)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, final_predictions, mean_predictions)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accuracy, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# Mean and standard deviation of every metric across the individual learners
accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean = get_mean_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std = get_stdev_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
print("Mean Scores for Bagging:")
# Report f1_mean here: f1_score only holds the most recently computed single value.
print("Accuracy: ", accu_mean, "Sensitivity: ", sens_mean, "Specificity: ", spec_mean, "Precision: ", pre_mean, "F1_score: ", f1_mean, "AUROC: ", auroc_mean, "AUPR: ", aupr_mean)
print("Standard Deviations of Metrics for Bagging:")
print("Accuracy: ", accu_std, "Sensitivity: ", sens_std, "Specificity: ", spec_std, "Precision: ", pre_std, "F1_score: ", f1_std, "AUROC: ", auroc_std, "AUPR: ", aupr_std)
main_list.append([accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean])
main_list.append([accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std])

In [None]:
# Violin plot of the per-learner metrics; saved as violin_plot_dataset_1.png
plot_violins(
	accuracy_list=accuracy_list,
	sensitivity_list=sensitivity_list,
	specificity_list=specificity_list,
	precision_list=precision_list,
	f1_score_list=f1_score_list,
	auroc_list=auroc_list,
	aupr_list=aupr_list,
	number=1,
)

In [None]:
# preparing the validation data for input in stacking
# Pool the validation sets of all learners and drop repeated records
df_X = pd.DataFrame(np.vstack(validation_X_list))
df_Y = pd.DataFrame(np.vstack(validation_Y_list),columns=["Y"])
merged_dataframe = pd.concat([df_X, df_Y], axis=1).drop_duplicates()
# "Y" is the last column: everything before it is a feature
final_X = merged_dataframe.iloc[:, :-1].to_numpy()
final_Y = merged_dataframe[["Y"]].to_numpy()
print("Final X shape:", final_X.shape)
print("Final Y shape:", final_Y.shape)

In [None]:
# Meta-features: the original features followed by one column of binary
# predictions from each of the 9 base learners.
stacking_parts = [pd.DataFrame(final_X)]

for i in range(0,9):
	accuracy, predictions,_ = get_accuracy_results(final_X, final_Y, W_list[i])
	df_temp = pd.DataFrame(predictions)
	stacking_parts.append(df_temp)

dataset_stacking_X = pd.concat(stacking_parts, axis=1)

dataset_stacking_Y = pd.DataFrame(final_Y)
print(dataset_stacking_X.shape)
print(dataset_stacking_Y.shape)

In [None]:
# Hold out 20% of the stacking data for testing, then 20% of the rest for validation
init_train_X_stacking_1, init_train_Y_stacking_1, test_X_stacking_1, test_Y_stacking_1 = dataset_splitting(
	dataset_stacking_X, dataset_stacking_Y, 0.2, 96
)
train_X_stacking_1, train_Y_stacking_1, validation_X_stacking_1, validation_Y_stacking_1 = dataset_splitting(
	pd.DataFrame(init_train_X_stacking_1), pd.DataFrame(init_train_Y_stacking_1), 0.2, 34
)

# Tune and fit the meta-learner, then score it on the held-out part
W_stacking_1, alpha_stacking_1, itr_stacking_1 = cross_validation(
	train_X_stacking_1, train_Y_stacking_1, validation_X_stacking_1, validation_Y_stacking_1, 32
)
accu_stacking_1, binary_predictions_stacking_1, predictions_stacking_1 = get_accuracy_results(
	test_X_stacking_1, test_Y_stacking_1, W_stacking_1
)
print("accuracy: ", accu_stacking_1, "optimal_alpha: ", alpha_stacking_1, "optimal_iteration_num: ", itr_stacking_1)

sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(
	test_Y_stacking_1, binary_predictions_stacking_1, predictions_stacking_1
)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accu_stacking_1, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# skiprows=1 drops the header line; header=None keeps pandas from then using
# the first data row as column names, which would silently lose one sample.
# iloc[:, 1:] removes the saved index column.
temp_dataset_X = pd.read_csv("Dataset_X_2.csv", skiprows=1, header=None).iloc[:, 1:]
dataset_Y = pd.read_csv("Dataset_Y_2.csv", skiprows=1, header=None).iloc[:, 1:]
dummy_ones_column = np.ones((temp_dataset_X.shape[0],1)) # this creates a n*1 matrix of 1's
dataset_X = np.hstack((dummy_ones_column, temp_dataset_X)) # (n, m+1), the column of ones acts as the bias input
dataset_X=pd.DataFrame(dataset_X)
# 80/20 train/test split, then 80/20 train/validation split of the training part
init_train_X_1, init_train_Y_1, test_X_1, test_Y_1  = dataset_splitting(dataset_X, dataset_Y, 0.2, 96)
train_X_1, train_Y_1, validation_X_1, validation_Y_1 = dataset_splitting(pd.DataFrame(init_train_X_1), pd.DataFrame(init_train_Y_1), 0.2, 85)
W_1, alpha_1, itr_1 = cross_validation(train_X_1, train_Y_1, validation_X_1, validation_Y_1, 48)
accu, binary_predictions, predictions= get_accuracy_results(test_X_1, test_Y_1, W_1)
print("accuracy: ", accu, "optimal_alpha: ", alpha_1, "optimal_iteration_num: ", itr_1)
print(dataset_Y.shape)
print(dataset_X.shape)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(test_Y_1, binary_predictions, predictions)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accu, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# ------------------------------- 9 LR MODELS FOR BAGGING -----------------------
from sklearn.metrics import recall_score

all_indices = np.arange(dataset_X.shape[0])
training_size = (int)(0.80 * dataset_X.shape[0])		# 80% training set
np.random.seed(48)
training_indices = np.random.choice(all_indices, size=training_size, replace=False)
# Held-out rows are the positions never drawn for training. setdiff1d returns
# them in ascending order and avoids a linear array scan per row.
test_indices = np.setdiff1d(all_indices, training_indices).tolist()

test_labels = dataset_Y.iloc[test_indices].to_numpy()
W_list = []
validation_X_list =[]
validation_Y_list =[]
# NOTE(review): seeds[4] and seeds[6] are both 56, so the 5th and 7th learners
# draw the same bootstrap sample and initial weights; only their
# train/validation split differs. Left unchanged so results stay comparable.
seeds = [27,20,30,74,56,76,56,78,90,10,23,49,56]
splitting_seeds = [96, 98, 32, 54, 10, 15, 28, 95, 12, 39, 48, 68, 71]
n_learners = 9

Testset_X = dataset_X.iloc[test_indices]
Testset_Y = dataset_Y.iloc[test_indices]

for i in range(n_learners):

	# Bootstrap sample (with replacement) of the training rows
	np.random.seed(seeds[i])
	random_indices = np.random.choice(training_indices, size=(training_size), replace=True)

	# Training + Validation sets ----------------
	dataset_X_i = dataset_X.iloc[random_indices]
	dataset_Y_i = dataset_Y.iloc[random_indices]
	# Training + Validation sets ----------------

	train_X, train_Y, validation_X, validation_Y = dataset_splitting(dataset_X_i, dataset_Y_i, 0.2, splitting_seeds[i])
	validation_X_list.append(validation_X)
	validation_Y_list.append(validation_Y)
	W, alpha, itr = cross_validation(train_X, train_Y, validation_X, validation_Y, seeds[i])
	W_list.append(W)


# ----------- We have 9 different LR models (W's) -------------
Binary_Prediction_list =[]
Prediction_list =[]
accuracy_list = []
sensitivity_list=[]
specificity_list=[]
precision_list=[]
f1_score_list=[]
auroc_list=[]
aupr_list=[]

Testset_X_values = Testset_X.to_numpy()
Testset_Y_values = Testset_Y.to_numpy()

# Running count of positive votes per test record
final_predictions = np.zeros((Testset_Y.shape[0],1))
for i in range(n_learners):
	accuracy, binary_predictions, predictions = get_accuracy_results(Testset_X_values, Testset_Y_values, W_list[i])

	print(accuracy)

	Binary_Prediction_list.append(binary_predictions)
	Prediction_list.append(predictions)

	final_predictions=final_predictions+binary_predictions

	accuracy_list.append(accuracy)
	sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, binary_predictions, predictions)
	print(sensitivity, specificity, precision, f1_score, auroc, aupr)
	sensitivity_list.append(sensitivity)
	specificity_list.append(specificity)
	precision_list.append(precision)
	f1_score_list.append(f1_score)
	auroc_list.append(auroc)
	aupr_list.append(aupr)


# Majority vote: positive when more than half of the learners (5 of 9) say so
majority = n_learners // 2 + 1
final_predictions = np.where(final_predictions >= majority, 1.0, 0.0)


correct_predictions = np.sum(final_predictions == Testset_Y_values)
accuracy = correct_predictions / Testset_Y.shape[0]
sensitivity = recall_score(Testset_Y, final_predictions)
print("Final Accuracy:", accuracy)
print("final Sensitivity: ", sensitivity)
print(sensitivity_list)

# Ensemble score per record = mean of the learners' probabilities (used for AUROC/AUPR)
stacked_predictions_1 = np.hstack(Prediction_list)
mean_predictions = np.mean(stacked_predictions_1, axis=1)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, final_predictions, mean_predictions)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accuracy, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# Mean and standard deviation of every metric across the individual learners
accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean = get_mean_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std = get_stdev_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
print("Dataset-2 Mean Scores for Bagging:")
# Report f1_mean here: f1_score only holds the most recently computed single value.
print("Accuracy: ", accu_mean, "Sensitivity: ", sens_mean, "Specificity: ", spec_mean, "Precision: ", pre_mean, "F1_score: ", f1_mean, "AUROC: ", auroc_mean, "AUPR: ", aupr_mean)
print("Dataset-2 Standard Deviations of Metrics for Bagging:")
print("Accuracy: ", accu_std, "Sensitivity: ", sens_std, "Specificity: ", spec_std, "Precision: ", pre_std, "F1_score: ", f1_std, "AUROC: ", auroc_std, "AUPR: ", aupr_std)
main_list.append([accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean])
main_list.append([accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std])

In [None]:
# Violin plot of the per-learner metrics; saved as violin_plot_dataset_2.png
plot_violins(
	accuracy_list=accuracy_list,
	sensitivity_list=sensitivity_list,
	specificity_list=specificity_list,
	precision_list=precision_list,
	f1_score_list=f1_score_list,
	auroc_list=auroc_list,
	aupr_list=aupr_list,
	number=2,
)

In [None]:
# preparing the validation data for input in stacking
# Pool the validation sets of all learners and drop repeated records
df_X = pd.DataFrame(np.vstack(validation_X_list))
df_Y = pd.DataFrame(np.vstack(validation_Y_list),columns=["Y"])
merged_dataframe = pd.concat([df_X, df_Y], axis=1).drop_duplicates()
# "Y" is the last column: everything before it is a feature
final_X = merged_dataframe.iloc[:, :-1].to_numpy()
final_Y = merged_dataframe[["Y"]].to_numpy()
print("Final X shape:", final_X.shape)
print("Final Y shape:", final_Y.shape)

In [None]:
# Meta-features: the original features followed by one column of binary
# predictions from each of the 9 base learners.
stacking_parts = [pd.DataFrame(final_X)]

for i in range(0,9):
	accuracy, predictions,_ = get_accuracy_results(final_X, final_Y, W_list[i])
	df_temp = pd.DataFrame(predictions)
	stacking_parts.append(df_temp)

dataset_stacking_X = pd.concat(stacking_parts, axis=1)

dataset_stacking_Y = pd.DataFrame(final_Y)
print(dataset_stacking_X.shape)
print(dataset_stacking_Y.shape)

In [None]:
# Hold out 20% of the stacking data for testing, then 20% of the rest for validation
init_train_X_stacking_2, init_train_Y_stacking_2, test_X_stacking_2, test_Y_stacking_2 = dataset_splitting(
	dataset_stacking_X, dataset_stacking_Y, 0.2, 90
)
train_X_stacking_2, train_Y_stacking_2, validation_X_stacking_2, validation_Y_stacking_2 = dataset_splitting(
	pd.DataFrame(init_train_X_stacking_2), pd.DataFrame(init_train_Y_stacking_2), 0.2, 39
)

# Tune and fit the meta-learner, then score it on the held-out part
W_stacking_2, alpha_stacking_2, itr_stacking_2 = cross_validation(
	train_X_stacking_2, train_Y_stacking_2, validation_X_stacking_2, validation_Y_stacking_2, 48
)
accu_stacking_2, binary_predictions_stacking_2, predictions_stacking_2 = get_accuracy_results(
	test_X_stacking_2, test_Y_stacking_2, W_stacking_2
)
print("accuracy: ", accu_stacking_2, "optimal_alpha: ", alpha_stacking_2, "optimal_iteration_num: ", itr_stacking_2)

sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(
	test_Y_stacking_2, binary_predictions_stacking_2, predictions_stacking_2
)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accu_stacking_2, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# skiprows=1 drops the header line; header=None keeps pandas from then using
# the first data row as column names, which would silently lose one sample.
# iloc[:, 1:] removes the saved index column.
temp_dataset_X = pd.read_csv("Dataset_X_3.csv", skiprows=1, header=None).iloc[:, 1:]
dataset_Y = pd.read_csv("Dataset_Y_3.csv", skiprows=1, header=None).iloc[:, 1:]
dummy_ones_column = np.ones((temp_dataset_X.shape[0],1)) # this creates a n*1 matrix of 1's
dataset_X = np.hstack((dummy_ones_column, temp_dataset_X)) # (n, m+1), the column of ones acts as the bias input
dataset_X=pd.DataFrame(dataset_X)
# 80/20 train/test split, then 80/20 train/validation split of the training part
init_train_X_1, init_train_Y_1, test_X_1, test_Y_1  = dataset_splitting(dataset_X, dataset_Y, 0.2, 96)
train_X_1, train_Y_1, validation_X_1, validation_Y_1 = dataset_splitting(pd.DataFrame(init_train_X_1), pd.DataFrame(init_train_Y_1), 0.2, 85)
W_1, alpha_1, itr_1 = cross_validation(train_X_1, train_Y_1, validation_X_1, validation_Y_1, 48)
accu, binary_predictions, predictions= get_accuracy_results(test_X_1, test_Y_1, W_1)
print("accuracy: ", accu, "optimal_alpha: ", alpha_1, "optimal_iteration_num: ", itr_1)
print(dataset_Y.shape)
print(dataset_X.shape)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(test_Y_1, binary_predictions, predictions)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accu, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# ------------------------------- 9 LR MODELS FOR BAGGING -----------------------
# Trains n_models logistic-regression models on bootstrap resamples of an 80%
# training pool, evaluates each on the held-out 20%, and combines them by
# majority vote (binary output) and by mean score (continuous output).
from sklearn.metrics import recall_score

# Ensemble size and the number of positive votes needed for a positive
# ensemble decision (strict majority: 5 of 9).
n_models = 9
majority_votes = n_models // 2 + 1

all_indices = np.arange(dataset_X.shape[0])
training_size = (int)(0.80 * dataset_X.shape[0])		# 80% training set
np.random.seed(50)
training_indices = np.random.choice(all_indices, size=training_size, replace=False)

# Held-out rows are every row not drawn for training. np.setdiff1d returns them
# in ascending order, which is the order the previous per-element
# `i not in training_indices` scan produced, without the quadratic cost.
test_indices = np.setdiff1d(all_indices, training_indices).tolist()

test_labels = dataset_Y.iloc[test_indices].to_numpy()

W_list = []
validation_X_list =[]
validation_Y_list =[]
# Only the first n_models entries of each seed list are used.
seeds = [27,20,30,74,56,76,56,78,90,10,23,49,56]
splitting_seeds = [96, 98, 32, 54, 10, 15, 28, 95, 12, 39, 48, 68, 71]

Testset_X = dataset_X.iloc[test_indices]
Testset_Y = dataset_Y.iloc[test_indices]


print("TestSet_X shape:")
print(Testset_X.shape)


for i in range(n_models):

	# Bootstrap resample (with replacement) of the training pool for model i.
	np.random.seed(seeds[i])
	random_indices = np.random.choice(training_indices, size=(training_size), replace=True)

	# Training + Validation sets ----------------
	dataset_X_i = dataset_X.iloc[random_indices]
	dataset_Y_i = dataset_Y.iloc[random_indices]
	# Training + Validation sets ----------------

	train_X, train_Y, validation_X, validation_Y = dataset_splitting(dataset_X_i, dataset_Y_i, 0.2, splitting_seeds[i])
	# The validation splits are kept; a later cell assembles the stacking
	# dataset from them.
	validation_X_list.append(validation_X)
	validation_Y_list.append(validation_Y)
	W, alpha, itr = cross_validation(train_X, train_Y, validation_X, validation_Y, seeds[i])
	W_list.append(W)


# ----------- We have n_models different LR models (W's) -------------
Binary_Prediction_list =[]
Prediction_list =[]
accuracy_list = []
sensitivity_list=[]
specificity_list=[]
precision_list=[]
f1_score_list=[]
auroc_list=[]
aupr_list=[]

# Running count of positive votes per test row, one vote per model.
final_predictions = np.zeros((Testset_Y.shape[0],1))
for i in range(n_models):
	accuracy, binary_predictions, predictions = get_accuracy_results(Testset_X.to_numpy(), Testset_Y.to_numpy(), W_list[i])

	print(accuracy)

	Binary_Prediction_list.append(binary_predictions)
	Prediction_list.append(predictions)

	final_predictions = final_predictions + binary_predictions

	accuracy_list.append(accuracy)
	sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, binary_predictions, predictions)
	print(sensitivity, specificity, precision, f1_score, auroc, aupr)
	sensitivity_list.append(sensitivity)
	specificity_list.append(specificity)
	precision_list.append(precision)
	f1_score_list.append(f1_score)
	auroc_list.append(auroc)
	aupr_list.append(aupr)


# Majority vote: a row is labelled 1 when at least majority_votes models voted 1.
final_predictions = np.where(final_predictions >= majority_votes, 1.0, 0.0)


correct_predictions = np.sum(final_predictions == Testset_Y.to_numpy())
accuracy = correct_predictions / Testset_Y.shape[0]
sensitivity = recall_score(Testset_Y, final_predictions)
print("Final Accuracy:", accuracy)
print("final Sensitivity: ", sensitivity)
print(sensitivity_list)

# Ensemble score per row is the mean of the individual models' scores; it is
# passed to getParameters together with the voted labels.
stacked_predictions_1 = np.hstack(Prediction_list)
mean_predictions = np.mean(stacked_predictions_1, axis=1)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, final_predictions, mean_predictions)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accuracy, sensitivity, specificity, precision, f1_score, auroc, aupr])

In [None]:
# Summarise the per-model metric lists of the bagged LR models: one mean and
# one standard deviation per metric, via get_mean_values / get_stdev_values
# (defined earlier in the notebook).
accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean = get_mean_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std = get_stdev_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
print("Dataset-3 Mean Scores for Bagging:")
# The F1 entry reports f1_mean. The bare name f1_score still holds a single
# value left over from an earlier evaluation, not the mean over the models.
print("Accuracy: ", accu_mean, "Sensitivity: ", sens_mean, "Specificity: ", spec_mean, "Precision: ", pre_mean, "F1_score: ", f1_mean, "AUROC: ", auroc_mean, "AUPR: ", aupr_mean)
print("Dataset-3 Standard Deviations of Metrics for Bagging:")
print("Accuracy: ", accu_std, "Sensitivity: ", sens_std, "Specificity: ", spec_std, "Precision: ", pre_std, "F1_score: ", f1_std, "AUROC: ", auroc_std, "AUPR: ", aupr_std)
# Two summary rows: the means, then the standard deviations.
main_list.append([accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean])
main_list.append([accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std])




In [None]:
# Plot the per-model metric lists collected for the bagged LR models.
# NOTE(review): the trailing 3 is presumably the dataset/run number that
# plot_violins uses for labelling -- confirm against its definition.
plot_violins(
	accuracy_list,
	sensitivity_list,
	specificity_list,
	precision_list,
	f1_score_list,
	auroc_list,
	aupr_list,
	3,
)

In [None]:
# Assemble the stacking input from the validation splits kept for every bagged
# model. All splits are stacked row-wise, features and label are placed side by
# side, and exact duplicate rows (same features and same label) are removed.
df_X = pd.DataFrame(np.vstack(validation_X_list))
df_Y = pd.DataFrame(np.vstack(validation_Y_list), columns=["Y"])
merged_dataframe = pd.concat([df_X, df_Y], axis=1).drop_duplicates()

# "Y" is the last column of the merged frame: everything before it is the
# feature matrix, and the label is kept as an (n, 1) column.
final_X = merged_dataframe.iloc[:, :-1].to_numpy()
final_Y = merged_dataframe[["Y"]].to_numpy()

print("Final X shape:", final_X.shape)
print("Final Y shape:", final_Y.shape)

In [None]:
# Stacking features = the original feature columns followed by one extra column
# per bagged model. The frames are collected first and joined column-wise once.
stacking_frames = [pd.DataFrame(final_X)]

for i in range(0, 9):
	# `predictions` receives the SECOND value returned by get_accuracy_results,
	# which the bagging cell binds as `binary_predictions`.
	accuracy, predictions, _ = get_accuracy_results(final_X, final_Y, W_list[i])
	df_temp = pd.DataFrame(predictions)
	stacking_frames.append(df_temp)

dataset_stacking_X = pd.concat(stacking_frames, axis=1)
dataset_stacking_Y = pd.DataFrame(final_Y)

print(dataset_stacking_X.shape)
print(dataset_stacking_Y.shape)

In [None]:
# Train and evaluate the stacking model on the stacked dataset.
# Outer split holds out a test portion; the inner split carves a validation
# portion out of the remainder.
(
	init_train_X_stacking_2,
	init_train_Y_stacking_2,
	test_X_stacking_2,
	test_Y_stacking_2,
) = dataset_splitting(dataset_stacking_X, dataset_stacking_Y, 0.2, 90)
(
	train_X_stacking_2,
	train_Y_stacking_2,
	validation_X_stacking_2,
	validation_Y_stacking_2,
) = dataset_splitting(
	pd.DataFrame(init_train_X_stacking_2),
	pd.DataFrame(init_train_Y_stacking_2),
	0.2,
	39,
)

# Fit on train/validation, then score the returned weights on the test split.
W_stacking_2, alpha_stacking_2, itr_stacking_2 = cross_validation(
	train_X_stacking_2,
	train_Y_stacking_2,
	validation_X_stacking_2,
	validation_Y_stacking_2,
	48,
)
accu_stacking_3, binary_predictions_stacking_2, predictions_stacking_2 = get_accuracy_results(
	test_X_stacking_2, test_Y_stacking_2, W_stacking_2
)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(
	test_Y_stacking_2, binary_predictions_stacking_2, predictions_stacking_2
)
stacking_metrics = (sensitivity, specificity, precision, f1_score, auroc, aupr)

print("accuracy: ", accu_stacking_3, "optimal_alpha: ", alpha_stacking_2, "optimal_iteration_num: ", itr_stacking_2)
print(*stacking_metrics)

# One summary row: accuracy followed by the six metrics above.
main_list.append([accu_stacking_3, *stacking_metrics])

In [None]:
# ---------------- Bagging on an undersampled version of the dataset ----------------
# Keeps every positive row (label == 1) plus a random sample of the remaining
# rows, then repeats the bagging procedure (bootstrap models, majority vote,
# mean score) on that reduced pool.

# Positional row numbers are used throughout because the rows are selected
# with .iloc below.
is_positive = (dataset_Y.iloc[:, 0] == 1).to_numpy()
indices_where_one = np.flatnonzero(is_positive)

# Print how many positive rows there are
print(len(indices_where_one))
other_indices = np.flatnonzero(~is_positive)

# Ensemble size and the number of positive votes needed for a positive
# ensemble decision (strict majority: 5 of 9).
n_models = 9
majority_votes = n_models // 2 + 1

# Sample at most 20000 non-positive rows; the cap protects against datasets
# with fewer than 20000 of them.
n_negatives = min(20000, len(other_indices))
np.random.seed(50)
all_indices = np.random.choice(other_indices, size=n_negatives, replace=False)
all_indices = np.append(all_indices, indices_where_one)

# 80% of the actual pool (sampled negatives + all positives). This used to be
# 80% of the 20000 negatives only, which left the positives out of the count.
training_size = (int)(0.8 * len(all_indices))
training_indices = np.random.choice(all_indices, size=training_size, replace=False)

# Held-out rows: pool members not drawn for training, kept in pool order.
test_indices = all_indices[~np.isin(all_indices, training_indices)].tolist()

test_labels = dataset_Y.iloc[test_indices].to_numpy()

W_list = []
validation_X_list =[]
validation_Y_list =[]
# Only the first n_models entries of each seed list are used.
seeds = [27,20,30,74,56,76,56,78,90,10,23,49,56]
splitting_seeds = [96, 98, 32, 54, 10, 15, 28, 95, 12, 39, 48, 68, 71]

Testset_X = dataset_X.iloc[test_indices]
Testset_Y = dataset_Y.iloc[test_indices]


print("TestSet_X shape:")
print(Testset_X.shape)


for i in range(n_models):

	# Bootstrap resample (with replacement) of the training pool for model i.
	np.random.seed(seeds[i])
	random_indices = np.random.choice(training_indices, size=(training_size), replace=True)

	# Training + Validation sets ----------------
	dataset_X_i = dataset_X.iloc[random_indices]
	dataset_Y_i = dataset_Y.iloc[random_indices]
	# Training + Validation sets ----------------

	train_X, train_Y, validation_X, validation_Y = dataset_splitting(dataset_X_i, dataset_Y_i, 0.2, splitting_seeds[i])
	# The validation splits are kept; a later cell assembles the stacking
	# dataset from them.
	validation_X_list.append(validation_X)
	validation_Y_list.append(validation_Y)
	W, alpha, itr = cross_validation(train_X, train_Y, validation_X, validation_Y, seeds[i])
	W_list.append(W)


# ----------- We have n_models different LR models (W's) -------------
Binary_Prediction_list =[]
Prediction_list =[]
accuracy_list = []
sensitivity_list=[]
specificity_list=[]
precision_list=[]
f1_score_list=[]
auroc_list=[]
aupr_list=[]

# Running count of positive votes per test row, one vote per model.
final_predictions = np.zeros((Testset_Y.shape[0],1))
for i in range(n_models):
	accuracy, binary_predictions, predictions = get_accuracy_results(Testset_X.to_numpy(), Testset_Y.to_numpy(), W_list[i])

	print(accuracy)

	Binary_Prediction_list.append(binary_predictions)
	Prediction_list.append(predictions)

	final_predictions = final_predictions + binary_predictions

	accuracy_list.append(accuracy)
	sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, binary_predictions, predictions)
	print(sensitivity, specificity, precision, f1_score, auroc, aupr)
	sensitivity_list.append(sensitivity)
	specificity_list.append(specificity)
	precision_list.append(precision)
	f1_score_list.append(f1_score)
	auroc_list.append(auroc)
	aupr_list.append(aupr)


# Majority vote: a row is labelled 1 when at least majority_votes models voted 1.
final_predictions = np.where(final_predictions >= majority_votes, 1.0, 0.0)


correct_predictions = np.sum(final_predictions == Testset_Y.to_numpy())
accuracy = correct_predictions / Testset_Y.shape[0]
sensitivity = recall_score(Testset_Y, final_predictions)
print("Final Accuracy:", accuracy)
print("final Sensitivity: ", sensitivity)
print(sensitivity_list)

# Ensemble score per row is the mean of the individual models' scores; it is
# passed to getParameters together with the voted labels.
stacked_predictions_1 = np.hstack(Prediction_list)
mean_predictions = np.mean(stacked_predictions_1, axis=1)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(Testset_Y, final_predictions, mean_predictions)
print(sensitivity, specificity, precision, f1_score, auroc, aupr)
main_list.append([accuracy, sensitivity, specificity, precision, f1_score, auroc, aupr])




In [None]:
# Summarise the per-model metric lists of the bagged LR models: one mean and
# one standard deviation per metric, via get_mean_values / get_stdev_values
# (defined earlier in the notebook).
accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean = get_mean_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std = get_stdev_values(accuracy_list, sensitivity_list, specificity_list, precision_list, f1_score_list, auroc_list, aupr_list)
# NOTE(review): this heading text is identical to the one printed for the
# earlier bagging run, although plot_violins is called with 4 for this run --
# confirm whether the label should distinguish the two.
print("Dataset-3 Mean Scores for Bagging:")
# The F1 entry reports f1_mean. The bare name f1_score still holds a single
# value left over from an earlier evaluation, not the mean over the models.
print("Accuracy: ", accu_mean, "Sensitivity: ", sens_mean, "Specificity: ", spec_mean, "Precision: ", pre_mean, "F1_score: ", f1_mean, "AUROC: ", auroc_mean, "AUPR: ", aupr_mean)
print("Dataset-3 Standard Deviations of Metrics for Bagging:")
print("Accuracy: ", accu_std, "Sensitivity: ", sens_std, "Specificity: ", spec_std, "Precision: ", pre_std, "F1_score: ", f1_std, "AUROC: ", auroc_std, "AUPR: ", aupr_std)
# Two summary rows: the means, then the standard deviations.
main_list.append([accu_mean,sens_mean, spec_mean, pre_mean, f1_mean, auroc_mean, aupr_mean])
main_list.append([accu_std,sens_std, spec_std, pre_std, f1_std, auroc_std, aupr_std])

In [None]:
# Plot the per-model metric lists collected for the bagged LR models.
# NOTE(review): the trailing 4 is presumably the dataset/run number that
# plot_violins uses for labelling -- confirm against its definition.
plot_violins(
	accuracy_list,
	sensitivity_list,
	specificity_list,
	precision_list,
	f1_score_list,
	auroc_list,
	aupr_list,
	4,
)

In [None]:
# Assemble the stacking input from the validation splits kept for every bagged
# model. All splits are stacked row-wise, features and label are placed side by
# side, and exact duplicate rows (same features and same label) are removed.
df_X = pd.DataFrame(np.vstack(validation_X_list))
df_Y = pd.DataFrame(np.vstack(validation_Y_list), columns=["Y"])
merged_dataframe = pd.concat([df_X, df_Y], axis=1).drop_duplicates()

# "Y" is the last column of the merged frame: everything before it is the
# feature matrix, and the label is kept as an (n, 1) column.
final_X = merged_dataframe.iloc[:, :-1].to_numpy()
final_Y = merged_dataframe[["Y"]].to_numpy()

print("Final X shape:", final_X.shape)
print("Final Y shape:", final_Y.shape)

In [None]:
# Stacking features = the original feature columns followed by one extra column
# per bagged model. The frames are collected first and joined column-wise once.
stacking_frames = [pd.DataFrame(final_X)]

for i in range(0, 9):
	# `predictions` receives the SECOND value returned by get_accuracy_results,
	# which the bagging cell binds as `binary_predictions`.
	accuracy, predictions, _ = get_accuracy_results(final_X, final_Y, W_list[i])
	df_temp = pd.DataFrame(predictions)
	stacking_frames.append(df_temp)

dataset_stacking_X = pd.concat(stacking_frames, axis=1)
dataset_stacking_Y = pd.DataFrame(final_Y)

print(dataset_stacking_X.shape)
print(dataset_stacking_Y.shape)

In [None]:
# Train and evaluate the stacking model on the stacked dataset.
# Outer split holds out a test portion; the inner split carves a validation
# portion out of the remainder.
(
	init_train_X_stacking_2,
	init_train_Y_stacking_2,
	test_X_stacking_2,
	test_Y_stacking_2,
) = dataset_splitting(dataset_stacking_X, dataset_stacking_Y, 0.2, 90)
(
	train_X_stacking_2,
	train_Y_stacking_2,
	validation_X_stacking_2,
	validation_Y_stacking_2,
) = dataset_splitting(
	pd.DataFrame(init_train_X_stacking_2),
	pd.DataFrame(init_train_Y_stacking_2),
	0.2,
	39,
)

# Fit on train/validation, then score the returned weights on the test split.
W_stacking_2, alpha_stacking_2, itr_stacking_2 = cross_validation(
	train_X_stacking_2,
	train_Y_stacking_2,
	validation_X_stacking_2,
	validation_Y_stacking_2,
	48,
)
accu_stacking_3, binary_predictions_stacking_2, predictions_stacking_2 = get_accuracy_results(
	test_X_stacking_2, test_Y_stacking_2, W_stacking_2
)
sensitivity, specificity, precision, f1_score, auroc, aupr = getParameters(
	test_Y_stacking_2, binary_predictions_stacking_2, predictions_stacking_2
)
stacking_metrics = (sensitivity, specificity, precision, f1_score, auroc, aupr)

print("accuracy: ", accu_stacking_3, "optimal_alpha: ", alpha_stacking_2, "optimal_iteration_num: ", itr_stacking_2)
print(*stacking_metrics)

# One summary row: accuracy followed by the six metrics above.
main_list.append([accu_stacking_3, *stacking_metrics])

In [46]:
# Turn every metrics row accumulated in main_list into one table and write it
# to disk without the row index.
metric_columns = [
	'Accuracy',
	'Sensitivity',
	'Specificity',
	'Precision',
	'F1 Score',
	'AUROC',
	'AUPR',
]
df = pd.DataFrame(data=main_list, columns=metric_columns)
df.to_csv('metrics_summary.csv', index=False)