Ονοματεπώνυμο: Γαβριηλία-Μιχαηλία Πολυχρονίου
ΑΜ: 5084

In [None]:
import os
import csv
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.applications import VGG16
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.applications import ResNet50
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from tensorflow.keras.layers import Dense, Input, Conv2D, MaxPooling2D, BatchNormalization, Flatten, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

test_file = "C:/Users/userG/Desktop/Lab1/Lab1/TEST_images/TEST_images"
train_file = "C:/Users/userG/Desktop/Lab1/Lab1/train-images/train"
csv_file = "C:/Users/userG/Desktop/Lab1/Lab1/Test-IDs.csv"

#Load train-images
images = []
labels = []
categories = ["unpleasant","pleasant"]

for expression in categories:
    folder = os.path.join(train_file,expression) #path to each class folder
    filenames = os.listdir(folder) #load image filenames
    
    for filename in filenames:
        labels.append(expression) #store label as string
        image_path = os.path.join(folder,filename) #load image
        image = load_img(image_path,target_size = (85,85)) #load image with size 85x85
        images.append(img_to_array(image)) #convert image to array and append to list
        
x = np.array(images) / 255.0 #normalize pixels values to [0,1]
y = np.array(labels)
y = LabelEncoder().fit_transform(y) #transforms string labels to 0,1


In [2]:
#Validation for test
test_ids = []
filenames_csv = []

with open(csv_file, mode = "r", newline = '', encoding = "utf-8-sig") as file:
    reader = csv.DictReader(file)
    for row in reader:
        row = {key.strip(): value for key, value in row.items()}
        test_ids.append(row["ID"])
        filenames_csv.append(row["Filename"])

validation = []
for f in filenames_csv:
    img_path = os.path.join(test_file, f)
    if os.path.exists(img_path):
        img = load_img(img_path, target_size = (85,85))
        img_array = img_to_array(img)
        img_array = img_array
        validation.append(img_array)
        
validation = np.array(validation) / 255.0

#CNN
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
best_f1 = -1
best_model = None
best_architecture = None

for dense_layers in [1]:
    for dropout_rate in [0.2]:
        for batchSIZE in [25]:
            fold_f1_scores = []
            for fold, (train_index, test_index) in enumerate(kf.split(x)):
                x_train_fold, x_test_fold = x[train_index], x[test_index]
                y_train_fold, y_test_fold = y[train_index], y[test_index]

                #Define CNN model
                model = Sequential()
                model.add(Input(shape = (85, 85, 3))) 
              
                #First convolutional layer
                model.add(Conv2D(25, (3, 3), activation = "relu")) 
                model.add(BatchNormalization())
                model.add(MaxPooling2D(pool_size = (2, 2)))
                
                #Second convolutional layer
                model.add(Conv2D(50, (3, 3), activation = "relu")) 
                model.add(BatchNormalization())
                model.add(MaxPooling2D(pool_size = (2, 2)))
                
                #Third convolutional layer
                model.add(Conv2D(75, (3, 3), activation = "relu")) 
                model.add(BatchNormalization())
                model.add(MaxPooling2D(pool_size = (2, 2)))
                
                model.add(Flatten())
                
                #dense layers
                for i in range(dense_layers):
                    model.add(Dense(150, activation = "relu")) 
                    model.add(Dropout(dropout_rate))
                    
                model.add(Dense(1, activation = "sigmoid")) #last level
                
                model.compile(optimizer = Adam(), loss = "binary_crossentropy", metrics = ["accuracy"])
                model.fit(x_train_fold, y_train_fold, epochs = 10, batch_size = batchSIZE, verbose = 0)

                #Predict and evaluate using F1 score
                y_test_pred = (model.predict(x_test_fold) > 0.5).astype(int) 
                f1 = f1_score(y_test_fold, y_test_pred, average = "weighted")
                fold_f1_scores.append(f1)
    
            max_f1_in_fold = np.max(fold_f1_scores)
            print(dense_layers, dropout_rate, batchSIZE, max_f1_in_fold)
                
            if max_f1_in_fold > best_f1:
                best_f1 = max_f1_in_fold
                best_model = model
                best_architecture = {"dense_layers": dense_layers, "dropout_rate": dropout_rate, "batch_size": batchSIZE}
                
print(f"Best F1 score: {best_f1}, Best Architecture: {best_architecture}")

1 0.2 25 0.9231270041290501
Best F1 score: 0.9231270041290501, Best Architecture: {'dense_layers': 1, 'dropout_rate': 0.2, 'batch_size': 25}


In [4]:
y_pred = (best_model.predict(x) > 0.5).astype(int)

f1 = f1_score(y, y_pred, average = "weighted")
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)

print(f"F1-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Confusion Matrix: {conf_matrix}")

F1-score: 0.9822729077922686
Accuracy: 0.9822772898368883
Precision: 0.9847921595133491
Recall: 0.97719651240778
Confusion Matrix: [[3349   45]
 [  68 2914]]


In [5]:
#Predict labels for the test images and save to CNNpreds.csv file
y_pred_prob = best_model.predict(validation)

results_df = pd.DataFrame({"ID": test_ids, "LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("CNNpreds.csv", index = False)



In [6]:
#Pretrained CNN - ResNet-50
resnet = ResNet50(weights = "imagenet", include_top = False, input_shape = (85, 85, 3))
resnet.trainable = False
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

x = preprocess_input(x)  # normalize
feature_map = resnet.predict(x)
x = global_average_layer(feature_map).numpy()  
        
validation = preprocess_input(validation)
feature_map = resnet.predict(validation)
validation = global_average_layer(feature_map).numpy()



In [7]:
#K-Nearest Neighbors (K-NN)
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
best_f1 = -1
best_n = None
best_fold = None
best_metric = None
distance_metrics = ["euclidean","cosine"]

for k in [1, 3, 10]:
    for dist_metric in distance_metrics:
        fold_f1_scores = []
        for fold, (train_index, test_index) in enumerate(kf.split(x)):
            x_train_fold, x_test_fold = x[train_index], x[test_index]
            y_train_fold, y_test_fold = y[train_index], y[test_index]
            
            knn = KNeighborsClassifier(n_neighbors = k, metric = dist_metric)
            y_test_pred = knn.fit(x_train_fold, y_train_fold).predict(x_test_fold)
            
            f1 = f1_score(y_test_fold, y_test_pred, average = "weighted")
            fold_f1_scores.append(f1)
            
        max_f1_in_fold = np.max(fold_f1_scores)
        print(k, dist_metric, max_f1_in_fold)
        
        if max_f1_in_fold > best_f1:
            best_n = k
            best_metric = dist_metric
            best_f1 = max_f1_in_fold
            
print(f"Best F1 score: {best_f1}, Best n: {best_n}, Best distance metric: {best_metric}")

1 euclidean 0.7820101789492034
1 cosine 0.7727186393227174
3 euclidean 0.7989194167051563
3 cosine 0.7910327971599793
10 euclidean 0.7879541644573221
10 cosine 0.7787033817801099
Best F1 score: 0.7989194167051563, Best n: 3, Best distance metric: euclidean


In [8]:
knn = KNeighborsClassifier(n_neighbors = best_n, metric = best_metric)
y_pred = knn.fit(x, y).predict(x)

f1 = f1_score(y, y_pred, average = "weighted")

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)

print(f"F1-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Confusion Matrix: {conf_matrix}")

F1-score: 0.867216218366757
Accuracy: 0.8671580928481807
Precision: 0.8191330343796711
Recall: 0.9188464118041583
Confusion Matrix: [[2789  605]
 [ 242 2740]]


In [9]:
y_pred_prob = knn.predict(validation)
results_df = pd.DataFrame({"ID": test_ids, "LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("KNNpreds.csv", index = False)

In [7]:
# SVM
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
best_f1 = -1
best_C = None
best_kernel = None
kernel_function = ["linear","rbf"]

for kernel_func in kernel_function:
    for regularization_param in [0.1, 1, 10, 100]:
        fold_f1_scores = []
        for fold, (train_index, test_index) in enumerate(kf.split(x)):
            x_train_fold, x_test_fold = x[train_index], x[test_index]
            y_train_fold, y_test_fold = y[train_index], y[test_index]
            
            svm = SVC(kernel = kernel_func, C = regularization_param, probability = True) 
            y_test_pred = svm.fit(x_train_fold, y_train_fold).predict(x_test_fold)
            
            f1 = f1_score(y_test_fold, y_test_pred, average = "weighted")
            fold_f1_scores.append(f1)
            
        max_f1_in_fold = np.max(fold_f1_scores)
        print(kernel_func, regularization_param, max_f1_in_fold)     
            
        if max_f1_in_fold > best_f1:
            best_C = regularization_param
            best_kernel = kernel_func
            best_f1 = max_f1_in_fold
print(f"Best F1 score: {best_f1}, Best kernel: {best_kernel}, Best C: {best_C}")

linear 0.1 0.8237070060099884
linear 1 0.8366984792260687
linear 10 0.8652087529566013
linear 100 0.8697152148464781
rbf 0.1 0.40256926172419133
rbf 1 0.7315740122099141
rbf 10 0.7900149750621898
rbf 100 0.8206704417034804
Best F1 score: 0.8697152148464781, Best kernel: linear, Best C: 100


In [8]:
svm = SVC(kernel = best_kernel, C = best_C, probability = True)
y_pred = svm.fit(x, y).predict(x)

recall = recall_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)
f1 = f1_score(y, y_pred, average = "weighted")

print(f"F1-score: {f1}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Confusion Matrix:\\n{conf_matrix}")

F1-score: 0.8743354267772216
Recall: 0.8896713615023474
Accuracy: 0.874215809284818
Precision: 0.8486884197056942
Confusion Matrix:\n[[2921  473]
 [ 329 2653]]


In [10]:
y_pred_prob = svm.predict(validation)
results_df = pd.DataFrame({"ID": test_ids,"LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("SVMspreds.csv", index = False)

In [11]:
#Shallow and Deep Neural Networks
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
best_f1_shallow = -1
best_params_shallow = {}
activation_functions = ["relu", "sigmoid", "tanh"]
optimization_algorithms = [Adam, SGD]

#Shallow Neural Network
for neurons in [512, 256, 128]:
    for activation_func in activation_functions:
        for optimizer_alg in optimization_algorithms:
            fold_f1_scores_shallow = []
            for fold, (train_index, test_index) in enumerate(kf.split(x)):
                x_train_fold, x_test_fold = x[train_index], x[test_index]
                y_train_fold, y_test_fold = y[train_index], y[test_index]
    
                shallow = Sequential()
                shallow.add(Input(shape=(x_train_fold.shape[1],)))
                shallow.add(Dense(neurons, activation = activation_func))  # 1 hidden layer
                shallow.add(Dense(1, activation = "sigmoid"))  # output layer
                optimizer_instance = optimizer_alg()  
                shallow.compile(loss = "binary_crossentropy", optimizer = optimizer_instance, metrics = ["accuracy"])
                shallow.fit(x_train_fold, y_train_fold, epochs = 10, verbose = 0)   
                                                                             
                y_pred_fold = (shallow.predict(x_test_fold) > 0.5).astype(int)  
                                                                                  
                f1 = f1_score(y_test_fold, y_pred_fold)
                fold_f1_scores_shallow.append(f1)
            
            max_f1_shallow = np.max(fold_f1_scores_shallow)
            print(neurons, activation_func, optimizer_alg, max_f1_shallow)
            
            if max_f1_shallow > best_f1_shallow:
                best_f1_shallow = max_f1_shallow
                best_params_shallow = {"neurons": neurons, "activation": activation_func, "optimizer": optimizer_alg.__name__}
                
print(f"Best Shallow Neural Network F1 Score: {best_f1_shallow:.4f}")
print(f"Best Shallow Neural Network Parameters: {best_params_shallow}")

512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8304093567251462
512 relu <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.6973848069738481
512 sigmoid <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.7777777777777778
512 sigmoid <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.6700636942675159
512 tanh <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8227848101265823
512 tanh <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.7119113573407202
256 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.805111821086262
256 relu <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.6884480746791132
256 sigmoid <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.7708029197080292
256 sigmoid <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.6524822695035462
256 tanh <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.7876230661040787
256 tanh <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.7121

In [12]:
best_shallow = Sequential()
best_shallow.add(Input(shape=(x.shape[1],)))
best_shallow.add(Dense(best_params_shallow["neurons"], activation = best_params_shallow["activation"]))
best_shallow.add(Dense(1, activation = "sigmoid"))  # Output 
   
if best_params_shallow["optimizer"] == "Adam":
    best_optimizer_shallow = Adam()
elif best_params_shallow["optimizer"] == "SGD":
    best_optimizer_shallow = SGD()
    
best_shallow.compile(loss = "binary_crossentropy", optimizer = best_optimizer_shallow, metrics = ["accuracy"])
best_shallow.fit(x, y, epochs = 10, verbose = 0)
y_pred_shallow = (best_shallow.predict(x) > 0.5).astype(int)

f1_shallow = f1_score(y, y_pred_shallow)
cm_shallow = confusion_matrix(y, y_pred_shallow)
recall_shallow = recall_score(y, y_pred_shallow)
accuracy_shallow = accuracy_score(y, y_pred_shallow)
precision_shallow = precision_score(y, y_pred_shallow)
fpr_shallow, tpr_shallow, _ = roc_curve(y, best_shallow.predict(x))
roc_auc_shallow = auc(fpr_shallow, tpr_shallow)
    
print(f"Shallow Neural Network F1 Score: {f1_shallow:.4f}")
print(f"Shallow Neural Network Recall: {recall_shallow:.4f}")
print(f"Shallow Neural Network ROC AUC: {roc_auc_shallow:.4f}")
print(f"Shallow Neural Network Accuracy: {accuracy_shallow:.4f}")
print(f"Shallow Neural Network Precision: {precision_shallow:.4f}")
    
print("Confusion Matrix:")
print(cm_shallow)

Shallow Neural Network F1 Score: 0.7782
Shallow Neural Network Recall: 0.8571
Shallow Neural Network ROC AUC: 0.8554
Shallow Neural Network Accuracy: 0.7715
Shallow Neural Network Precision: 0.7126
Confusion Matrix:
[[2363 1031]
 [ 426 2556]]


In [13]:
#Deep Neural Network
best_f1_deep = -1
best_params_deep = {}

for num_layers in [2, 3, 4]:
    for neurons in [512, 256, 128, 64]:
        for activation_func in activation_functions:
            for optimizer_alg in optimization_algorithms:
                fold_f1_scores_deep = []
                for fold, (train_index, test_index) in enumerate(kf.split(x)):
                    x_train_fold, x_test_fold = x[train_index], x[test_index]
                    y_train_fold, y_test_fold = y[train_index], y[test_index]
   
                    deep = Sequential()
                    deep.add(Input(shape=(x_train_fold.shape[1],)))
    
                    for i in range(num_layers):
                        deep.add(Dense(neurons, activation = activation_func))
                        deep.add(Dense(1, activation = "sigmoid"))  # output layer
   
                        optimizer_instance = optimizer_alg()  
                        deep.compile(loss = "binary_crossentropy", optimizer = optimizer_instance, metrics = ["accuracy"])
                        deep.fit(x_train_fold, y_train_fold, epochs = 10, verbose = 0)  
    
                        y_pred_fold = (deep.predict(x_test_fold) > 0.5).astype(int)  
                                                                                    
                        f1 = f1_score(y_test_fold, y_pred_fold)
                        fold_f1_scores_deep.append(f1)
    
                    max_f1_deep = np.max(fold_f1_scores_deep)
                    print(num_layers, neurons, activation_func, optimizer_alg, max_f1_deep)     
  
                    if max_f1_deep > best_f1_deep:
                        best_f1_deep = max_f1_deep
                        best_params_deep = {"num_layers": num_layers, "neurons": neurons, "activation": activation_func, "optimizer": optimizer_alg.__name__}
                        
print(f"Best Deep Neural Network F1 Score: {best_f1_deep:.4f}")
print(f"Best Deep Neural Network Parameters: {best_params_deep}")

2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.7953890489913544
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.adam.Adam'> 0.8050314465408805
2 512 relu <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.06535947712418301
2 512 relu <class 'keras.optimizers.optimizer_v2.gradient_descent.SGD'> 0.6772009029345373
2 512 relu <class

In [14]:
best_deep = Sequential()
best_deep.add(Input(shape=(x.shape[1],)))
    
for i in range(best_params_deep["num_layers"]):
    best_deep.add(Dense(best_params_deep["neurons"], activation = best_params_deep["activation"]))
best_deep.add(Dense(1, activation = "sigmoid"))  # Output
    
if best_params_deep["optimizer"] == "Adam":
    best_optimizer_deep = Adam()
elif best_params_deep["optimizer"] == "SGD":
    best_optimizer_deep = SGD()
    
best_deep.compile(loss = "binary_crossentropy", optimizer = best_optimizer_deep, metrics = ["accuracy"])
best_deep.fit(x, y, epochs = 10, verbose = 0)
    
y_pred_deep = (best_deep.predict(x) > 0.5).astype(int)
    
f1_deep = f1_score(y, y_pred_deep)

accuracy_deep = accuracy_score(y, y_pred_deep)
precision_deep = precision_score(y, y_pred_deep)
recall_deep = recall_score(y, y_pred_deep)
cm_deep = confusion_matrix(y, y_pred_deep)
    
print(f"Deep Neural Network F1 Score: {f1_deep:.4f}")
print(f"Deep Neural Network Accuracy: {accuracy_deep:.4f}")
print(f"Deep Neural Network Precision: {precision_deep:.4f}")
print(f"Deep Neural Network Recall: {recall_deep:.4f}")
print("Confusion Matrix:")
print(cm_deep)

Deep Neural Network F1 Score: 0.7910
Deep Neural Network Accuracy: 0.7785
Deep Neural Network Precision: 0.7080
Deep Neural Network Recall: 0.8960
Confusion Matrix:
[[2292 1102]
 [ 310 2672]]


In [17]:
y_pred_prob = best_deep.predict(validation)  

results_df = pd.DataFrame({"ID": test_ids,"LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("DEEPNNpreds.csv", index = False)




In [18]:
y_pred_prob = best_shallow.predict(validation)  

results_df = pd.DataFrame({"ID": test_ids,"LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("SHALLOWNNpreds.csv", index = False)



In [19]:
#Ensemble Learning methodologies
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)

#Random Forest
best_f1_rf = -1
best_n_estimators_rf = None
best_max_depth_rf = None
maximum_depth = [5,10]

for decision_trees in [10, 50, 100, 200]:
    for max_depth in maximum_depth:
        fold_f1_scores_rf = []
        for fold, (train_index, test_index) in enumerate(kf.split(x)):
            x_train_fold, x_test_fold = x[train_index], x[test_index]
            y_train_fold, y_test_fold = y[train_index], y[test_index]
    
            rf = RandomForestClassifier(n_estimators = decision_trees, max_depth = max_depth)
            y_test_pred = rf.fit(x_train_fold, y_train_fold).predict(x_test_fold) 
                                                                                     
    
            f1 = f1_score(y_test_fold, y_test_pred, average = "weighted")
            fold_f1_scores_rf.append(f1)
    
        max_f1_in_fold_rf = np.max(fold_f1_scores_rf)
        print(decision_trees, max_depth, max_f1_in_fold_rf) 
   
        if max_f1_in_fold_rf > best_f1_rf:
            best_f1_rf = max_f1_in_fold_rf
            best_n_estimators_rf = decision_trees
            best_max_depth_rf = max_depth
            
print(f"Best Random Forest F1 score: {best_f1_rf}, 
Best n_estimators: {best_n_estimators_rf}, Best max_depth: {best_max_depth_rf}")
    

10 5 0.7612261802129443
10 10 0.7790799525596733
50 5 0.7540681759083872
50 10 0.800720319750018
100 5 0.7644399374656694
100 10 0.8025121458346214
200 5 0.764277320420603
200 10 0.8162504725334508
Best Random Forest F1 score: 0.8162504725334508, Best n_estimators: 200, Best max_depth: 10


In [21]:
rf = RandomForestClassifier(n_estimators = best_n_estimators_rf, max_depth = best_max_depth_rf)
y_pred_rf = rf.fit(x, y).predict(x)

f1 = f1_score(y, y_pred_rf, average = "weighted")

accuracy = accuracy_score(y, y_pred_rf)
precision = precision_score(y, y_pred_rf)
recall = recall_score(y, y_pred_rf)
conf_matrix = confusion_matrix(y, y_pred_rf)
   
print("Random Forest")
print(f"F1-score: {f1}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Confusion Matrix:{conf_matrix}")

Random Forest
F1-score: 0.9427858418444395
Recall: 0.9969818913480886
Accuracy: 0.9427540777917189
Precision: 0.893060979273055
Confusion Matrix:[[3038  356]
 [   9 2973]]


In [23]:
#Bagging Classifier
best_f1_bagging = -1
best_n_estimators_bagging = None

for estimators in [10, 20]:
    fold_f1_scores_bagging = []
    for fold, (train_index, test_index) in enumerate(kf.split(x)):
        x_train_fold, x_test_fold = x[train_index], x[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
   
        svm = SVC(kernel = "linear", probability = True)
        bagging = BaggingClassifier(estimator = svm, n_estimators = estimators)
        y_test_pred = bagging.fit(x_train_fold, y_train_fold).predict(x_test_fold) 
   
        f1 = f1_score(y_test_fold, y_test_pred, average = "weighted")
        fold_f1_scores_bagging.append(f1)
   
    max_f1_in_fold_bagging = np.max(fold_f1_scores_bagging)
    print(estimators, max_f1_in_fold_bagging)
    
    if max_f1_in_fold_bagging > best_f1_bagging:
        best_f1_bagging = max_f1_in_fold_bagging
        best_n_estimators_bagging = estimators
        
print(f"Best Bagging F1 score: {best_f1_bagging}, Best n_estimators: {best_n_estimators_bagging}")

10 0.8435896493524485
20 0.8420254351473049
Best Bagging F1 score: 0.8435896493524485, Best n_estimators: 10


In [24]:
svm_base = SVC(kernel = "linear", probability = True)
bagging = BaggingClassifier(estimator = svm_base, n_estimators = best_n_estimators_bagging)
y_pred_bagging = bagging.fit(x, y).predict(x)

f1 = f1_score(y, y_pred_bagging, average = "weighted")
accuracy = accuracy_score(y, y_pred_bagging)
precision = precision_score(y, y_pred_bagging)
recall = recall_score(y, y_pred_bagging)
conf_matrix = confusion_matrix(y, y_pred_bagging)

print("Bagging")
print(f"F1-score: {f1}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Confusion Matrix:{conf_matrix}")

Bagging
F1-score: 0.8372094154188516
Recall: 0.8537894030851777
Accuracy: 0.8370451693851945
Precision: 0.8085106382978723
Confusion Matrix:[[2791  603]
 [ 436 2546]]


In [25]:
#AdaBoost
best_f1_adaboost = -1
best_n_estimators_adaboost = None

for estimatorsr in [100, 150, 200]:
    fold_f1_scores_adaboost = []
    for fold, (train_index, test_index) in enumerate(kf.split(x)):
        x_train_fold, x_test_fold = x[train_index], x[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
    
        dt = DecisionTreeClassifier(max_depth = 1)
        adaboost = AdaBoostClassifier(estimator = dt, n_estimators = estimators)
        y_test_pred = adaboost.fit(x_train_fold, y_train_fold).predict(x_test_fold) 
    
        f1 = f1_score(y_test_fold, y_test_pred, average = "weighted")
        fold_f1_scores_adaboost.append(f1)
    
    max_f1_in_fold_adaboost = np.max(fold_f1_scores_adaboost)
    print(estimators, max_f1_in_fold_adaboost)
    
    if max_f1_in_fold_adaboost > best_f1_adaboost:
        best_f1_adaboost = max_f1_in_fold_adaboost
        best_n_estimators_adaboost = estimators
        
print(f"Best AdaBoost F1 score: {best_f1_adaboost}, Best n_estimators: {best_n_estimators_adaboost}")

100 0.7963183891616324
150 0.8147749339887381
200 0.8272772376429572
Best AdaBoost F1 score: 0.8272772376429572, Best n_estimators: 200


In [26]:
dt_base = DecisionTreeClassifier(max_depth = 1)
adaboost = AdaBoostClassifier(estimator = dt_base, n_estimators = best_n_estimators_adaboost)

y_pred_adaboost = adaboost.fit(x, y).predict(x)

recall = recall_score(y, y_pred_adaboost)
accuracy = accuracy_score(y, y_pred_adaboost)
precision = precision_score(y, y_pred_adaboost)
conf_matrix = confusion_matrix(y, y_pred_adaboost)
f1 = f1_score(y, y_pred_adaboost, average = "weighted")

print("AdaBoost")
print(f"F1-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Confusion Matrix:{conf_matrix}")

AdaBoost
F1-score: 0.8381281535107177
Accuracy: 0.8379861982434128
Precision: 0.8138486312399356
Recall: 0.8474178403755869
Confusion Matrix:[[2816  578]
 [ 455 2527]]


In [30]:
y_pred_prob = rf.predict(validation) 

results_df = pd.DataFrame({"ID": test_ids,"LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("RFpreds.csv", index = False)


In [31]:
y_pred_prob = bagging.predict(validation)  

results_df = pd.DataFrame({"ID": test_ids,"LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("BAGGINGpreds.csv", index = False)

In [32]:
y_pred_prob = adaboost.predict(validation) 

results_df = pd.DataFrame({"ID": test_ids,"LABEL": y_pred_prob.flatten()})
results_df["LABEL"] = results_df["LABEL"].apply(lambda x: 1 if x <= 0.5 else 0)
results_df.to_csv("ADABOOSTpreds.csv", index = False)