In [None]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk
# import xlsxwriter
import openpyxl
import os.path
from os.path import exists


In [None]:
# Load the training table; the path is relative to the notebook's working directory.
df_data = pd.read_csv("../data/train.csv")
# Preview the first rows to check the columns were parsed as expected.
df_data.head()

In [None]:
# Summary statistics of the loaded table.
df_data.describe()

In [None]:
def scale10(n):
    """Map a score onto ten ordinal buckets labelled 1..10.

    Bucket upper bounds are inclusive and step by 10: ``n <= 10`` -> 1,
    ``10 < n <= 20`` -> 2, ..., ``80 < n <= 90`` -> 9.  Every value that is
    not ``<= 90`` (including NaN, for which all comparisons are false)
    lands in bucket 10; every value ``<= 10`` (including negatives) in 1.
    """
    # Walk the inclusive upper bounds 10, 20, ..., 90 and stop at the first fit.
    for bucket, upper in enumerate(range(10, 100, 10), start=1):
        if n <= upper:
            return bucket
    return 10

In [None]:
def scale5(n):
    """Map a score onto five ordinal buckets labelled 1..5.

    Inclusive upper bounds are 20, 40, 60 and 80; anything that is not
    ``<= 80`` (including NaN) is assigned bucket 5.
    """
    upper_bounds = (20, 40, 60, 80)
    # First bucket whose inclusive upper bound admits n; 5 when none does.
    return next(
        (bucket for bucket, upper in enumerate(upper_bounds, start=1) if n <= upper),
        5,
    )

In [None]:
def scale4(n):
    """Map a score onto four ordinal buckets labelled 1..4.

    Inclusive upper bounds are 25, 50 and 75; anything that is not
    ``<= 75`` (including NaN) is assigned bucket 4.
    """
    if n <= 25:
        return 1
    if n <= 50:
        return 2
    if n <= 75:
        return 3
    return 4

In [None]:
def scale3(n):
    """Map a score onto three ordinal buckets labelled 1..3.

    Inclusive upper bounds are 34 and 68; anything that is not ``<= 68``
    (including NaN) is assigned bucket 3.
    """
    for bucket, upper in ((1, 34), (2, 68)):
        if n <= upper:
            return bucket
    return 3

In [None]:
def scale2(n):
    """Binary split of a score: 1 when ``n <= 50``, otherwise 2 (NaN -> 2)."""
    return 1 if n <= 50 else 2

In [None]:
# 10-bucket label derived from Pawpularity; map over the single column instead of a row-wise apply.
df_data["Scale10"] = df_data["Pawpularity"].map(scale10)

In [None]:
# 5-bucket label derived from Pawpularity; map over the single column instead of a row-wise apply.
df_data["Scale5"] = df_data["Pawpularity"].map(scale5)

In [None]:
# 4-bucket label derived from Pawpularity; map over the single column instead of a row-wise apply.
df_data["Scale4"] = df_data["Pawpularity"].map(scale4)

In [None]:
# 3-bucket label derived from Pawpularity; map over the single column instead of a row-wise apply.
df_data["Scale3"] = df_data["Pawpularity"].map(scale3)

In [None]:
# 2-bucket label derived from Pawpularity; map over the single column instead of a row-wise apply.
df_data["Scale2"] = df_data["Pawpularity"].map(scale2)

In [None]:
# Preview the table again, now including the derived Scale* columns.
df_data.head()

### Splitting Data

In [None]:
# Keep a handle on the full column index and display it for reference.
columnsL = df_data.columns
columnsL

In [None]:
# Get X and Y data - shuffle data.
# Feature columns fed to every model below.
X_cols = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
          'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']
X = np.array(df_data[X_cols])
Y = df_data['Pawpularity'].to_numpy()

id_image = df_data['Id'].to_numpy()

# Bucketed versions of the target, one array per scale.
Y10 = df_data['Scale10'].to_numpy()
Y5 = df_data['Scale5'].to_numpy()
Y4 = df_data['Scale4'].to_numpy()
Y3 = df_data['Scale3'].to_numpy()
Y2 = df_data['Scale2'].to_numpy()

# Seed the permutation so that Restart & Run All reproduces the same
# train/dev/test split (and therefore comparable scores) on every run.
SHUFFLE_SEED = 0
shuffle = np.random.default_rng(SHUFFLE_SEED).permutation(X.shape[0])

# Apply the same row order to features, ids and every label array so they stay aligned.
X, Y, id_image = X[shuffle], Y[shuffle], id_image[shuffle]
Y10, Y5, Y4, Y3, Y2 = Y10[shuffle], Y5[shuffle], Y4[shuffle], Y3[shuffle], Y2[shuffle]

In [None]:
# Fractions of the shuffled rows used for train and dev (0.5, 0.2);
# whatever remains (0.3) becomes the test split.
per_train, per_dev = 0.5, 0.2

num_images = len(Y)
# Row counts, rounded to the nearest whole image.
train_size = int(round(num_images * per_train))
dev_size = int(round(num_images * per_dev))

In [None]:
# Split data based on defined sizes.
# Row layout after shuffling: [ train | dev | test ].
_dev_end = train_size + dev_size
_train_part = slice(None, train_size)
_dev_part = slice(train_size, _dev_end)
_test_part = slice(_dev_end, None)
_scaled_targets = (Y10, Y5, Y4, Y3, Y2)

test_data, test_labels, id_test = X[_test_part], Y[_test_part], id_image[_test_part]
test_y10, test_y5, test_y4, test_y3, test_y2 = (y[_test_part] for y in _scaled_targets)

dev_data, dev_labels, id_dev = X[_dev_part], Y[_dev_part], id_image[_dev_part]
dev_y10, dev_y5, dev_y4, dev_y3, dev_y2 = (y[_dev_part] for y in _scaled_targets)

train_data, train_labels, id_train = X[_train_part], Y[_train_part], id_image[_train_part]
train_y10, train_y5, train_y4, train_y3, train_y2 = (y[_train_part] for y in _scaled_targets)

# Sanity check: shapes of every split, then of every bucketed label array.
print(num_images)
print(train_data.shape, train_labels.shape, id_train.shape)
print(dev_data.shape, dev_labels.shape, id_dev.shape)
print(test_data.shape, test_labels.shape, id_test.shape)
for _test_y, _dev_y, _train_y in (
    (test_y10, dev_y10, train_y10),
    (test_y5, dev_y5, train_y5),
    (test_y4, dev_y4, train_y4),
    (test_y3, dev_y3, train_y3),
    (test_y2, dev_y2, train_y2),
):
    print(_test_y.shape, _dev_y.shape, _train_y.shape)

In [None]:
def wrt_excel(file, sheet_name, df):
    """Write `df` to the sheet `sheet_name` of the Excel workbook `file`.

    Args:
        file: Path of the .xlsx workbook.
        sheet_name: Name of the sheet to write.
        df: DataFrame to export.

    If the workbook already exists the sheet is added to it; a sheet with
    the same name is replaced, so re-running the notebook refreshes the
    results instead of raising on the pre-existing sheet.  Otherwise a new
    workbook is created.
    """
    if os.path.exists(file):
        # `if_sheet_exists` is only accepted by pandas in append mode.
        writer_kwargs = {"mode": "a", "if_sheet_exists": "replace"}
    else:
        writer_kwargs = {"mode": "w"}

    with pd.ExcelWriter(file, engine="openpyxl", **writer_kwargs) as writer:
        df.to_excel(writer, sheet_name=sheet_name)

In [None]:
# Quick single-configuration KNN check on the raw Pawpularity labels.
f1_score = []
rmse = []
acc = []
hamm = []

knn_mod = KNeighborsClassifier(n_neighbors=2, algorithm="auto", weights="uniform", p=1)
knn_mod.fit(train_data, train_labels)

# Predict once and reuse the result for every metric.
dev_pred = knn_mod.predict(dev_data)
acc.append(metrics.accuracy_score(dev_labels, dev_pred))
f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
# sqrt(MSE) instead of mean_squared_error(squared=False): the `squared`
# keyword is no longer accepted by recent scikit-learn releases.
rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
hamm.append(metrics.hamming_loss(dev_labels, dev_pred))

### KNN Classifier

In [None]:
def knn_model(train_data, train_labels, dev_data, dev_labels, algorithm, weigth, klist):
    """Fit one KNeighborsClassifier (p=1) per k in `klist` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        algorithm: Value passed to KNeighborsClassifier(algorithm=...).
        weigth: Value passed to KNeighborsClassifier(weights=...).
        klist: Iterable of neighbour counts to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per k, in
        `klist` order: weighted F1, RMSE of the predicted labels treated as
        numbers, accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for k in klist:
        knn_mod = KNeighborsClassifier(n_neighbors=k, algorithm=algorithm, weights=weigth, p=1)
        knn_mod.fit(train_data, train_labels)
        # Predict once; KNN prediction is the expensive step and every metric shares it.
        dev_pred = knn_mod.predict(dev_data)
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm
    
def knn_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep KNN over k = 1..15 for each neighbour-search algorithm.

    Uses distance weighting for every run.  Returns (and prints) a
    DataFrame with a "K" column plus ``<algorithm>_f1/_rmse/_acc/_hamm``
    columns for each algorithm.
    """
    neighbour_counts = list(range(1, 16))
    metric_suffixes = ("_f1", "_rmse", "_acc", "_hamm")

    df_knn = pd.DataFrame({"K": neighbour_counts})
    for algorithm in ("auto", "ball_tree", "kd_tree", "brute"):
        scores = knn_model(train_data, Y_train, dev_data, Y_dev,
                           algorithm, "distance", neighbour_counts)
        # knn_model returns its lists in the same order as metric_suffixes.
        for suffix, values in zip(metric_suffixes, scores):
            df_knn[algorithm + suffix] = values

    print(df_knn)
    print("df_knn")
    return df_knn


### NB Classifier

In [None]:
def NB_model(train_data, train_labels, dev_data, dev_labels, alpha_list):
    """Fit one BernoulliNB per smoothing value in `alpha_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        alpha_list: Iterable of `alpha` smoothing values to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per alpha:
        weighted F1, RMSE of the predicted labels treated as numbers,
        accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for alpha in alpha_list:
        NB_mod = BernoulliNB(alpha=alpha)
        NB_mod.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = NB_mod.predict(dev_data)
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def NB_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep Bernoulli Naive Bayes over a fixed grid of alpha values.

    Returns (and prints) a DataFrame with columns "Alpha", "F1_score",
    "RMSE", "ACC" and "HAMM", one row per alpha.
    """
    alphas = [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]

    df_NB = pd.DataFrame({"Alpha": alphas})
    scores = NB_model(train_data, Y_train, dev_data, Y_dev, alphas)
    # NB_model returns its lists in the order f1, rmse, accuracy, hamming.
    for column, values in zip(("F1_score", "RMSE", "ACC", "HAMM"), scores):
        df_NB[column] = values

    print(df_NB)
    print("df_NB")
    return df_NB


### Multinomial NB

In [None]:
def MNB_model(train_data, train_labels, dev_data, dev_labels, alpha_list):
    """Fit one MultinomialNB per smoothing value in `alpha_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        alpha_list: Iterable of `alpha` smoothing values to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per alpha:
        weighted F1, RMSE of the predicted labels treated as numbers,
        accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for alpha in alpha_list:
        MNB_mod = MultinomialNB(alpha=alpha)
        MNB_mod.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = MNB_mod.predict(dev_data)
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def MNB_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep Multinomial Naive Bayes over a fixed grid of alpha values.

    Returns (and prints) a DataFrame with columns "Alpha", "F1_score",
    "RMSE", "ACC" and "HAMM", one row per alpha.
    """
    alphas = [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]

    df_MNB = pd.DataFrame({"Alpha": alphas})
    scores = MNB_model(train_data, Y_train, dev_data, Y_dev, alphas)
    # MNB_model returns its lists in the order f1, rmse, accuracy, hamming.
    for column, values in zip(("F1_score", "RMSE", "ACC", "HAMM"), scores):
        df_MNB[column] = values

    print(df_MNB)
    print("df_MNB")
    return df_MNB

### Gaussian NB (only classification)

In [None]:
def GNB_model(train_data, train_labels, dev_data, dev_labels, smoothing_list):
    """Fit one GaussianNB per value in `smoothing_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        smoothing_list: Iterable of `var_smoothing` values to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per
        smoothing value: weighted F1, RMSE of the predicted labels treated
        as numbers, accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for var_smoothing in smoothing_list:
        GNB_mod = GaussianNB(var_smoothing=var_smoothing)
        GNB_mod.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = GNB_mod.predict(dev_data)
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def GNB_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep Gaussian Naive Bayes over a fixed grid of var_smoothing values.

    Returns (and prints) a DataFrame with columns "Var Smooth",
    "F1_score", "RMSE", "ACC" and "HAMM", one row per smoothing value.
    """
    smoothing_values = [1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]

    df_GNB = pd.DataFrame({"Var Smooth": smoothing_values})
    scores = GNB_model(train_data, Y_train, dev_data, Y_dev, smoothing_values)
    # GNB_model returns its lists in the order f1, rmse, accuracy, hamming.
    for column, values in zip(("F1_score", "RMSE", "ACC", "HAMM"), scores):
        df_GNB[column] = values

    print(df_GNB)
    print("df_GNB")
    return df_GNB

### LogisticRegression
**Warning:** the choice of solver depends on the penalty chosen. Supported penalties by solver:  
- ‘newton-cg’ - [‘l2’, ‘none’]  
- ‘lbfgs’ - [‘l2’, ‘none’]  
- ‘liblinear’ - [‘l1’, ‘l2’]  
- ‘sag’ - [‘l2’, ‘none’]  
- ‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, ‘none’]

**max_iter was increased to 200, so it would converge**
- max_iter int, default=100
- Maximum number of iterations taken for the solvers to converge.

In [None]:
def LogR_model(train_data, train_labels, dev_data, dev_labels, penalty, solver, c_list):
    """Fit one LogisticRegression per C in `c_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        penalty: Regularisation penalty passed to LogisticRegression.
        solver: Solver name passed to LogisticRegression.
        c_list: Iterable of inverse-regularisation strengths `C` to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per C:
        weighted F1, RMSE of the predicted labels treated as numbers,
        accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for c in c_list:
        # `multi_class` is left at its default ("auto" behaviour); passing it
        # explicitly is deprecated in recent scikit-learn releases.
        # max_iter is raised to 200 (from the default 100) so the solvers converge.
        logR_mod = LogisticRegression(C=c, solver=solver, penalty=penalty, max_iter=200)
        logR_mod.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = logR_mod.predict(dev_data)
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def LogR_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep L2-penalised logistic regression over C = 1..15 for four solvers.

    Returns (and prints) a DataFrame with a "C" column plus
    ``<solver>_f1/_rmse/_acc/_hamm`` columns for each solver.
    """
    c_values = list(range(1, 16))
    metric_suffixes = ("_f1", "_rmse", "_acc", "_hamm")

    df_logR = pd.DataFrame({"C": c_values})
    for solver in ("liblinear", "newton-cg", "sag", "lbfgs"):
        scores = LogR_model(train_data, Y_train, dev_data, Y_dev, "l2", solver, c_values)
        # LogR_model returns its lists in the same order as metric_suffixes.
        for suffix, values in zip(metric_suffixes, scores):
            df_logR[solver + suffix] = values

    print(df_logR)
    print("df_LogR")
    return df_logR

### Decision Tree (Classifier)

In [None]:
def DT_model(train_data, train_labels, dev_data, dev_labels, criterion, max_depth_list):
    """Fit one DecisionTreeClassifier per depth in `max_depth_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        criterion: Split criterion passed to DecisionTreeClassifier.
        max_depth_list: Iterable of maximum tree depths to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per depth:
        weighted F1, RMSE of the predicted labels treated as numbers,
        accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for max_depth in max_depth_list:
        dt_model = DecisionTreeClassifier(criterion=criterion, min_samples_split=10, max_depth=max_depth)
        dt_model.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = dt_model.predict(dev_data)
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def DT_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep decision trees over max_depth = 1..12 for the entropy and gini criteria.

    Returns (and prints) a DataFrame with a "max_depth" column plus
    ``<criterion>_f1/_rmse/_acc/_hamm`` columns for each criterion.
    """
    depths = list(range(1, 13))
    metric_suffixes = ("_f1", "_rmse", "_acc", "_hamm")

    df_DT = pd.DataFrame({"max_depth": depths})
    for criterion in ("entropy", "gini"):
        scores = DT_model(train_data, Y_train, dev_data, Y_dev, criterion, depths)
        # DT_model returns its lists in the same order as metric_suffixes.
        for suffix, values in zip(metric_suffixes, scores):
            df_DT[criterion + suffix] = values

    print(df_DT)
    print("df_DT")
    return df_DT

### Random Forest (Classifier)

In [None]:
def RF_model(train_data, train_labels, dev_data, dev_labels, criterion, n_estimators_list):
    """Fit one RandomForestClassifier per size in `n_estimators_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        criterion: Split criterion passed to RandomForestClassifier.
        n_estimators_list: Iterable of forest sizes to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per forest
        size: weighted F1, RMSE of the predicted labels treated as numbers,
        accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for n_estimators in n_estimators_list:
        # Named `forest` so the local no longer shadows this function's own name.
        forest = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, min_samples_split=10)
        forest.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = forest.predict(dev_data)
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def RF_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep random forests over 5..30 trees (step 5) for the entropy and gini criteria.

    Returns (and prints) a DataFrame with an "n_estimators" column plus
    ``<criterion>_f1/_rmse/_acc/_hamm`` columns for each criterion.
    """
    forest_sizes = [5, 10, 15, 20, 25, 30]
    metric_suffixes = ("_f1", "_rmse", "_acc", "_hamm")

    df_RF = pd.DataFrame({"n_estimators": forest_sizes})
    for criterion in ("entropy", "gini"):
        scores = RF_model(train_data, Y_train, dev_data, Y_dev, criterion, forest_sizes)
        # RF_model returns its lists in the same order as metric_suffixes.
        for suffix, values in zip(metric_suffixes, scores):
            df_RF[criterion + suffix] = values

    print(df_RF)
    print("df_RF")
    return df_RF

### AdaBoost (Classifier)

In [None]:
def AdaB_model(train_data, train_labels, dev_data, dev_labels, algorithm, n_estimators_list):
    """Fit one AdaBoostClassifier per size in `n_estimators_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        algorithm: Boosting algorithm name passed to AdaBoostClassifier.
        n_estimators_list: Iterable of ensemble sizes to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per
        ensemble size: weighted F1, RMSE of the predicted labels treated as
        numbers, accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for n_estimators in n_estimators_list:
        # Named `booster` so the local no longer shadows this function's own name.
        booster = AdaBoostClassifier(n_estimators=n_estimators, algorithm=algorithm, learning_rate=1.2)
        booster.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = booster.predict(dev_data)
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def AdaB_models(train_data, Y_train, dev_data, Y_dev):
    """Sweep AdaBoost over 5..30 estimators (step 5) for the SAMME and SAMME.R algorithms.

    Returns (and prints) a DataFrame with an "n_estimators" column plus
    ``<algorithm>_f1/_rmse/_acc/_hamm`` columns for each algorithm.
    """
    ensemble_sizes = [5, 10, 15, 20, 25, 30]
    metric_suffixes = ("_f1", "_rmse", "_acc", "_hamm")

    df_AdaB = pd.DataFrame({"n_estimators": ensemble_sizes})
    # NOTE(review): newer scikit-learn releases appear to deprecate "SAMME.R";
    # confirm against the installed version before re-running.
    for algorithm in ("SAMME", "SAMME.R"):
        scores = AdaB_model(train_data, Y_train, dev_data, Y_dev, algorithm, ensemble_sizes)
        # AdaB_model returns its lists in the same order as metric_suffixes.
        for suffix, values in zip(metric_suffixes, scores):
            df_AdaB[algorithm + suffix] = values

    print(df_AdaB)
    print("df_AdaB")
    return df_AdaB

### SVM  

In [None]:
def SVM_model(train_data, train_labels, dev_data, dev_labels, kernel, c_list):
    """Fit one SVM per C in `c_list` and score it on the dev set.

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        kernel: "LinearSVC" selects svm.LinearSVC (max_iter=10000); "poly"
            selects svm.SVC with degree=2, gamma=1; "rbf" selects svm.SVC
            with gamma=0.7; any other value is passed to svm.SVC as the
            kernel name with default settings.
        c_list: Iterable of regularisation values `C` to try.

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per C:
        weighted F1, RMSE of the predicted labels treated as numbers,
        accuracy and Hamming loss.
    """
    f1_score, rmse, acc, hamm = [], [], [], []
    for c in c_list:
        if kernel == "LinearSVC":
            svm_model = svm.LinearSVC(C=c, max_iter=10000)
        elif kernel == "poly":
            svm_model = svm.SVC(kernel=kernel, C=c, degree=2, gamma=1)
        elif kernel == "rbf":
            svm_model = svm.SVC(kernel=kernel, C=c, gamma=0.7)
        else:
            svm_model = svm.SVC(kernel=kernel, C=c)

        svm_model.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = svm_model.predict(dev_data)
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))
    return f1_score, rmse, acc, hamm

def SVM_models(train_data, Y_train, dev_data, Y_dev=None):
    """Sweep SVMs over a fixed grid of C values for four kernel settings.

    Args:
        train_data, Y_train: Training features and labels.
        dev_data: Development features used for scoring.
        Y_dev: Development labels.  Optional only for backward
            compatibility: the original signature had no such parameter and
            the body read the notebook-level `Y_dev`.  Three-argument calls
            still fall back to that global; new callers should pass the
            labels explicitly.

    Returns:
        DataFrame (also printed) with a "C" column plus
        ``<kernel>_f1/_rmse/_acc/_hamm`` columns for "linear", "rbf",
        "poly" and "LinearSVC".

    Raises:
        NameError: If `Y_dev` is omitted and no notebook-level `Y_dev` exists.
    """
    if Y_dev is None:
        try:
            Y_dev = globals()["Y_dev"]
        except KeyError:
            raise NameError("name 'Y_dev' is not defined") from None

    df_SVM = pd.DataFrame()
    kernel_list = ["linear", "rbf", "poly", "LinearSVC"]
    c_list = [0.5, 1, 1.5, 2, 2.5, 3, 4, 5, 10, 20]
    df_SVM["C"] = c_list

    for kernel in kernel_list:
        (df_SVM[kernel + "_f1"], df_SVM[kernel + "_rmse"],
         df_SVM[kernel + "_acc"], df_SVM[kernel + "_hamm"]) = SVM_model(
            train_data, Y_train, dev_data, Y_dev, kernel, c_list)
    print(df_SVM)
    print("df_SVM")

    return df_SVM

### Neural Network

**hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)**  
The ith element represents the number of neurons in the ith hidden layer.

**activation : {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’**  
Activation function for the hidden layer.  

- ‘identity’, no-op activation, useful to implement linear bottleneck, returns f(x) = x  
- ‘logistic’, the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).  
- ‘tanh’, the hyperbolic tan function, returns f(x) = tanh(x).   
- ‘relu’, the rectified linear unit function, returns f(x) = max(0, x)  

**solver : {‘lbfgs’, ‘sgd’, ‘adam’}, default=’adam’**  
The solver for weight optimization.  

- ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
- ‘sgd’ refers to stochastic gradient descent.
- ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba

Note: The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, ‘lbfgs’ can converge faster and perform better.

**alpha : float, default=0.0001**  
L2 penalty (regularization term) parameter.

In [None]:
def NN_model(train_data, train_labels, dev_data, dev_labels, activation, solver_list, alpha_list, layer_list, choice):
    """Fit a series of MLPClassifiers for one activation and score each on the dev set.

    Exactly one hyper-parameter is swept, selected by `choice`:

    * ``"A"``: `alpha` over `alpha_list` (layers (10, 10, 10), max_iter=1000);
    * ``"L"``: `hidden_layer_sizes` over `layer_list` (max_iter=1000);
    * anything else: `solver` over `solver_list` (layers (10, 10, 10), max_iter=8000).

    Args:
        train_data, train_labels: Training features and labels.
        dev_data, dev_labels: Development features and labels used for scoring.
        activation: Activation name passed to MLPClassifier.
        solver_list, alpha_list, layer_list: Candidate values; only the list
            selected by `choice` is used.
        choice: Which hyper-parameter to sweep (see above).

    Returns:
        Tuple ``(f1_score, rmse, acc, hamm)`` of lists, one entry per swept
        value: weighted F1, RMSE of the predicted labels treated as numbers,
        accuracy and Hamming loss.
    """
    # Build the per-run keyword arguments once so that fitting and scoring
    # live in a single loop instead of three copy-pasted branches.
    if choice == "A":
        configs = [dict(hidden_layer_sizes=(10, 10, 10), max_iter=1000, alpha=alpha)
                   for alpha in alpha_list]
    elif choice == "L":
        configs = [dict(hidden_layer_sizes=layer, max_iter=1000)
                   for layer in layer_list]
    else:
        configs = [dict(hidden_layer_sizes=(10, 10, 10), max_iter=8000, solver=solver)
                   for solver in solver_list]

    f1_score, rmse, acc, hamm = [], [], [], []
    for config in configs:
        # Named `net` so the local no longer shadows this function's own name.
        net = MLPClassifier(activation=activation, **config)
        net.fit(train_data, train_labels)
        # Predict once and reuse the result for every metric.
        dev_pred = net.predict(dev_data)
        f1_score.append(metrics.f1_score(dev_labels, dev_pred, average="weighted"))
        # sqrt(MSE) works on every scikit-learn version; `squared=False` was removed.
        rmse.append(np.sqrt(metrics.mean_squared_error(dev_labels, dev_pred)))
        acc.append(metrics.accuracy_score(dev_labels, dev_pred))
        hamm.append(metrics.hamming_loss(dev_labels, dev_pred))

    return f1_score, rmse, acc, hamm

#Note: Changing Alpha is not creating any variation in the f1_score.  Try first with L and then with S
def NN_models(train_data, Y_train, dev_data, Y_dev, choice):
    """Sweep MLP classifiers over one hyper-parameter for each activation function.

    `choice` selects the swept hyper-parameter: "A" for alpha, "L" for the
    hidden-layer layout, anything else for the solver.  Returns (and prints)
    a DataFrame with ``<activation>_f1/_rmse/_acc/_hamm`` columns followed by
    a final "Alpha", "Layers" or "Solver" column holding the swept values.
    """
    layer_options = [(10, 10, 10), (5, 5, 5), (3, 3, 3), (20, 20, 20)]
    solver_options = ["lbfgs", "sgd", "adam"]
    alpha_options = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5]
    metric_suffixes = ('_f1', '_rmse', '_acc', '_hamm')

    df_NN = pd.DataFrame()
    for activation in ("identity", "logistic", "tanh", "relu"):
        scores = NN_model(train_data, Y_train, dev_data, Y_dev, activation,
                          solver_options, alpha_options, layer_options, choice)
        # NN_model returns its lists in the same order as metric_suffixes.
        for suffix, values in zip(metric_suffixes, scores):
            df_NN[activation + suffix] = values

    # Label the rows with whichever hyper-parameter was swept.
    if choice == "A":
        swept_name, swept_values = "Alpha", alpha_options
    elif choice == "L":
        swept_name, swept_values = "Layers", layer_options
    else:
        swept_name, swept_values = "Solver", solver_options
    df_NN[swept_name] = swept_values

    print(df_NN)
    print("df_NN")
    return df_NN

In [None]:
def assign_y(scale):
    """Return the (train, dev) label arrays for the requested label scale.

    Scales 2, 3, 4, 5 and 10 select the matching bucketed labels from the
    notebook-level `train_y*` / `dev_y*` arrays.  Any other value, including
    100, selects the raw `train_labels` / `dev_labels`.
    """
    if scale == 2:
        return (train_y2, dev_y2)
    if scale == 3:
        return (train_y3, dev_y3)
    if scale == 4:
        return (train_y4, dev_y4)
    if scale == 5:
        return (train_y5, dev_y5)
    if scale == 10:
        return (train_y10, dev_y10)
    # 100 and every unrecognised scale use the unbucketed labels.
    return (train_labels, dev_labels)

In [None]:
def print_confusion_matrix(Y_dev, Prediction, title):
    """Draw the confusion matrix of `Prediction` against `Y_dev` with cell counts.

    The square figure's side equals the largest label value in `Y_dev`,
    or 6 when that value exceeds 5.
    """
    cfm = confusion_matrix(Y_dev, Prediction)

    largest_label = np.unique(Y_dev).max()
    size = 6 if largest_label > 5 else largest_label

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(cfm, cmap=plt.cm.Blues, alpha=0.3)
    # Write each count in the centre of its cell (row = y, column = x).
    for (row, col), count in np.ndenumerate(cfm):
        ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')
    ax.set_title(title)

In [None]:
# Baseline sweep: run every model family on the un-projected features, once per label scale.
# assign_y maps each scale to its (train, dev) label arrays; 100 selects the raw labels.
scale_list = [2, 5, 10, 100]
file_name = "model_summary_baseline_final.xlsx"
for scale in scale_list:
    print(scale)
    Y_train, Y_dev = assign_y(scale)
    df_knn = knn_models(train_data, Y_train, dev_data, Y_dev)
    df_NB = NB_models(train_data, Y_train, dev_data, Y_dev)
    df_MNB = MNB_models(train_data, Y_train, dev_data, Y_dev)
    df_GNB = GNB_models(train_data, Y_train, dev_data, Y_dev)
    df_logR = LogR_models(train_data, Y_train, dev_data, Y_dev)
    df_DT = DT_models(train_data, Y_train, dev_data, Y_dev)
    df_RF = RF_models(train_data, Y_train, dev_data, Y_dev)
    df_AdaB = AdaB_models(train_data, Y_train, dev_data, Y_dev)
    # NOTE: SVM_models is called without dev labels; it picks up the notebook-level Y_dev assigned above.
    df_SVM = SVM_models(train_data, Y_train, dev_data)
    # Neural nets: "L" sweeps the layer layout; "S" (any value other than "A"/"L") sweeps the solver.
    df_NN1 = NN_models(train_data, Y_train, dev_data, Y_dev, "L")
    df_NN2 = NN_models(train_data, Y_train, dev_data, Y_dev, "S")

# Excel export is disabled for the baseline run; uncomment to write one sheet per model and scale.
#     wrt_excel(file_name, "knn-"+str(scale), df_knn)
#     wrt_excel(file_name, "NB-"+str(scale), df_NB)    
#     wrt_excel(file_name, "MNB-"+str(scale), df_MNB)
#     wrt_excel(file_name, "GNB-"+str(scale), df_GNB)
#     wrt_excel(file_name, "logR-"+str(scale), df_logR)
#     wrt_excel(file_name, "DT-"+str(scale), df_DT)
#     wrt_excel(file_name, "RF-"+str(scale), df_RF)
#     wrt_excel(file_name, "AdaB-"+str(scale), df_AdaB)
#     wrt_excel(file_name, "SVM-"+str(scale), df_SVM)
#     wrt_excel(file_name, "NN-"+str(scale)+"L", df_NN1)
#     wrt_excel(file_name, "NN-"+str(scale)+"S", df_NN2)

## PCA 7 components > IN ADDITION TO ABOVE

In [None]:
components_pca = 7
random_state = 0

# Fit the projection on the training split only.
pca = PCA(n_components=components_pca, random_state=random_state).fit(train_data)

# Projected copies of the splits; train_data and dev_data themselves are left untouched.
train_data1, dev_data1 = pca.transform(train_data), pca.transform(dev_data)


In [None]:
# Same sweep as the baseline, on the 7-component PCA projection, exported to Excel.
scale_list = [2, 3, 4, 5, 10, 100]
file_name = "model_summary_w_pca_7_final.xlsx"
for scale in scale_list:
    print(scale)
    Y_train, Y_dev = assign_y(scale)

    # The three Naive Bayes sweeps (NB, MNB, GNB) are not run on the PCA features.
    df_knn = knn_models(train_data1, Y_train, dev_data1, Y_dev)
    df_logR = LogR_models(train_data1, Y_train, dev_data1, Y_dev)
    df_DT = DT_models(train_data1, Y_train, dev_data1, Y_dev)
    df_RF = RF_models(train_data1, Y_train, dev_data1, Y_dev)
    df_AdaB = AdaB_models(train_data1, Y_train, dev_data1, Y_dev)
    df_SVM = SVM_models(train_data1, Y_train, dev_data1)
    df_NN1 = NN_models(train_data1, Y_train, dev_data1, Y_dev, "L")
    df_NN2 = NN_models(train_data1, Y_train, dev_data1, Y_dev, "S")

    # One sheet per model family and scale, written only after every sweep has finished.
    for sheet_name, results in (
        ("knn-" + str(scale), df_knn),
        ("logR-" + str(scale), df_logR),
        ("DT-" + str(scale), df_DT),
        ("RF-" + str(scale), df_RF),
        ("AdaB-" + str(scale), df_AdaB),
        ("SVM-" + str(scale), df_SVM),
        ("NN-" + str(scale) + "L", df_NN1),
        ("NN-" + str(scale) + "S", df_NN2),
    ):
        wrt_excel(file_name, sheet_name, results)

## PCA 8 components 

In [None]:
components_pca = 8
random_state = 0

# Fit the projection on the training split only.
pca = PCA(n_components=components_pca, random_state=random_state).fit(train_data)

# Projected copies of the splits; train_data and dev_data themselves are left untouched.
train_data2, dev_data2 = pca.transform(train_data), pca.transform(dev_data)

In [None]:
# Same sweep as the baseline, on the 8-component PCA projection, exported to Excel.
scale_list = [2, 3, 4, 5, 10, 100]
file_name = "model_summary_w_pca_8_final.xlsx"
for scale in scale_list:
    print(scale)
    Y_train, Y_dev = assign_y(scale)

    # The three Naive Bayes sweeps (NB, MNB, GNB) are not run on the PCA features.
    df_knn = knn_models(train_data2, Y_train, dev_data2, Y_dev)
    df_logR = LogR_models(train_data2, Y_train, dev_data2, Y_dev)
    df_DT = DT_models(train_data2, Y_train, dev_data2, Y_dev)
    df_RF = RF_models(train_data2, Y_train, dev_data2, Y_dev)
    df_AdaB = AdaB_models(train_data2, Y_train, dev_data2, Y_dev)
    df_SVM = SVM_models(train_data2, Y_train, dev_data2)
    df_NN1 = NN_models(train_data2, Y_train, dev_data2, Y_dev, "L")
    df_NN2 = NN_models(train_data2, Y_train, dev_data2, Y_dev, "S")

    # One sheet per model family and scale, written only after every sweep has finished.
    for sheet_name, results in (
        ("knn-" + str(scale), df_knn),
        ("logR-" + str(scale), df_logR),
        ("DT-" + str(scale), df_DT),
        ("RF-" + str(scale), df_RF),
        ("AdaB-" + str(scale), df_AdaB),
        ("SVM-" + str(scale), df_SVM),
        ("NN-" + str(scale) + "L", df_NN1),
        ("NN-" + str(scale) + "S", df_NN2),
    ):
        wrt_excel(file_name, sheet_name, results)