### Logistic Regression and MLP Classifier Training Script
#### Author: Austin Fernandez
#### Date Modified: April 25, 2020

In [1]:
# dependencies
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
# from sklearn.naive_bayes import ComplementNB, MultinomialNB
import numpy as np
from scipy.stats import uniform, norm
from sklearn.metrics import plot_confusion_matrix
from imblearn.over_sampling import SMOTE, RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read every data row of the feature CSV into `data` (a list of lists of
# strings), skipping the header row.
data = []
# The with-block closes the file even if parsing fails; newline="" is the
# mode the csv module asks for so quoted fields with embedded newlines are
# parsed correctly.
with open("HighAndLowFeatures(edited).csv", "r", newline="") as f:
    d_reader = csv.reader(f, delimiter=",", quotechar="\"")
    next(d_reader, None)  # discard the header row (no-op on an empty file)
    data.extend(d_reader)
print(np.shape(data))

(267, 343)


In [3]:
# Build the feature matrix and the label vector from the parsed CSV rows.
# Features are columns 1 .. -4 of each row (the first column and the last
# three are excluded); the label is the last column.
n_features = 339

# Parse all rows in one pass instead of np.append-ing inside the loop, which
# reallocates and copies the whole array for every row.
# The reshape keeps the original guarantees: every row must yield exactly
# n_features values, and an empty `data` still produces a (0, 339) matrix.
x_train = np.array(
    [list(map(float, line[1:-3])) for line in data],
    dtype=float,
).reshape((len(data), n_features))

# Labels are parsed as ints but stored as float64, the same dtype the
# previous append-onto-an-empty-array approach produced.
y_train = np.array([int(line[-1]) for line in data], dtype=float)

print(x_train.shape)
print(y_train.shape)

(267, 339)
(267,)


In [4]:
# Produce two class-balanced copies of the training set, both seeded with
# rand_seed: (x_smote, y_smote) via SMOTE and (x_os, y_os) via random
# oversampling. "not majority" resamples every class except the largest one.
# NOTE(review): the resampled arrays are later handed to a cross-validated
# search, so resampled points derived from one original sample can land in
# both the training and validation folds -- confirm this is intended.
rand_seed = 3454132

# SMOTE
oversampler = SMOTE(
    random_state=rand_seed,
    sampling_strategy="not majority",
)
x_smote, y_smote = oversampler.fit_resample(x_train, y_train)
print(x_smote.shape,y_smote.shape)

# Random oversampling (the `oversampler` name is rebound to the new sampler)
oversampler = RandomOverSampler(
    random_state=rand_seed,
    sampling_strategy="not majority",
)
x_os, y_os = oversampler.fit_resample(x_train, y_train)
print(x_os.shape,y_os.shape)

(480, 339) (480,)
(480, 339) (480,)


In [5]:
def train_model(x_train,y_train,hp_search):
    """
    Trains a model using the given data and a hyperparameter search object.

    Fits hp_search on the data and prints the best cross-validation score,
    the best parameter values, the number of CV splits, and the accuracy of
    hp_search's predictions on the same data it was fit on (i.e. training
    accuracy, not held-out accuracy).

    Parameters:
    x_train - input data
    y_train - target labels for data
    hp_search - model_selection object (must expose fit, predict,
                best_score_, best_params_, n_splits_ and best_estimator_)

    Returns: best estimator for the given data given the model selector
    """
    hp_search.fit(x_train,y_train)
    print("Best Score: {:.4f}".format(hp_search.best_score_))
    for k,v in hp_search.best_params_.items():
        print("{} => {}".format(k,v))
    print("Splits: {}".format(hp_search.n_splits_))
    # Evaluated on the training inputs themselves, so this is an optimistic figure.
    y_out = hp_search.predict(x_train)
    print("Accuracy: {:.4f}%".format(np.mean(y_out == y_train) * 100.0))
    return hp_search.best_estimator_

In [6]:
# LogisticRegression, RandomizedSearch
# Sets up (but does not fit) a randomized hyperparameter search over
# LogisticRegression with k_folds-fold cross-validation.
random_search_iterations = 200
k_folds = 5
seed = 48151623

# Search space: list entries are candidate values, `C` is drawn from a
# uniform distribution on [0, 4].
parameters = {
    'penalty':['l1','l2'], 
    'C': uniform(loc=0,scale=4),
    'fit_intercept' : [True,False],
    'solver' : ['liblinear', 'saga'],
    'max_iter' : [100,200,300,400,500,600,700,800,900,1000]
}
# Seed the estimator as well as the search: the liblinear and saga solvers
# shuffle the data, so without random_state the fits differ between runs
# even though the sampled hyperparameters are fixed by `seed`.
logreg = LogisticRegression(random_state=seed)
mnb = logreg  # legacy name, kept so any cell still referring to `mnb` works
random_search_logreg = RandomizedSearchCV(
    logreg,
    parameters,
    cv=k_folds,
    n_iter=random_search_iterations,
    random_state=seed,
)

In [None]:
# Fit the logistic-regression search on the randomly oversampled data, then
# plot the confusion matrix of its predictions on that same data
# (normalize='true' normalizes over the true labels).
print("Random Oversampling:")
train_model(x_os, y_os, random_search_logreg)

disp = plot_confusion_matrix(
    random_search_logreg,
    x_os,
    y_os,
    normalize='true',
    cmap=plt.cm.Blues,
    display_labels=["Calm","Cheerful","Bravery","Fearful","Sadness","Love"],
)

Random Oversampling:


In [None]:
# Fit the logistic-regression search on the SMOTE-resampled data, then plot
# the confusion matrix of its predictions on that same data
# (normalize='true' normalizes over the true labels).
# Note: this refits the same random_search_logreg object, replacing any
# earlier fit.
print("SMOTE:")
train_model(x_smote, y_smote, random_search_logreg)

disp = plot_confusion_matrix(
    random_search_logreg,
    x_smote,
    y_smote,
    normalize='true',
    cmap=plt.cm.Blues,
    display_labels=["Calm","Cheerful","Bravery","Fearful","Sadness","Love"],
)

In [None]:
# MLPClassifier, RandomizedSearch
# Runs a randomized hyperparameter search over MLPClassifier on the original
# (not oversampled) training data and reports the result.
random_search_iterations = 200
k_folds = 5
seed = 48151623

# Search space: list entries are candidate values, `momentum` is drawn from
# a uniform distribution on [0.2, 1.0].
parameters = {
    'activation':['logistic', 'tanh', 'relu'], 
    'solver' : ['lbfgs', 'sgd', 'adam'],
    'alpha': [1 / np.power(10,x) for x in range(1,10)],  # 1e-1 .. 1e-9
    'batch_size' : [min(200,int(np.power(2,i))) for i in range(4,8)],  # 16, 32, 64, 128
    'learning_rate' : ['constant', 'invscaling', 'adaptive'],
    'max_iter' : [100,200,300,400,500],
    'shuffle' : [True,False],
    'momentum' : uniform(loc=0.2,scale=0.8),
    'nesterovs_momentum' : [True,False],
    'early_stopping' : [True,False]
}
# Seed the network too: weight initialisation and batch shuffling are random,
# so seeding only the search leaves the fits non-reproducible.
mlp = MLPClassifier(random_state=seed)
random_search_mlp = RandomizedSearchCV(
    mlp,
    parameters,
    cv=k_folds,
    n_iter=random_search_iterations,
    random_state=seed,
    n_jobs=-1,
)
random_search_mlp.fit(x_train,y_train)
print("Best Score: {:.4f}".format(random_search_mlp.best_score_))
for k,v in random_search_mlp.best_params_.items():
    print("{} => {}".format(k,v))
print("Splits: {}".format(random_search_mlp.n_splits_))
# Predict with the search object fitted above. The previous `rscv` name is
# not defined anywhere in the notebook and raised NameError on a fresh kernel.
# This is accuracy on the training inputs themselves.
y_out = random_search_mlp.predict(x_train)
print("Accuracy: {:.4f}%".format(np.mean(y_out == y_train) * 100.0))

In [None]:
# Fit the MLP search on the randomly oversampled data, then plot the
# confusion matrix of its predictions on that same data
# (normalize='true' normalizes over the true labels).
# Note: this refits the same random_search_mlp object, replacing any
# earlier fit.
print("Random Oversampling:")
train_model(x_os, y_os, random_search_mlp)

disp = plot_confusion_matrix(
    random_search_mlp,
    x_os,
    y_os,
    normalize='true',
    cmap=plt.cm.Blues,
    display_labels=["Calm","Cheerful","Bravery","Fearful","Sadness","Love"],
)

In [None]:
# Fit the MLP search on the SMOTE-resampled data, then plot the confusion
# matrix of its predictions on that same data
# (normalize='true' normalizes over the true labels).
# Note: this refits the same random_search_mlp object, replacing any
# earlier fit.
print("SMOTE:")
train_model(x_smote, y_smote, random_search_mlp)

disp = plot_confusion_matrix(
    random_search_mlp,
    x_smote,
    y_smote,
    normalize='true',
    cmap=plt.cm.Blues,
    display_labels=["Calm","Cheerful","Bravery","Fearful","Sadness","Love"],
)