In [1]:
#  This file experiments with Sklearn's logistic regression model and tunes its hyperparameters to find the
#     best-performing model (within the time constraints), as determined by cross-validation

import scipy.sparse
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import time;
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#  Load the training instances (a sparse feature matrix) and their corresponding labels
trainset = scipy.sparse.load_npz('../data/corpus_feature_vectors.npz')

#  The csv module expects its file to be opened with newline='' so that any
#     newlines embedded inside quoted fields are parsed correctly.
with open('../data/corpus_labels.csv', encoding='utf-8', newline='') as csvfile:
    #  Only the first column of each row is used as the label
    labels = [row[0] for row in csv.reader(csvfile, delimiter=',')]

#  Fail fast if the features and labels are misaligned; every later cell
#     assumes row i of trainset belongs to labels[i].
if trainset.shape[0] != len(labels):
    raise ValueError(
        "Feature/label count mismatch: {} feature rows vs {} labels".format(
            trainset.shape[0], len(labels)))

print(trainset.shape)
print(len(labels))

Today is : Fri Dec  6 19:15:52 2019
(334295, 3800)
334295


In [2]:
#   Standardize the feature values with Sklearn's StandardScaler so that no single
#      feature is unfairly weighted because of scale differences.
#   The authors observed that model accuracy went down without this step.
#   with_mean=False turns off centering, so features are only rescaled, not shifted.
#   NOTE(review): the scaler is fit on the whole training set before the hold-out
#      split and the cross-validation in later cells, so held-out rows contribute to
#      the scaling statistics -- confirm whether that is acceptable.
sc = StandardScaler(with_mean=False).fit(trainset)
normalized_trainset = sc.transform(trainset)

In [3]:
# Model definitions only -- nothing is fit in this cell; fitting and evaluation
# happen in the cells below.

# Hyperparameters shared by every logistic regression variant below
shared_params = dict(random_state=0, solver='saga', max_iter=200, multi_class='multinomial')

# Default model without any hyperparameter tuning
#    Balanced class weights, penalty left at the LogisticRegression default
model_default = LogisticRegression(class_weight='balanced', **shared_params)

# Tuned model #1
# Differences from the default model:
#    Using elasticnet instead of l2 for regularization
#    Using l1 ratio 0.9
# (original author's note: so far the best)
model_1 = LogisticRegression(penalty='elasticnet', l1_ratio=0.9, class_weight='balanced', **shared_params)

# Tuned model #2
# Differences from the default model:
#    Non-balanced training set (class_weight is not set)
# The penalty is left at its default and no l1_ratio is passed; an earlier
# version of this comment claimed "l1 ratio 0.9", which did not match the code.
model_2 = LogisticRegression(**shared_params)

In [4]:
#   A quick sanity check: train model_1 on 80% of the training data and score it
#      on the remaining 20% held-out portion.
test_size = 0.2

#   random_state is fixed (same seed as the models) so the split -- and therefore
#      the printed accuracy -- is reproducible across kernel restarts.
feats_train, feats_test, labels_train, labels_test = train_test_split(
    normalized_trainset,
    labels,
    test_size=test_size,
    random_state=0,
)

model_1.fit(feats_train, labels_train)
#   Mean accuracy on the held-out 20%
print(model_1.score(feats_test, labels_test))


0.6420676348733902


In [4]:
#   3-fold cross validation gives a better accuracy measure for the logistic
#      regression models with different hyperparameters than a single split.
#   Timestamps are printed before and after each run because the runs are slow.
models_to_evaluate = [
    ("default", model_default),
    ("model_1", model_1),
    ("model_2", model_2),
]

#   Scores are kept per model so that one run does not overwrite another
cv_scores = {}
for model_name, model in models_to_evaluate:
    print("Started 3-fold CV for " + model_name + ": ", time.asctime())
    scores = cross_val_score(model, normalized_trainset, labels, cv=3)
    #   Previously the completion message for model_1/model_2 also said "Started"
    print("Finished 3-fold CV for " + model_name + ": ", time.asctime())
    print(scores)
    cv_scores[model_name] = scores

Started 3-fold CV for default:  Fri Dec  6 05:02:42 2019




Finished 3-fold CV for default:  Fri Dec  6 05:09:25 2019
[0.64223345 0.64022004 0.64374944]
Started 3-fold CV for model_1:  Fri Dec  6 05:09:25 2019




Started 3-fold CV for model_1:  Fri Dec  6 07:04:17 2019
[0.64229627 0.64045337 0.64391098]
Started 3-fold CV for model_2:  Fri Dec  6 07:04:17 2019




Started 3-fold CV for model_2:  Fri Dec  6 07:10:23 2019
[0.66913751 0.67060629 0.67119268]


In [7]:
#   Save the trained model_1 to a pickle file in the current working directory
#   NOTE(review): model_1 is fit in the hold-out cell on the 80% split only;
#      confirm whether it should be refit on the full training set before saving.
import pickle

#   The with-block guarantees the file is closed even if pickle.dump raises
with open("pickled_logistic_regression_model", 'wb') as fileObject:
    pickle.dump(model_1, fileObject)