In [12]:
import pickle
import prepare_functions
import train_functions

In [13]:
# Restore the previously saved count vectorizer from disk.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file
# that comes from a trusted source.
with open('bow_transformer.pickle', 'rb') as vectorizer_file:
    bow_transformer = pickle.load(vectorizer_file)

In [14]:
# Read the three comma-separated splits (train / validation / test) with the
# project's loader, in that order.
train_data, validation_data, test_data = [
    prepare_functions.load_data(csv_path, separator=',')
    for csv_path in ("data/train.csv", "data/validation.csv", "data/test.csv")
]

# Sanity check: show the shape of every split, one per line.
for split_frame in (train_data, validation_data, test_data):
    print(split_frame.shape)

(3900, 2)
(836, 2)
(836, 2)


In [15]:
# Run every split through the shared preprocessing step with the loaded
# vectorizer. Each call yields a (feature matrix, labels) pair; the matrices
# are sparse according to the original author's note.
(
    (train_matrix, train_labels),
    (validation_matrix, validation_labels),
    (test_matrix, test_labels),
) = [
    prepare_functions.preprocess(split_frame, bow_transformer)
    for split_frame in (train_data, validation_data, test_data)
]

# All three matrices should share the same number of columns, since they are
# produced with the same vectorizer.
for feature_matrix in (train_matrix, validation_matrix, test_matrix):
    print(feature_matrix.shape)

(3900, 8731)
(836, 8731)
(836, 8731)


In [16]:
# importing modules for the three classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Build the three candidate classifiers. The two that accept a seed are
# pinned to random_state=1 so repeated runs give the same models.
naive_bayes = MultinomialNB()
logistic_regression = LogisticRegression(random_state=1)
random_forest = RandomForestClassifier(random_state=1)

# Train each candidate on the training split, in the same order as above.
# train_model is a project helper; presumably it fits the estimator in place
# (the same objects are used for evaluation afterwards) -- verify in train_functions.
for classifier in (naive_bayes, logistic_regression, random_forest):
    train_functions.train_model(classifier, train_matrix, train_labels)

In [18]:
# Score every trained classifier on the validation split. Reports appear in
# this order: naive Bayes, logistic regression, random forest.
for classifier in (naive_bayes, logistic_regression, random_forest):
    train_functions.evaluate_model(classifier, validation_matrix, validation_labels)

accuracy:  0.9473684210526315
precision:  1.0
recall:  0.6173913043478261
confusion matrix:
 [[721   0]
 [ 44  71]]
-----x-----x-----

accuracy:  0.965311004784689
precision:  0.9886363636363636
recall:  0.7565217391304347
confusion matrix:
 [[720   1]
 [ 28  87]]
-----x-----x-----

accuracy:  0.972488038277512
precision:  1.0
recall:  0.8
confusion matrix:
 [[721   0]
 [ 23  92]]
-----x-----x-----



In [19]:
# The random forest achieved the highest validation accuracy of the three
# classifiers, so its accuracy is the one reported on the test split.
from sklearn.metrics import accuracy_score

predictions = random_forest.predict(test_matrix)
test_accuracy = accuracy_score(test_labels, predictions)
print("accuracy: ", test_accuracy)

accuracy:  0.9677033492822966
