In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import csv
from sklearn import datasets
from sklearn.model_selection import train_test_split # cross validation 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

In [2]:
# Load the features and the labels from the .npz and .csv files
# feats  = training features (sparse matrix)
# labels = labels (vector)

feats  = scipy.sparse.load_npz('../data/corpus_feature_vectors.npz')
labels = []
with open('../data/corpus_labels.csv', encoding='utf-8') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        labels.append(row[0])

print(feats.shape)
print(len(labels)) 

(334295, 3800)
334295


In [3]:
# TODO: reapply cross validation here once a single test works as intended.
# 'test_train_split' does the cross validation
test_size=0.3

feats_train, feats_test, labels_train, labels_test = train_test_split(feats,
                                                                     labels,
                                                                     test_size=test_size)

In [5]:
# Feature standardization can be handled by 'StandardScaler'
# Note: We are only going to fit the standard scalar to the 
#       training set because we won't be able to fit the data
#       to the test set when we test. 


# create the instance
sc = StandardScaler(with_mean=False)
sc.fit(feats)     # fit to training features

# This transforms the training and test sets based off the 
# scalers that we got from the standardization on training set. 
feats_train_std = sc.transform(feats_train)
feats_test_std = sc.transform(feats_test)
feats_transformed  = sc.transform(feats)

In [7]:
import time
from sklearn.neighbors import NearestCentroid

print ("Perceptron: Local current time :", time.asctime( time.localtime(time.time()) ))
# Training the model using perceptron
n_iter = 5 # 40 iterations of perceptron

# create the perceptron instance
perceptron = Perceptron(penalty='elasticnet', alpha=0.00008,n_iter_no_change=n_iter, class_weight='balanced')
nc = NearestCentroid(metric='euclidean')

# fit the model to the standardization data
perceptron.fit(feats_transformed, labels)
nc.fit(feats_transformed, labels)

# make predictions
labels_pred = perceptron.predict(feats_test_std)
labels_pred2 = nc.predict(feats_test_std)
print ("Ending perceptron: Local current time :", time.asctime( time.localtime(time.time()) ))

print("accuracy for percep: {0:.2f}%".format(accuracy_score(labels_test,labels_pred)*100))
print("accuracy for nc: {0:.2f}%".format(accuracy_score(labels_test,labels_pred2)*100))

Perceptron: Local current time : Wed Dec  4 14:02:55 2019
Ending perceptron: Local current time : Wed Dec  4 14:03:36 2019
accuracy for percep: 53.48%
accuracy for nc: 53.48%


In [8]:
print("accuracy for nc: {0:.2f}%".format(accuracy_score(labels_test,labels_pred2)*100))

accuracy for nc: 58.07%


In [9]:
from collections import Counter
print(Counter(labels_pred))
print(Counter(labels_pred2))

Counter({'5': 43250, '4': 17568, '1': 17349, '3': 12661, '2': 9461})
Counter({'5': 35079, '4': 25176, '1': 15854, '3': 13066, '2': 11114})


In [10]:
import pickle
fileObject = open("pickled_perceptron", 'wb')
pickle.dump(perceptron, fileObject)
fileObject.close()

fileObject = open("pickled_nearest_centroid", 'wb')
pickle.dump(nc, fileObject)
fileObject.close()