In [29]:
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn import datasets
from sklearn.model_selection import train_test_split # cross validation 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

In [62]:
# Load the features and the labels from the .npz and .csv files
# feats  = training features (sparse matrix)
# labels = labels (vector)
feats  = scipy.sparse.load_npz('../data/corpus_feature_vectors.npz')
labels = pd.read_csv('../data/corpus_labels.csv')
print(len(labels)) # There's an oopsies here. For some reason the length
                   # Is one less than it should be
    
# I'm adding this here just so I can go ahead with testing, but this
# is something that I need to ask.
labels = labels.append(pd.DataFrame([1],columns=['1']),ignore_index=True)
print(len(labels))

334294
334295


In [63]:
# 'test_train_split' does the cross validation
test_size=0.3

feats_train, feats_test, labels_train, labels_test = train_test_split(feats,
                                                                     labels,
                                                                     test_size=test_size)

In [69]:
# Feature standardization can be handled by 'StandardScaler'
# Note: We are only going to fit the standard scalar to the 
#       training set because we won't be able to fit the data
#       to the test set when we test. 


# create the instance
sc = StandardScaler(with_mean=False)
sc.fit(feats_train)     # fit to training features

# This transforms the training and test sets based off the 
# scalers that we got from the standardization on training set. 
feats_train_std = sc.transform(feats_train)
feats_test_std = sc.transform(feats_test)

In [97]:
print ("Perceptron: Local current time :", time.asctime( time.localtime(time.time()) ))
# Training the model using perceptron
n_iter = 40 # 40 iterations of perceptron

# create the perceptron instance
perceptron = Perceptron(n_iter_no_change=n_iter)

# fit the model to the standardization data
perceptron.fit(feats_train_std, labels_train)

# make predictions
labels_pred = perceptron.predict(feats_test_std)
print ("Ending perceptron: Local current time :", time.asctime( time.localtime(time.time()) ))

print("accuracy: {0:.2f}%".format(accuracy_score(labels_test,labels_pred)*100))

Perceptron: Local current time : Wed Dec  4 03:14:16 2019
Ending perceptron: Local current time : Wed Dec  4 03:15:08 2019
accuracy: 30.50%


In [91]:
print("accuracy: {0:.2f}%".format(accuracy_score(labels_test,labels_pred)*100))

accuracy: 30.13%


In [40]:
# Since the official dataset isn't ready yet, using sklearn's provided
# dataset for testing iris, and splitting the features and the labels.
# X = training features
# y = labels
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [11]:
# 'test_train_split' does the cross validation. 
test_size=0.3    # percentage of data that will be the dev set
random_state=0   # makes results reproducable. 

# return values X_train, X_test, y_train, and y_test produced
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                   random_state=random_state)

In [14]:
# Feature standardization can be handled by 'StandardScaler'
# Note: We are only going to fit the standard scalar to the 
#       training set because we won't be able to fit the data
#       to the test set when we test. 


# create the instance
sc = StandardScaler()
sc.fit(X_train)     # fit to training features

# This transforms the training and test sets based off the 
# scalers that we got from the standardization on training set. 
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

Unique labels: [0 1 2]


In [15]:
# Preprocessing and feature extraction
# What's here is a temporary one that I grabbed from online
# to test the perceptron algorithm.

# In this example, features 2 and 3 are chosen to look at.
X_train_std = X_train_std[:, [2,3]]
X_test_std = X_test_std[:,[2,3]]

In [23]:
# Training the model using perceptron
n_iter = 40 # 40 iterations of perceptron
eta0=0.1    # learning rate of our implementation

# create the perceptron instance
perceptron = Perceptron(n_iter_no_change=n_iter, eta0=eta0, random_state=random_state)

# fit the model to the standardization data
perceptron.fit(X_train_std, y_train)

# make predictions
y_pred = perceptron.predict(X_test_std)

In [24]:
print("accuracy: {0:.2f}%".format(accuracy_score(y_test,y_pred)*100))

accuracy: 91.11%
