In [3]:
import pandas as pd
import numpy as np
import scipy.sparse

from sklearn import datasets
from sklearn.model_selection import train_test_split # cross validation 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

In [4]:
import csv
# Load the features and the labels from the .npz and .csv files
# feats  = training features (sparse matrix)
# labels = labels (vector)

feats  = scipy.sparse.load_npz('../data/corpus_feature_vectors.npz')
labels = []
with open('../data/corpus_labels.csv', encoding='utf-8') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        labels.append(row[0])

print(feats.shape)
print(len(labels)) 

(334295, 3800)
334295


In [5]:
# 'test_train_split' does the cross validation
test_size=0.3

feats_train, feats_test, labels_train, labels_test = train_test_split(feats,
                                                                     labels,
                                                                     test_size=test_size)

In [6]:
# Feature standardization can be handled by 'StandardScaler'
# Note: We are only going to fit the standard scalar to the 
#       training set because we won't be able to fit the data
#       to the test set when we test. 


# create the instance
sc = StandardScaler(with_mean=False)
sc.fit(feats)     # fit to training features

# This transforms the training and test sets based off the 
# scalers that we got from the standardization on training set. 
feats_train_std = sc.transform(feats_train)
feats_test_std = sc.transform(feats_test)
feats_transformed  = sc.transform(feats)

In [9]:
import time
from sklearn import svm
print ("Started svm training: ", time.asctime( time.localtime(time.time()) ))

model = svm.LinearSVC(class_weight='balanced',max_iter=2000)
model.fit(feats_transformed, labels)

print ("ended svm training: ", time.asctime( time.localtime(time.time()) ))

labels_pred = model.predict(feats_test_std)
print("accuracy: {0:.2f}%".format(accuracy_score(labels_test,labels_pred)*100))

Started svm training:  Wed Dec  4 20:07:03 2019
ended svm training:  Wed Dec  4 20:29:47 2019
accuracy: 60.50%


In [10]:
import pickle
fileObject = open("pickled_svm", 'wb')
pickle.dump(model, fileObject)
fileObject.close()
