#   Make sure to download the data_test_wo_label.json file from 
#      the CSE 142 website into the ./data folder in order for this 
#      code to run properly

#   Also, if you pulled this version you should have a pickled_feature_vectorizer
#      in your ./src folder

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk as nltk
import numpy as np 
import time;
import csv

In [23]:
# Load the unlabeled test reviews.
result = pd.read_json('../data/data_test_wo_label.json')

# Collect every review's text into a list called corpus. For now the features
# come only from the text (via sklearn); the 'useful'/'funny' attributes could
# be added later to see whether they improve results.
# The column is read directly instead of via iterrows(), which would build a
# Series object for every row just to pull out one field.
corpus = result['text'].tolist()
print(len(corpus)) # quick check

50000


In [24]:
# Porter-stem every token of every review. Each stemmed token is preceded by a
# single space, so a non-empty review yields a string starting with " " and an
# empty review yields "".
ps = PorterStemmer()
test_stemmed = [
    "".join(" " + ps.stem(token) for token in word_tokenize(review))
    for review in corpus
]

In [25]:
import csv

#   Cache the stemmed reviews on disk so the stemming does not have to be
#   redone: the next cell can reload them without rerunning the previous one.
#   Each CSV row has exactly one column, the stemmed text of one review.
row_list = [[stemmed_review] for stemmed_review in test_stemmed]
print(len(row_list)) # quick check, needs to be 50000
with open('../data/data_test_wo_labels_stemmed.csv', 'w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows(row_list)

50000


In [26]:
#   Reload the cached stemmed reviews so the stemming cell does not have to be
#   rerun.
#   START HERE
#   newline='' is what the csv module requires for files handed to csv.reader;
#   without it, line endings inside quoted fields can be mangled.
with open('../data/data_test_wo_labels_stemmed.csv', newline='', encoding='utf-8') as csvfile:
    # Every row holds a single column: the stemmed text of one review.
    test_stemmed = [row[0] for row in csv.reader(csvfile, delimiter=',')]

In [27]:
import pickle

# Restore the pickled vectorizer, which carries the features learned from the
# training set. The `with` block closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("pickled_feature_vectorizer", 'rb') as fileObject:
    vectorizer = pickle.load(fileObject)
# print(len(vectorizer.get_feature_names()))

# Map the stemmed test reviews onto the vectorizer's existing feature space
# (transform only - nothing is re-fitted here).
feature_vector = vectorizer.transform(test_stemmed)




In [28]:
#   Scale the test feature vectors to unit variance. with_mean=False skips
#   centering, so only a per-feature scaling is applied.
#   NOTE(review): the scaler is fitted on the test matrix itself; if the models
#   were trained on features scaled with training-set statistics, that fitted
#   scaler should presumably be reused here instead - TODO confirm.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler(with_mean=False).fit(feature_vector)
normalized_feature_vector = sc.transform(feature_vector)

In [29]:
import scipy.sparse

# Persist the normalized sparse feature matrix so later cells can load it
# from disk instead of recomputing it.
scipy.sparse.save_npz(
    file='../data/data_test_wo_labels_feature_vectors_normalized.npz',
    matrix=normalized_feature_vector,
)

In [30]:
# From here on out, the steps will be different

In [31]:
import scipy.sparse
import time
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [32]:
# One entry per model: each model's array of predictions for the test set is
# appended to this list, and the voting step reads them back by position.
labels = []

In [33]:
from collections import Counter


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling raises.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Load the cached normalized feature matrix (just in case the transform takes
# a while to recompute).
testset = scipy.sparse.load_npz('../data/data_test_wo_labels_feature_vectors_normalized.npz')

# Unpickle the learned models: LR, Perceptron, Nearest Centroid and SVM.
log_reg = _load_pickle("pickled_logistic_regression_model")
perceptron = _load_pickle("pickled_perceptron")
nc = _load_pickle("pickled_nearest_centroid")
svm = _load_pickle("pickled_svm")

# Predict a label for every test instance with each model.
predictions_LR = log_reg.predict(testset)
predictions_p = perceptron.predict(testset)
predictions_nc = nc.predict(testset)
predictions_svm = svm.predict(testset)

#   Some stats on the predictions: label distribution per model
print("LR: ", Counter(predictions_LR))
print("PR: ", Counter(predictions_p))
print("NC: ", Counter(predictions_nc))
print("SVM: ", Counter(predictions_svm))

# Keep the per-model predictions, in this fixed order, for the voting step.
labels.extend([predictions_LR, predictions_p, predictions_nc, predictions_svm])




LR:  Counter({'5': 18903, '4': 11986, '1': 7316, '3': 6365, '2': 5430})
PR:  Counter({'5': 21623, '1': 8800, '4': 8675, '3': 6348, '2': 4554})
NC:  Counter({'5': 17393, '4': 12499, '1': 8022, '3': 6572, '2': 5514})
SVM:  Counter({'5': 23783, '4': 10955, '1': 7160, '3': 5206, '2': 2896})


In [34]:
# Sanity check: show what kind of container the first model's predictions are.
print(type(labels[0]))

<class 'numpy.ndarray'>


In [35]:
# Sanity check: number of predictions made by the first model; this should
# match the number of test reviews.
print(labels[0].size)

50000


In [36]:
# voting/ensemble method
# Star ratings in ascending order; the order also decides ties (see below).
_STAR_RATINGS = ('1', '2', '3', '4', '5')


def _majority_vote(votes):
    """Return the winning star rating among *votes* as a float.

    votes: iterable with one predicted label per model; only the strings
    '1'..'5' are counted, any other value is ignored.

    Ties go to the lowest rating, because max() returns the first maximal
    element of the ascending _STAR_RATINGS tuple. If no vote is recognised
    the result is therefore 1.0.
    """
    tally = Counter(votes)
    return float(max(_STAR_RATINGS, key=lambda rating: tally[rating]))


# One ensemble prediction per test instance; every model in `labels` gets a
# vote (no hard-coded model count). These are written to a csv afterwards.
final_predictions = [
    _majority_vote(model_predictions.item(instance) for model_predictions in labels)
    for instance in range(labels[0].size)
]

print(Counter(final_predictions))
final_predictions

Counter({5.0: 19105, 4.0: 11752, 1.0: 8831, 3.0: 5939, 2.0: 4373})


[4.0,
 5.0,
 4.0,
 4.0,
 4.0,
 1.0,
 5.0,
 5.0,
 5.0,
 5.0,
 3.0,
 5.0,
 4.0,
 5.0,
 5.0,
 3.0,
 5.0,
 5.0,
 5.0,
 3.0,
 1.0,
 3.0,
 3.0,
 5.0,
 5.0,
 5.0,
 5.0,
 1.0,
 5.0,
 5.0,
 4.0,
 4.0,
 5.0,
 1.0,
 3.0,
 1.0,
 5.0,
 5.0,
 2.0,
 2.0,
 5.0,
 3.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 2.0,
 5.0,
 3.0,
 1.0,
 5.0,
 1.0,
 1.0,
 1.0,
 5.0,
 5.0,
 5.0,
 2.0,
 3.0,
 4.0,
 5.0,
 4.0,
 1.0,
 4.0,
 4.0,
 2.0,
 5.0,
 3.0,
 4.0,
 1.0,
 3.0,
 5.0,
 3.0,
 5.0,
 3.0,
 4.0,
 5.0,
 5.0,
 1.0,
 5.0,
 1.0,
 4.0,
 5.0,
 1.0,
 4.0,
 4.0,
 3.0,
 5.0,
 5.0,
 4.0,
 5.0,
 4.0,
 5.0,
 5.0,
 4.0,
 5.0,
 4.0,
 1.0,
 5.0,
 3.0,
 5.0,
 4.0,
 5.0,
 3.0,
 2.0,
 3.0,
 5.0,
 4.0,
 5.0,
 2.0,
 4.0,
 4.0,
 4.0,
 5.0,
 1.0,
 2.0,
 5.0,
 1.0,
 1.0,
 2.0,
 5.0,
 4.0,
 2.0,
 5.0,
 4.0,
 2.0,
 4.0,
 1.0,
 1.0,
 4.0,
 4.0,
 5.0,
 1.0,
 4.0,
 5.0,
 4.0,
 1.0,
 5.0,
 2.0,
 5.0,
 5.0,
 5.0,
 5.0,
 1.0,
 4.0,
 5.0,
 4.0,
 4.0,
 5.0,
 5.0,
 1.0,
 5.0,
 2.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 4.0,
 5.0,
 4.0,
 4.0,
 5.0

In [37]:
# Run this cell to write final_predictions to a csv file: a "Predictions"
# header row followed by one prediction per row.

with open('../data/predictions.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Predictions"])
    writer.writerows([prediction] for prediction in final_predictions)