### First Step: Imports


In [2]:
import gzip
from collections import defaultdict
import random
import numpy
import scipy.optimize
import string
from sklearn import linear_model
from nltk.stem.porter import PorterStemmer # Stemming

# 1: Data Processing

### Read the data and Fill the dataset

In [6]:
path = "../amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz"
f = gzip.open(path , "rt", encoding="utf8")
header = f.readline()
header = header.strip().split("\t")
dataset = []
for line in f:
    fields = line.strip().split("\t")
    d = dict(zip(header, fields))
    d["star_rating"] = int(d["star_rating"])
    d["helpful_votes"] = int(d["helpful_votes"])
    d["total_votes"] = int(d["total_votes"])
    dataset.append(d)
d['verified_purchase'] = d['verified_purchase'] == 'Y' 
dataset[0]


{'marketplace': 'US',
 'customer_id': '21269168',
 'review_id': 'RSH1OZ87OYK92',
 'product_id': 'B013PURRZW',
 'product_parent': '603406193',
 'product_title': 'Madden NFL 16 - Xbox One Digital Code',
 'product_category': 'Digital_Video_Games',
 'star_rating': 2,
 'helpful_votes': 2,
 'total_votes': 3,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'A slight improvement from last year.',
 'review_body': "I keep buying madden every year hoping they get back to football. This years version is a little better than last years -- but that's not saying much.The game looks great. The only thing wrong with the animation, is the way the players are always tripping on each other.<br /><br />The gameplay is still slowed down by the bloated pre-play controls. What used to take two buttons is now a giant PITA to get done before an opponent snaps the ball or the play clock runs out.<br /><br />The turbo button is back, but the player movement is still slow and awkward. If you liked las

###  Split the data into a Training and Testing set

First shuffle the data, then split the data. Have Training be the first 80%, and testing be the remaining 20%. 

In [7]:
random.shuffle(dataset)
N = len(dataset) #splitting %80 %20
trainingSet = dataset[:N*8//10]
testSet = dataset[N*8//10:]
print(len(trainingSet), len(testSet))

116344 29087


#### Now delete the dataset
I don't want any of your answers to come from my original dataset any longer, but rather my Training Set, this will help me to not make any mistakes later on, especialy when referencing the checkpoint solutions.

In [8]:
del dataset

### Extracting Basic Statistics

Some questions 
1. How many entries are in the dataset?
2. Pick a non-trivial attribute (i.e. verified purchases in example), what percentage of your the has this atttribute?
3. Pick another different non-trivial attribute, what percentage of the data share both attributes?

# 2: Classification

Next I will use our knowledge of classification to extract features and make predictions based on them. Here I will be using a Logistic Regression Model.

###  Define the feature function

This implementation will be based on ___any two___ attributes from the dataset. I will be using these two attributes to predict a third.

In [21]:
def feature(d):
    feat = [1, d['star_rating'], (d['helpful_votes'])]
    return feat

### Fit the model

1. Create the __Feature Vector__ based on your feature function defined above. 
2. Create the __Label Vector__ based on the "verified purchase" column of your training set.
3. Define the model as a __Logistic Regression__ model.
4. Fit the model.

In [27]:
X = [feature(d) for d in trainingSet]
Xt = [feature(d) for d in testSet]
y = [d["star_rating"] for d in trainingSet]
yt = [d["star_rating"] for d in testSet]
model = linear_model.LogisticRegression()
model.fit(X,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Compute Accuracy of The Model

1. Make __Predictions__ based on the model.
2. Compute the __Accuracy__ of the model.

In [32]:
pred = model.predict(Xt)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from math import sqrt
RMSE = sqrt(mean_squared_error(y_true=yt, y_pred=pred))
print(RMSE)

0.3920603014247218


# 3: Regression

In this section I will start by working though two examples of altering features to further differentiate. Then I will work through how to evaluate a Regularaized model.

In [33]:
#CHANGE PATH
path = "../amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz"

f = gzip.open(path, 'rt', encoding="utf8")
header = f.readline()
header = header.strip().split('\t')
reg_dataset = []
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    reg_dataset.append(d)

###  Unique Words in a Sample Set

I am going to work with a new dataset here, as such I am going to take a smaller portion of the set and call it a Sample Set. This is because stemming on the normal training set will take a very long time. 

1. Count the number of unique words found within the 'review body' portion of the sample set defined below, making sure to __Ignore Punctuation and Capitalization__.
2. Count the number of unique words found within the 'review body' portion of the sample set defined below, this time with use of __Stemming,__ __Ignoring Puctuation,__ ___and___ __Capitalization__.

In [34]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)

wordCountStem = defaultdict(int)
stemmer = PorterStemmer() #use stemmer.stem(stuff)

#SampleSet and y vector
sampleSet = reg_dataset[:2*len(reg_dataset)//10]
y_reg = [d['star_rating'] for d in sampleSet]

In [42]:
for d in sampleSet:
    r = "".join([c for c in d["review_body"].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
        
for d in sampleSet:
    r = "".join([c for c in d["review_body"].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        wordCountStem[w] += 1
print (len(wordCount), len(wordCountStem))

30020 37682


### Evaluating Classifiers

1. Given the feature function and the counts vector, __Define__ your X_reg vector. (This being the X vector, simply labeled for the Regression model)
2. __Fit__ the model using a __Ridge Model__ with (alpha = 1.0, fit_intercept = True).
3. Using the model, __Make your Predictions__.
4. Find the __MSE__ between the predictions and the y_reg vector.

In [43]:
def feature_reg(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review_body'].lower() if not c in punctuation])
    for w in r.split():
        if w in wordSet:
            feat[wordId[w]] += 1
    return feat

def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

#Note: increasing the size of the dictionary may require a lot of memory
words = [x[1] for x in counts[:100]]

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [46]:
X = [feature_reg(d) for d in sampleSet]
y = [d["star_rating"] for d in sampleSet]
model = linear_model.Ridge(1.0, fit_intercept=False) #1.0 regularization strength like lambda
model.fit(X,y)
predictions = model.predict(X)
differences = [(x-y)**2 for (x,y) in zip(predictions,y)]
MSE = sum(differences)/len(differences)
print ("MSE = " + str(MSE))

MSE = 11.17543397025844


# 4: Recommendation Systems

Finally, I will use simple similarity-based recommender systems to make calculate the most similar items.

In [47]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)

### Fill the Dictionaries

1. For each entry in the training set, fill the default dictionaries. 

In [49]:
itemNames = {}
for d in trainingSet:
    user,item = d["customer_id"], d["product_id"]
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    itemNames[item] = d["product_title"]


In [50]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

def mostSimilar(n, m): #n is the entry index
    similarities = []  #m is the number of entries
    users = attribute_1[n]
    for i2 in attribute_1:
        if i2 == n: continue
        sim = Jaccard(users, attribute_1[n])
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:m]

###  Finding the top 10 and the most similar game!

In [55]:
print(mostSimilar(trainingSet[1]["product_id"]))
print ("The most similar product is :")
print (itemNames[trainingSet[1]["product_id"]])

[(0.019801980198019802, 'B009W68BL8'), (0.012345679012345678, 'B00HI4ZAWY'), (0.012345679012345678, 'B00FQMVGDM'), (0.012345679012345678, 'B00FONYKRM'), (0.012345679012345678, 'B008ELNDDS'), (0.012345679012345678, 'B004OEIQAE'), (0.012345679012345678, 'B001L5TISS'), (0.012195121951219513, 'B00PHXNVHY'), (0.012195121951219513, 'B00JUTJYFE'), (0.012195121951219513, 'B00EJ1DCDS')]
The most similar product is :
Amnesia [Download]
