### Collaborative filtering for rating prediction

- Given a user and an item, can we predict what rating the user will give to that item?
- Simple Heuristic: 
    - The user (u)'s rating for an item i is a weighted combination of all of their previous ratings for items j
    - The weight for each rating is given by the Jaccard similarity between i and j
- In short: we look at how the user rated every other item j they have reviewed, and weight each of those ratings by how similar item j is to the query item i

In [1]:
import gzip
from collections import defaultdict
import random
import numpy
import scipy.optimize

In [2]:
# Absolute location of the gzipped TSV of reviews read by this notebook.
# NOTE: the path is machine-specific; change it before running elsewhere.
path = "/Users/cnogueira/Documents/Development/Notebooks/study/week2/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"

In [3]:
# Open the gzipped TSV in text mode; the handle stays open because the
# data rows are streamed from it in the next cell.
f = gzip.open(path, mode='rt', encoding="utf8")
# The first line holds the tab-separated column names.
header = f.readline().strip().split('\t')

In [4]:
# Parse every remaining row of the TSV into a dict keyed by the header names.
dataset = []
try:
    for line in f:
        # Remove only the line terminator. A bare strip() would also eat
        # leading/trailing tabs, so an empty first or last column would
        # shift or drop fields relative to the header.
        fields = line.rstrip('\r\n').split('\t')
        d = dict(zip(header, fields))
        # These columns arrive as text; convert them to integers once here.
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)
finally:
    # The file is fully consumed (or parsing failed): release the handle
    # instead of leaving it open for the rest of the session.
    f.close()

In [5]:
# Index the full review records twice: by the customer who wrote them
# and by the product they are about.
reviewsPerUser, reviewsPerItem = defaultdict(list), defaultdict(list)
for d in dataset:
    user = d['customer_id']
    item = d['product_id']
    reviewsPerItem[item].append(d)
    reviewsPerUser[user].append(d)

In [6]:
# Global mean star rating. Always predicting this value is the trivial
# baseline whose mean squared error the model is compared against.
ratingMean = sum(d['star_rating'] for d in dataset) / len(dataset)

In [7]:
# Display the global mean star rating.
ratingMean

4.251102772543146

In [36]:
usersPerItem = defaultdict(set)  # U_i: set of users who reviewed item i
itemsPerUser = defaultdict(set)  # I_u: set of items reviewed by user u
itemNames = {}  # product_id -> product_title (the last title seen wins)
for d in dataset:
    user = d['customer_id']
    item = d['product_id']
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)
    itemNames[item] = d['product_title']

In [37]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets.

    The similarity is the size of the intersection divided by the size
    of the union, a float in [0, 1].

    Args:
        s1: A set (anything providing ``union`` and ``intersection``).
        s2: The collection to compare ``s1`` against.

    Returns:
        The similarity as a float. Two empty sets have an empty union;
        they are treated as having no similarity (0.0) rather than
        raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both inputs are empty: there is no evidence of overlap.
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom

In [38]:
# Prediction code
def predictRating(user, item):
    """Predict the star rating ``user`` would give ``item``.

    The prediction is a weighted average of the user's ratings for their
    other reviewed items. Each rating is weighted by the Jaccard
    similarity between the set of users who reviewed ``item`` and the set
    of users who reviewed that other item.

    Args:
        user: A ``customer_id`` value.
        item: A ``product_id`` value.

    Returns:
        The similarity-weighted average rating, or the global
        ``ratingMean`` when the total similarity is zero (the user has no
        other reviews, or none of their items share reviewers with
        ``item``).
    """
    # Use .get() rather than indexing: both lookup tables are defaultdicts,
    # so indexing them with an unseen user/item would silently insert an
    # empty entry as a side effect of merely asking for a prediction.
    queryUsers = usersPerItem.get(item, set())
    ratings = []
    similarities = []
    for d in reviewsPerUser.get(user, ()):
        i2 = d['product_id']
        if i2 == item:
            # Never use the user's rating of the query item itself.
            continue
        ratings.append(d['star_rating'])
        similarities.append(Jaccard(queryUsers, usersPerItem.get(i2, set())))
    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        # Weight each rating by its item's similarity to the query item,
        # then normalise by the total weight.
        weightedRatings = [x * y for x, y in zip(ratings, similarities)]
        return sum(weightedRatings) / totalSimilarity
    # User hasn't rated any similar items (default case).
    return ratingMean

In [39]:
# Inspect one parsed review record to see the available fields.
dataset[1]

{'marketplace': 'US',
 'customer_id': '14640079',
 'review_id': 'RZSL0BALIYUNU',
 'product_id': 'B003LRN53I',
 'product_parent': '986692292',
 'product_title': 'Sennheiser HD203 Closed-Back DJ Headphones',
 'product_category': 'Musical Instruments',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Five Stars',
 'review_body': 'Nice headphones at a reasonable price.',
 'review_date': '2015-08-31'}

In [40]:
# Use the customer and product of the second review as a sample query.
u = dataset[1]['customer_id']
i = dataset[1]['product_id']

In [41]:
# Predicted rating for the sample (user, item) pair.
predictRating(u, i)

5.0

#### Accuracy

In [42]:
# mean square error
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    The two sequences are paired positionally; the result is the average
    of the squared differences of each pair.
    """
    squaredErrors = []
    for predicted, actual in zip(predictions, labels):
        squaredErrors.append((predicted - actual) ** 2)
    return sum(squaredErrors) / len(squaredErrors)

In [43]:
# Baseline predictor: the global mean rating, repeated once per review.
alwaysPredictMean = [ratingMean] * len(dataset)

In [44]:
# Collaborative-filtering prediction for every (user, item) pair in the data.
cfPredictions = [
    predictRating(review['customer_id'], review['product_id'])
    for review in dataset
]

In [45]:
# Ground truth: the star rating actually given in each review.
labels = [review['star_rating'] for review in dataset]

In [46]:
# Mean squared error of the always-predict-the-mean baseline.
MSE(alwaysPredictMean, labels)

1.4796142779564334

In [47]:
# The collaborative-filtering model does worse than the mean baseline in
# terms of mean squared error. Ideas to try next: another similarity measure
# (e.g. cosine), or a users-by-items formulation instead of items-by-users.
MSE(cfPredictions, labels)

1.6146130004291603