# HW4: implementing item-based CF with cosine
First, run recommenderDemo.ipynb and be familar with the code and data.
Second, implement item-based CF with cosine

In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy as np
import random
import pandas as pd

1. load the data, and convert integer-valued fields as we go. Note that here we use the same "Musical Instruments" dataset. Download the date from here: https://web.cs.wpi.edu/~kmlee/cs547/amazon_reviews_us_Musical_Instruments_v1_00_small.tsv.gz
The dataset contains 20K user-item reviews.

In [2]:
# From https://web.cs.wpi.edu/~kmlee/cs547/amazon_reviews_us_Musical_Instruments_v1_00_small.tsv.gz
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
path = "./amazon_reviews_us_Musical_Instruments_v1_00_small.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

In [3]:
header = f.readline()
header = header.strip().split('\t')

In [4]:
print(header)

['marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent', 'product_title', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_headline', 'review_body', 'review_date']


In [5]:
dataset = []

In [6]:
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    d['customer_id']=str(d['customer_id'])
    d['product_id']=str(d['product_id'])
    dataset.append(d)

In [7]:
len(dataset)

20000

2. now store the loaded data into a matrix -- you may use numpy array/matrix to store the untility matrix or use sparse matrix (advanced approach)

In [8]:
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
df = pd.DataFrame(dataset)# Storing in dataframe
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,45610553,RMDCHWD0Y5OZ9,B00HH62VB6,618218723,AGPtek® 10 Isolated Output 9V 12V 18V Guitar P...,Musical Instruments,3,0,1,N,N,Three Stars,"Works very good, but induces ALOT of noise.",2015-08-31
1,US,14640079,RZSL0BALIYUNU,B003LRN53I,986692292,Sennheiser HD203 Closed-Back DJ Headphones,Musical Instruments,5,0,0,N,Y,Five Stars,Nice headphones at a reasonable price.,2015-08-31
2,US,6111003,RIZR67JKUDBI0,B0006VMBHI,603261968,AudioQuest LP record clean brush,Musical Instruments,3,0,1,N,Y,Three Stars,removes dust. does not clean,2015-08-31
3,US,1546619,R27HL570VNL85F,B002B55TRG,575084461,Hohner Inc. 560BX-BF Special Twenty Harmonica,Musical Instruments,5,0,0,N,Y,I purchase these for a friend in return for pl...,I purchase these for a friend in return for pl...,2015-08-31
4,US,12222213,R34EBU9QDWJ1GD,B00N1YPXW2,165236328,Blue Yeti USB Microphone - Blackout Edition,Musical Instruments,5,0,0,N,Y,Five Stars,This is an awesome mic!,2015-08-31


In [9]:
print(np.unique(df['star_rating']))

[1 2 3 4 5]


In [10]:
matrix = df.pivot_table(index='product_id', columns='customer_id', values='star_rating').fillna(0)

In [11]:
matrix.loc['B00HH62VB6','45610553']

3.0

In [12]:
matrix.shape

(11182, 15342)

In [13]:
# meatrix = matrix.subtract(matrix.mean(axis=1), axis = 0)
# matrix.head()

In [14]:
import sys
sys.getsizeof(matrix)/1000000

1373.447394

In [15]:
# matrix=matrix.to_dict('records')

In [16]:
usersPerItem = defaultdict(set)

In [8]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
for d in dataset:
    user,item = d['customer_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

3. Implement cosine function and rating prediction function by using the cosine function. If a hasn't rated any similar items before, then return ratingMean (i.e., global rating mean). Refer to predictRating() in hw4jaccard.ipynb

In [18]:
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary

    

In [9]:
def Cosine(s1,s2):
    s1=np.array(s1)
    s2=np.array(s2)
    for i in range(len(s1)):
        if s1[i]!=0:
            s1[i]=s1[i]-np.mean(s1)
        if s2[i]!=0:
              s2[i]=s2[i]-np.mean(s2)
              
    return np.round(np.dot(s1,s2)/(np.sqrt((s1*s1).sum())*np.sqrt((s2*s2).sum())),4)


ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)

def predictRatingCosine(user,item):
    #print(i)
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(matrix.loc[i2,user])
        similarities.append(Cosine(matrix.loc[i2].values,matrix.loc[item].values))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean


    
    
    

In [23]:
labels = [d['star_rating'] for d in dataset]

4. Measure and report MSE (don't need to change the below code)

In [11]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

cfPredictions = [predictRatingCosine(d['customer_id'], d['product_id']) for d in dataset]
print(MSE(cfPredictions, labels))

NameError: name 'matrix' is not defined

In [25]:
alwaysPredictMean = [ratingMean for d in dataset]

In [26]:
MSE(alwaysPredictMean, labels)

1.4803161599999557

(optional/bonus task: you will get additional 25 points) 
download https://web.cs.wpi.edu/~kmlee/cs547/amazon_reviews_us_Musical_Instruments_v1_00_large.tsv.gz
this dataset contains over 900K user-item reviews. repeat the above process (i.e., meauring MSE with cosine). report the MSE and compare it with MSE of alwaysPredictMean. This optional task would require better data structure and implementation.

In [16]:
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
path = "./amazon_reviews_us_Musical_Instruments_v1_00_large.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

header = f.readline()
header = header.strip().split('\t')

In [17]:
dataset = []
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    d['customer_id']=str(d['customer_id'])
    d['product_id']=str(d['product_id'])
    dataset.append(d)
    
df = pd.DataFrame(dataset)# Storing in dataframe
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,45610553,RMDCHWD0Y5OZ9,B00HH62VB6,618218723,AGPtek® 10 Isolated Output 9V 12V 18V Guitar P...,Musical Instruments,3,0,1,N,N,Three Stars,"Works very good, but induces ALOT of noise.",2015-08-31
1,US,14640079,RZSL0BALIYUNU,B003LRN53I,986692292,Sennheiser HD203 Closed-Back DJ Headphones,Musical Instruments,5,0,0,N,Y,Five Stars,Nice headphones at a reasonable price.,2015-08-31
2,US,6111003,RIZR67JKUDBI0,B0006VMBHI,603261968,AudioQuest LP record clean brush,Musical Instruments,3,0,1,N,Y,Three Stars,removes dust. does not clean,2015-08-31
3,US,1546619,R27HL570VNL85F,B002B55TRG,575084461,Hohner Inc. 560BX-BF Special Twenty Harmonica,Musical Instruments,5,0,0,N,Y,I purchase these for a friend in return for pl...,I purchase these for a friend in return for pl...,2015-08-31
4,US,12222213,R34EBU9QDWJ1GD,B00N1YPXW2,165236328,Blue Yeti USB Microphone - Blackout Edition,Musical Instruments,5,0,0,N,Y,Five Stars,This is an awesome mic!,2015-08-31


In [18]:
#matrix = df.pivot_table(index='product_id', columns='customer_id', values='star_rating').fillna(0)
matrix= sparse.coo_matrix((data, (row, col)), shape=(4, 4))

NameError: name 'sparse' is not defined

In [19]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
for d in dataset:
    user,item = d['customer_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    
def Cosine(s1,s2):
    s1=np.array(s1)
    s2=np.array(s2)
    for i in range(len(s1)):
        if s1[i]!=0:
            s1[i]=s1[i]-np.mean(s1)
        if s2[i]!=0:
              s2[i]=s2[i]-np.mean(s2)
              
    return np.round(np.dot(s1,s2)/(np.sqrt((s1*s1).sum())*np.sqrt((s2*s2).sum())),4)


ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)

def predictRatingCosine(user,item):
    #print(i)
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(matrix.loc[i2,user])
        similarities.append(Cosine(matrix.loc[i2].values,matrix.loc[item].values))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean

labels = [d['star_rating'] for d in dataset]

def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

cfPredictions = [predictRatingCosine(d['customer_id'], d['product_id']) for d in dataset]
print(MSE(cfPredictions, labels))

NameError: name 'matrix' is not defined

*-----------------
# Done

All set! 

** What do you need to submit?**

* **hw4.ipynb Notebook File**: Save this Jupyter notebook with all output, and find the notebook file in your folder (for example, "filename.ipynb"). This is the file you need to submit. 

** How to submit: **
        Please submit through canvas.wpi.edu
