In [39]:
#movie rec using collaborative data and matrix factorization
import csv
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from scipy import optimize
from sklearn.decomposition import TruncatedSVD
from sklearn.cross_validation import train_test_split

#some code from http://alexabate.github.io/2016/11/05/movie-lens.html
#some code from https://nikhilwins.wordpress.com/2015/09/18/movie-recommendations-how-does-netflix-do-it-a-9-step-coding-intuitive-guide-into-collaborative-filtering/

# Load the long-format ratings table (one row per user/movie rating).
user_item_ratings = pd.read_csv('ratings.csv', sep=',')

# Basic dataset dimensions.
number_of_ratings = user_item_ratings.shape[0]
number_of_unique_users = user_item_ratings['userId'].unique().shape[0]
number_of_unique_movies = user_item_ratings['movieId'].unique().shape[0]
print(number_of_unique_users)
print(number_of_unique_movies)

# Movies x users matrix of ratings; (movie, user) pairs without a rating are
# filled with 0.
ratingsMatrix = user_item_ratings.pivot_table(
    values='rating', index='movieId', columns='userId', fill_value=0)

# Keep only the first 5000 movie rows for fitting.
ratingsMatrix2 = ratingsMatrix.iloc[:5000]
print(ratingsMatrix2.shape)

# 1 where a rating is present (non-zero), 0 elsewhere.
did_rate = ratingsMatrix2.ne(0) * 1

# n_movies is the row count of the 5000-row slice, not number_of_unique_movies.
n_movies = ratingsMatrix2.shape[0]
n_users = number_of_unique_users


#normalize_ratings from https://nikhilwins.wordpress.com/2015/09/18/movie-recommendations-how-does-netflix-do-it-a-9-step-coding-intuitive-guide-into-collaborative-filtering/
def normalize_ratings(ratings, did_rate):
    """Mean-center each movie's ratings using only the entries that were rated.

    Parameters
    ----------
    ratings : DataFrame or array-like, shape (num_movies, num_users)
        Rating matrix with one row per movie.
    did_rate : DataFrame or array-like, same shape as ``ratings``
        Indicator matrix; an entry equal to 1 marks a rating that exists.

    Returns
    -------
    (ratings_norm, ratings_mean) : tuple of ndarray
        ``ratings_norm`` has the shape of ``ratings``: rated entries hold
        ``rating - row mean`` and unrated entries hold 0.
        ``ratings_mean`` has shape (num_movies, 1) and holds each row's mean
        over its rated entries. A row with no rated entries gets mean 0
        (instead of NaN) so downstream arithmetic stays finite.
    """
    ratings_arr = np.asarray(ratings, dtype=float)
    rated = np.asarray(did_rate) == 1

    # Per-row sum and count over rated entries only.
    rated_totals = np.where(rated, ratings_arr, 0.0).sum(axis=1, keepdims=True)
    rated_counts = rated.sum(axis=1, keepdims=True)

    # Clamp the divisor to 1: rows without ratings have a total of 0, so their
    # mean comes out as 0 rather than dividing by zero.
    ratings_mean = rated_totals / np.maximum(rated_counts, 1)

    # Subtract the row mean where a rating exists; leave everything else at 0.
    ratings_norm = np.where(rated, ratings_arr - ratings_mean, 0.0)
    return (ratings_norm, ratings_mean)

# Mean-center each movie's observed ratings.
ratings_norm, ratings_mean = normalize_ratings(ratingsMatrix2, did_rate)

# Plain 0/1 array for the element-wise mask below. `.values` is used instead
# of DataFrame.as_matrix(), which newer pandas releases no longer provide.
did_rate = did_rate.values

# 3-component truncated SVD of the normalized matrix (one row of X_tr per movie).
svd = TruncatedSVD(n_components=3, n_iter=10, random_state=42)
X_tr = svd.fit_transform(ratings_norm)
print(X_tr.shape)

# Map the 3-component representation back to the original column space:
# these are the predicted (normalized) ratings.
X_o = svd.inverse_transform(X_tr)

# Half the sum of squared errors, with predictions masked to rated entries.
difference = X_o * did_rate - ratings_norm
cost = np.sum(difference ** 2) / 2
print(cost)

671
9066
(5000, 671)
(5000, 3)
29637.9322878


In [40]:
# Held-out rows: every movie after the first 5000.
ratings_test = ratingsMatrix.iloc[5000:, :]
print(ratings_test.shape)

# Rated-entry mask and per-movie mean-centering for the held-out rows.
did_rate_test = (ratings_test != 0) * 1
ratings_test_norm, test_mean = normalize_ratings(ratings_test, did_rate_test)
# `.values` is used instead of DataFrame.as_matrix(), which newer pandas
# releases no longer provide.
did_rate_test = did_rate_test.values

print(ratings_test_norm.shape)

# Project the held-out rows with the already-fitted `svd` (no refit here) ...
X_t_test = svd.transform(ratings_test_norm)
# ... and map back to get the predicted (normalized) ratings.
X_o_test = svd.inverse_transform(X_t_test)

# Half the sum of squared errors, with predictions masked to rated entries.
difference_test = X_o_test * did_rate_test - ratings_test_norm
cost_test = np.sum(difference_test ** 2) / 2
print(cost_test)



(4066, 671)
(4066, 671)
5611.34021819


In [51]:
# Repeated random-split validation: on every iteration the rows are split
# 50/50 into train and test, and two models (3 vs. 4 SVD components) are
# fitted and scored on the SAME split so their costs are directly comparable
# (e.g., to see whether adding features is helpful).
n_iter = 1000
cost_train = np.zeros(shape=(n_iter, 1))
cost_test = np.zeros(shape=(n_iter, 1))
cost2_train = np.zeros(shape=(n_iter, 1))
cost2_test = np.zeros(shape=(n_iter, 1))

test_cost_diffs = np.zeros(shape=(n_iter, 1))

# Rated-entry mask and mean-centered ratings for the full matrix.
did_rate = (ratingsMatrix != 0) * 1
ratings_norm, ratings_mean = normalize_ratings(ratingsMatrix, did_rate)
# `.values` is used instead of DataFrame.as_matrix(), which newer pandas
# releases no longer provide.
did_rate = did_rate.values


def _fit_and_score(n_components, traindata, testdata, did_rate_train, did_rate_test):
    """Fit a TruncatedSVD on traindata and score it on both halves of a split.

    Returns (model, train_cost, test_cost). Each cost is half the sum of
    squared differences between the reconstruction (masked to rated entries)
    and the normalized ratings.
    """
    model = TruncatedSVD(n_components=n_components, n_iter=10, random_state=42)

    # Reconstruction of the training rows = predicted (normalized) ratings.
    train_pred = model.inverse_transform(model.fit_transform(traindata))
    train_cost = np.sum((train_pred * did_rate_train - traindata) ** 2) / 2

    # Test rows are only projected with the fitted model, never refit.
    test_pred = model.inverse_transform(model.transform(testdata))
    test_cost = np.sum((test_pred * did_rate_test - testdata) ** 2) / 2
    return model, train_cost, test_cost


for i in range(0, n_iter):
    # Seeding the split with the iteration index makes the run reproducible
    # and lets other models be evaluated later on exactly the same splits.
    traindata, testdata, did_rate_train, did_rate_test = train_test_split(
        ratings_norm, did_rate, test_size=0.5, random_state=i)

    svd, cost_train[i], cost_test[i] = _fit_and_score(
        3, traindata, testdata, did_rate_train, did_rate_test)
    svd2, cost2_train[i], cost2_test[i] = _fit_and_score(
        4, traindata, testdata, did_rate_train, did_rate_test)

    # Negative values mean the 4-component model has the lower test cost.
    test_cost_diffs[i] = cost2_test[i] - cost_test[i]

In [53]:
# Histogram (10 bins) of the values in test_cost_diffs.
n, bins, patches = plt.hist(test_cost_diffs, bins=10)
plt.title('Model comparison')
plt.xlabel('Cost difference')
plt.ylabel('Frequency')
# Fixed viewing window: x from -500 to -200, y from 0 to 500.
plt.xlim(-500, -200)
plt.ylim(0, 500)
plt.grid(True)
plt.show()

[[-435.8477832 ]
 [-388.42043078]
 [-368.7507882 ]
 [-419.32171996]
 [-407.69260283]
 [-402.05867992]
 [-398.15633128]
 [-425.92122591]
 [-390.84861837]
 [-359.97395756]
 [-416.30141893]
 [-392.17500656]
 [-442.26922102]
 [-383.69323408]
 [-438.32464565]
 [-442.73210042]
 [-418.23800985]
 [-371.78929363]
 [-420.61797945]
 [-422.89823273]
 [-367.49801518]
 [-397.56726353]
 [-426.8612925 ]
 [-354.4091959 ]
 [-468.02442752]
 [-425.08863937]
 [-406.94742992]
 [-388.23638091]
 [-438.51770251]
 [-419.983247  ]
 [-397.21092158]
 [-413.92576491]
 [-447.85884275]
 [-427.39003468]
 [-419.49069455]
 [-423.1983878 ]
 [-392.79093365]
 [-405.97336599]
 [-381.40350079]
 [-394.08623951]
 [-396.29144228]
 [-435.99055715]
 [-457.17357444]
 [-443.93348443]
 [-449.6723652 ]
 [-353.76239241]
 [-400.28486027]
 [-403.81071084]
 [-401.08142958]
 [-405.14843546]]
