# Advanced Reccomendations

## Data Preprocessing

In [78]:
import pandas as pd
import numpy as np

In [79]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
movie_rating = pd.read_csv('u.data', sep='\t', names=columns)

In [80]:
movie_rating.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [81]:
movies = pd.read_csv('Movie_Id_Titles', sep=',', index_col='item_id')

In [82]:
df = movie_rating.merge(movies, on='item_id')

In [83]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [84]:
n_title = df['item_id'].nunique()

In [85]:
n_user = df['user_id'].nunique()

In [86]:
df.shape

(100003, 5)

# Model Preparation

## Train Test Split

In [87]:
from sklearn.model_selection import train_test_split

In [88]:
train_data, test_data = train_test_split(df, test_size=.25)

In [89]:
train_data.shape, test_data.shape

((75002, 5), (25001, 5))

In [90]:
train_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
25665,5,443,4,875720744,"Birds, The (1963)"
61713,642,120,3,886206256,Striptease (1996)
53007,287,168,5,875335190,Monty Python and the Holy Grail (1974)
8870,328,432,1,885047511,Fantasia (1940)
3795,747,274,4,888733348,Sabrina (1995)


## Create a user and item matrices

In [91]:
train_data_matrix = np.zeros((n_user, n_title))
print(n_user, n_title)

944 1682


In [92]:
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

In [96]:
train_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [97]:
test_data.shape

(25001, 5)

In [101]:
test_data_matrix = np.zeros((n_user, n_title))

for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [104]:
from sklearn.metrics import pairwise_distances

In [106]:
user_similarity = pairwise_distances(train_data_matrix, metric = 'cosine')

In [112]:
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

## Predictions

In [115]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [120]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [123]:
user_prediction.shape

(944, 1682)

In [118]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [119]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.124226159127901
Item-based CF RMSE: 3.4503345491737814


In [136]:
test_data_matrix.nonzero()

(array([  0,   0,   0, ..., 942, 942, 943]),
 array([   0,    1,    2, ..., 1066, 1329,  171]))