In [1]:
import pandas as pd
import os
import numpy as np

from collections import defaultdict
from collections import Counter

from surprise import Dataset, evaluate
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import Reader
from surprise import accuracy,SVD

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
import warnings

warnings.filterwarnings('ignore')

### Data Prep

In [2]:
movie_df=pd.read_csv('.\ml-1m\movies.dat', sep='::', skiprows=0, encoding='latin-1',names = ["MovieID", "MovieName", "Genres"])
movie_df_genres = movie_df[['MovieID','Genres']]
movie_split = movie_df_genres['Genres'].str.split("|")
movie_split.index = movie_df.MovieID
movie_dict = movie_split.to_dict()

In [3]:
rating_df=pd.read_csv('./ml-1m/ratings.dat', sep='::', skiprows=0, encoding='latin-1',names = ["UserID", "MovieID", "Rating","Timestamp"])

In [4]:
rating_df = rating_df[["UserID", "MovieID", "Rating"]]
rating_df = rating_df[:50000]

In [5]:
### remove 20% data for userid = 1 to test the accuracy 
#actuals = rating_df[rating_df.UserID == 1]
rows = rating_df.sample(n = 500)

rating_df = rating_df[~(rating_df.MovieID.isin(rows.MovieID) & rating_df.UserID.isin(rows.UserID))]

In [6]:
#REDUCING RATINGS DATASET 
data = Dataset.load_from_df(rating_df,reader=Reader(rating_scale=(1,5)))

### Model Training and Testing

In [7]:
trainSet = data.build_full_trainset()

In [8]:
trainItems = trainSet.ur.items()

In [9]:
trainItems_dict = dict()
for k,v in trainItems:
    trainItems_dict[k]=v

In [10]:
#trainItems_dict #this contains "internal ids" as part of surprise package

In [11]:
user_profile = dict()
for k,v in trainItems_dict.items():
    k = trainSet.to_raw_uid(k) #Getting raw ids
    user_profile[k] = dict()
    temp_genre_list = []
    for i in v:
        temp_genre_list = movie_dict.get(trainSet.to_raw_iid(i[0])) #Converting to raw id for lookup
        for j in temp_genre_list:
            if j in user_profile[k].keys():
                user_profile[k][j] = (i[1]+user_profile[k][j])/2
            else:
                user_profile[k][j] = i[1]

In [15]:
options = {'name':'pearson', 'user_based': False}
knn_model = KNNBasic(sim_options=options)

In [16]:
knn_model.train(trainSet)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x242c0231c18>

In [17]:
testset = trainSet.build_anti_testset()

In [18]:
predict = knn_model.test(testset)

### Preparing Result Set

In [20]:
predicted_dict = dict()
for i in predict:
    if i[0] in predicted_dict.keys():
        predicted_dict[i[0]].update({i[1]:i[3]})
    else:
        predicted_dict[i[0]] = dict()
        predicted_dict[i[0]] = {i[1]:i[3]}

In [21]:
def get_genre_score(uid,mid):
    """
    Input: UserID uid and MovieID mid
    Output: Genre score of that users predicted movie
    Note:Passes mid to movie_dict and returns list of generes in that movie. Averages the genre values of the returned genres
    from users' genre based profile
    """
    temp_avg = 0
    for i in movie_dict.get(mid,'NA'):  #adding condition to check if movie exists in movie_dict
        if i != 'NA':
            temp_avg = (temp_avg+user_profile[uid].get(i,temp_avg))/2   
    return temp_avg

In [22]:
#test_user1 = 1
adjusted_predict = list()
for test_user1 in user_profile.keys():
    for k,v in predicted_dict[test_user1].items():
        adjusted_predict.append([test_user1,k,v,((0.8*v)+(0.2*get_genre_score(test_user1,k))),0])


In [23]:
def get_top3_recs(predict, N = 3):
    top3_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predict:
        top3_recs[uid].append((iid, est))
     
    for uid, user_ratings in top3_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top3_recs[uid] = user_ratings[:N]
     
    return top3_recs

In [24]:
movie_recommendations = get_top3_recs(predict) #Without User Profiles

In [25]:
movie_recommendations_profiled = get_top3_recs(adjusted_predict) #With User Profiles

### Validation and Results

Method 1 : using Surprise package methods

In [27]:
algo = SVD()
accuracy.rmse(adjusted_predict)

algo.fit(trainSet)
svd_predictions = algo.test(testset)

RMSE: 0.3386


In [28]:
accuracy.rmse(svd_predictions)

RMSE: 0.4996


0.49963746220394617

Method 2 : using manually created Validation dataframe

In [29]:
validation_df = pd.DataFrame(data = adjusted_predict)
validation_df.columns = ['UserID','MovieID','Original_estimation','New_estimation','Null']
validation_df = validation_df[['UserID','MovieID','New_estimation']]
validation_df.columns = ['UserID','MovieID','Rating']

In [30]:
same_records = validation_df.merge(rows, how="inner",left_on = ["MovieID","UserID"], right_on = ["MovieID","UserID"])

In [41]:
original_validation_df = pd.DataFrame(data = predict)
original_validation_df.columns = ['UserID','MovieID','Original','New_estimation','Null']
original_validation_df = original_validation_df[['UserID','MovieID','New_estimation']]
original_validation_df.columns = ['UserID','MovieID','Rating']

In [32]:
original_validation_df = original_validation_df.merge(rows, how="inner",left_on = ["MovieID","UserID"], right_on = ["MovieID","UserID"])

In [33]:
#### original ###
rms = sqrt(mean_squared_error(original_validation_df.Rating_y, original_validation_df.Rating_x))
mae = mean_absolute_error(original_validation_df.Rating_y, original_validation_df.Rating_x)
print(rms,mae)


1.157924779730421 0.92397273206342


In [34]:
## user profile ##
rms = sqrt(mean_squared_error(same_records.Rating_y, same_records.Rating_x))
mae = mean_absolute_error(same_records.Rating_y, same_records.Rating_x)
print(rms,mae)

1.1306007812831638 0.9173148622382078


Results

In [36]:
def movie_print(u,m):
    c = Counter(user_profile[u]).most_common(3)
    print("Users preferred genre are ",c)
    for i in m:
        print("Top recommended movie", movie_df[movie_df.MovieID == i[0]])

In [37]:
movie_print(4,movie_recommendations[4])

Users preferred genre are  [('Drama', 5.0), ('Western', 5.0), ('Action', 4.625)]
Top recommended movie       MovieID         MovieName  Genres
2722     2791  Airplane! (1980)  Comedy
Top recommended movie       MovieID                  MovieName         Genres
1250     1270  Back to the Future (1985)  Comedy|Sci-Fi
Top recommended movie       MovieID       MovieName         Genres
1672     1721  Titanic (1997)  Drama|Romance


In [38]:
movie_print(4,movie_recommendations_profiled[4])

Users preferred genre are  [('Drama', 5.0), ('Western', 5.0), ('Action', 4.625)]
Top recommended movie      MovieID                  MovieName                   Genres
586      590  Dances with Wolves (1990)  Adventure|Drama|Western
Top recommended movie      MovieID      MovieName                        Genres
588      592  Batman (1989)  Action|Adventure|Crime|Drama
Top recommended movie      MovieID                       MovieName                    Genres
300      303  Quick and the Dead, The (1995)  Action|Adventure|Western


In [39]:
rating_df[rating_df.UserID==3]

Unnamed: 0,UserID,MovieID,Rating
182,3,3421,4
183,3,1641,2
184,3,648,3
185,3,1394,4
186,3,3534,3
187,3,104,4
188,3,2735,4
189,3,1210,4
190,3,1431,3
191,3,3868,3
