## Colaborative based filtering

In [8]:
import pandas as pd #data_manipulation
import numpy as np #for numerical operations

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go #these are for visulization
import matplotlib.pyplot as plt

from tqdm import tqdm #progress bars

import warnings
warnings.filterwarnings('ignore')
from surprise import Dataset, Reader #for building and evaluating recommendation models.

from surprise.prediction_algorithms.matrix_factorization import SVD

from surprise import accuracy


In [9]:
ratings = pd.read_csv("ratings_small.csv")

In [10]:
movie_md = pd.read_csv("movies_metadata.csv")

In [11]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


#### We will consider ratings for movies which have more than 55 counts.

In [12]:
# movie dataframe with votes more than 55
movie_md = movie_md[movie_md['vote_count']>55][['id','title']]

# IDs of movies with count more than 55
movie_ids = [int(x) for x in movie_md['id'].values]

# Select ratings of movies with more than 55 counts
ratings = ratings[ratings['movieId'].isin(movie_ids)]

# Reset Index
ratings.reset_index(inplace=True, drop=True)

# Print first 5 rows
ratings.head()


#movies with more than 55 votes are selected and corredsponding ratings are filtered based on these movies.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1371,2.5,1260759135
1,1,2105,4.0,1260759139
2,1,2294,2.0,1260759108
3,2,17,5.0,835355681
4,2,62,3.0,835355749


In [13]:
ratings.shape


(29965, 4)

In [1]:
# Initialize a surprise reader object
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)

# Load the data
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)

# Build trainset object(perform this only when you are using whole dataset to train)
trainset = data.build_full_trainset()


NameError: name 'Reader' is not defined

In [16]:
# Initialize model
svd = SVD()

# cross-validate
svd.fit(trainset)
#The surprise library is used to build a matrix factorization model for collaborative filtering .
#the data is loaded into a surprise datset and a trainset is constructed.

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17bccee90>

### we have fit the data successfully,now lets check some prediction 

In [17]:
svd.predict(uid=3,iid=2959,r_ui=5.0)


Prediction(uid=3, iid=2959, r_ui=5.0, est=4.241495545543834, details={'was_impossible': False})

In [18]:
svd.predict(uid=15,iid=2678,r_ui=1.0)


Prediction(uid=15, iid=2678, r_ui=1.0, est=2.7757018048396542, details={'was_impossible': False})

Prediction(uid=15, iid=2678, r_ui=1.0, est=2.7676212104701854, details={'was_impossible': False})
When are using .predict() method where we are passing 3 arguments,i.e. userID(uid), itemID(iid) and r_ui(true rating)

The output of each prediction is a tuple where est is our estimated ratings.

We can see that our model is able to perform good and is able to make good predictions. However, this model can be further improved by using hyperparameter optimization techinique.

Now that our models is ready we will be filling the user-item interaction matrix and will be making recommendations.



In [19]:
def get_recommendations(data, movie_md, user_id, top_n, algo):
    
    # creating an empty list to store the recommended product ids
    recommendations = []
    
    # creating an user item interactions matrix 
    user_movie_interactions_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    
    # extracting those product ids which the user_id has not interacted yet
    non_interacted_movies = user_movie_interactions_matrix.loc[user_id][user_movie_interactions_matrix.loc[user_id].isnull()].index.tolist()
    
    # looping through each of the product ids which user_id has not interacted yet
    for item_id in non_interacted_movies:
        
        # predicting the ratings for those non interacted product ids by this user
        est = algo.predict(user_id, item_id).est
        
        # appending the predicted ratings
        movie_name = movie_md[movie_md['id']==str(item_id)]['title'].values[0]
        recommendations.append((movie_name, est))

    # sorting the predicted ratings in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_n] # returing top n highest predicted rating products for this user


In [20]:
get_recommendations(data=ratings,movie_md=movie_md, user_id=654, top_n=10, algo=svd)


[('Dawn of the Dead', 5),
 ('Nell', 4.9634099682540604),
 ('The Sixth Sense', 4.95521615934213),
 ('The Thomas Crown Affair', 4.9446899893288965),
 ('While You Were Sleeping', 4.924757808467973),
 ('Ghost Rider', 4.890412459302687),
 ('Galaxy Quest', 4.886163957696125),
 ('Frankenstein', 4.874370086088085),
 ('Flags of Our Fathers', 4.852233741485755),
 ('Hard Target', 4.848599413852251)]

## User based

In [21]:
from surprise.prediction_algorithms.knns import KNNBasic

In [22]:
#Declaring the similarity options.
sim_options = {'name': 'cosine',
               'user_based': True}

# KNN algorithm is used to find similar items
sim_user = KNNBasic(sim_options=sim_options, verbose=False, random_state=33)

# Train the algorithm on the trainset, and predict ratings for the testset
sim_user.fit(trainset)


<surprise.prediction_algorithms.knns.KNNBasic at 0x1797c7310>

In [23]:
#predicting rating for a sample user with an interacted product.
sim_user.predict(uid=2,iid=17,r_ui=5.0)

Prediction(uid=2, iid=17, r_ui=5.0, est=4.166335018545322, details={'actual_k': 40, 'was_impossible': False})

In [24]:
#predicting rating for a sample user with an interacted product.
sim_user.predict(uid=671,iid=4011,r_ui=4.0)


Prediction(uid=671, iid=4011, r_ui=4.0, est=4.262454431125302, details={'actual_k': 40, 'was_impossible': False})

In [25]:
get_recommendations(ratings, movie_md, 671,10,sim_user)


[('The Wizard', 5),
 ('Rio Bravo', 5),
 ('The Celebration', 5),
 ('Spider-Man 3', 5),
 ('A Streetcar Named Desire', 5),
 ('Gentlemen Prefer Blondes', 5),
 ('The Evil Dead', 5),
 ('JFK', 5),
 ('Strangers on a Train', 5),
 ("Singin' in the Rain", 5)]

## Item based

In [26]:
#Declaring the similarity options.
sim_options = {'name': 'cosine',
               'item_based': False}

# KNN algorithm is used to find similar items
sim_item = KNNBasic(sim_options=sim_options, verbose=False, random_state=33)

# Train the algorithm on the trainset, and predict ratings for the testset
sim_item.fit(trainset)


<surprise.prediction_algorithms.knns.KNNBasic at 0x17bcacd10>

In [27]:
#predicting rating for a sample user with an interacted product.
sim_item.predict(uid=2,iid=17,r_ui=5.0)


Prediction(uid=2, iid=17, r_ui=5.0, est=4.166335018545322, details={'actual_k': 40, 'was_impossible': False})

In [28]:
#predicting rating for a sample user with an interacted product.
sim_item.predict(uid=671,iid=4011,r_ui=4.0)


Prediction(uid=671, iid=4011, r_ui=4.0, est=4.262454431125302, details={'actual_k': 40, 'was_impossible': False})

In [29]:
get_recommendations(ratings, movie_md, 671,10,sim_item)


[('The Wizard', 5),
 ('Rio Bravo', 5),
 ('The Celebration', 5),
 ('Spider-Man 3', 5),
 ('A Streetcar Named Desire', 5),
 ('Gentlemen Prefer Blondes', 5),
 ('The Evil Dead', 5),
 ('JFK', 5),
 ('Strangers on a Train', 5),
 ("Singin' in the Rain", 5)]