# Item based collaborative filtering python

In [40]:
#importing neccesary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

# Step 1 - Data Preparation

In [41]:
my_path = 'moviedataset/ml-latest/'

In [74]:
#Reading movielens data from 
#https://grouplens.org/datasets/movielens/
# read ratings file
ratings_df = pd.read_csv(my_path + 'ratings.csv')

In [76]:
# see the cols
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [75]:
#Checking Data
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [44]:
ratings_df.shape

(22884377, 4)

In [45]:
my_df = ratings_df[:100000]
my_df.shape

(100000, 4)

In [46]:
#Importing movie title and joining with main data
movie_titles = pd.read_csv(my_path + 'movies.csv',encoding= 'unicode_escape')
movie_titles.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [47]:
my_df = pd.merge(my_df, movie_titles, on='movieId')
my_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,169,2.5,1204927694,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
1,13,169,1.0,974868393,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
2,14,169,3.0,845470321,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
3,17,169,1.0,944991371,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama
4,68,169,1.0,1011092044,Free Willy 2: The Adventure Home (1995),Adventure|Children|Drama


## userId - the ID of the user who rated the movie.
## movieId - the ID of the movie.
## rating - The rating the user gave the movie, between 1 and 5.
## timestamp - The time the movie was rated.
## title - The title of the movie.

# Step 2 - Data exploration

In [48]:
#Statitical features of the movie
my_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,99315.0,99315.0,99315.0,99315.0
mean,544.687288,11794.145527,3.463002,1129025000.0
std,306.837117,24834.011957,1.104703,175333600.0
min,1.0,1.0,0.5,827098400.0
25%,277.0,930.0,3.0,984895500.0
50%,563.0,2366.0,3.5,1120352000.0
75%,815.0,5377.0,4.0,1269800000.0
max,1052.0,148683.0,5.0,1453995000.0


In [49]:
#creating mean ratings data
ratings = pd.DataFrame(my_df.groupby('title')['rating'].mean())
ratings.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'Round Midnight (1986),3.0
'Til There Was You (1997),1.75
'Twas the Night Before Christmas (1974),2.0
"'burbs, The (1989)",3.176471
(500) Days of Summer (2009),3.571429


In [50]:
#creating number of ratings data
ratings['number_of_ratings'] = my_df.groupby('title')['rating'].count()
ratings.head()

Unnamed: 0_level_0,rating,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Round Midnight (1986),3.0,1
'Til There Was You (1997),1.75,4
'Twas the Night Before Christmas (1974),2.0,1
"'burbs, The (1989)",3.176471,17
(500) Days of Summer (2009),3.571429,35


In [51]:
#Plotting the jointplot
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.jointplot(x='rating', y='number_of_ratings', data=ratings)


<seaborn.axisgrid.JointGrid at 0x792541af0130>

In [52]:
duplicate_titles = my_df['title'].duplicated()
if duplicate_titles.any():
    print("Duplicate titles found:")
    print(my_df[duplicate_titles])

Duplicate titles found:
       userId  movieId  rating   timestamp  \
1          13      169     1.0   974868393   
2          14      169     3.0   845470321   
3          17      169     1.0   944991371   
4          68      169     1.0  1011092044   
5         178      169     2.5  1140216232   
...       ...      ...     ...         ...   
98975    1046     4471     3.0  1299213467   
99032    1013    31026     2.0  1172704711   
99080    1049     5516     4.0  1445715466   
99090    1046     6581     4.0  1299036507   
99096    1046    55274     2.5  1299209452   

                                             title                    genres  
1          Free Willy 2: The Adventure Home (1995)  Adventure|Children|Drama  
2          Free Willy 2: The Adventure Home (1995)  Adventure|Children|Drama  
3          Free Willy 2: The Adventure Home (1995)  Adventure|Children|Drama  
4          Free Willy 2: The Adventure Home (1995)  Adventure|Children|Drama  
5          Free Willy 2: The

In [53]:
print(my_df.columns)

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'], dtype='object')


# Creating User - Item interaction matrix

In [22]:
# Handle missing values (if necessary)
my_df = my_df.dropna(subset=['title'])  # Remove rows with missing titles

# Create the user-item interaction matrix
movie_matrix_UII = my_df.pivot_table(index='userId', columns='title', values='rating')

In [23]:
#Most rated movies
ratings.sort_values('number_of_ratings', ascending=False).head(10)

Unnamed: 0_level_0,rating,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.041096,365
Pulp Fiction (1994),4.181686,344
"Shawshank Redemption, The (1994)",4.459215,331
"Silence of the Lambs, The (1991)",4.184049,326
Jurassic Park (1993),3.653169,284
"Matrix, The (1999)",4.101083,277
Star Wars: Episode IV - A New Hope (1977),4.182156,269
Toy Story (1995),3.857143,266
Braveheart (1995),4.056604,265
Schindler's List (1993),4.305328,244


# Making recommendation - Example movie- FARGO

In [24]:
#Fetching ratings for Fargo
Fargo_user_rating = movie_matrix_UII['Fargo (1996)']

In [25]:
#Finding the correlation with different movies
similar_to_fargo=movie_matrix_UII.corrwith(Fargo_user_rating)

In [26]:
#Observing the result
similar_to_fargo.head()

title
'Round Midnight (1986)                          NaN
'Til There Was You (1997)                  0.507093
'Twas the Night Before Christmas (1974)         NaN
'burbs, The (1989)                         0.187663
(500) Days of Summer (2009)                0.662222
dtype: float64

# creating a threshold for minimum number of ratings

In [27]:
#creating dataframe to bring in #of ratings
corr_fargo = pd.DataFrame(similar_to_fargo, columns=['Correlation'])
corr_fargo.dropna(inplace=True)
corr_fargo.head()

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
'Til There Was You (1997),0.507093
"'burbs, The (1989)",0.187663
(500) Days of Summer (2009),0.662222
*batteries not included (1987),-0.174078
...And Justice for All (1979),1.0


In [28]:
#Bringining in ratings
corr_fargo = corr_fargo.join(ratings['number_of_ratings'])

corr_fargo.head()

Unnamed: 0_level_0,Correlation,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),0.507093,4
"'burbs, The (1989)",0.187663,17
(500) Days of Summer (2009),0.662222,35
*batteries not included (1987),-0.174078,10
...And Justice for All (1979),1.0,7


In [29]:
corr_fargo[corr_fargo['number_of_ratings'] > 30].sort_values(by='Correlation', ascending=False).head(10)

Unnamed: 0_level_0,Correlation,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Fargo (1996),1.0,210
Django Unchained (2012),0.780125,45
"Craft, The (1996)",0.718575,42
Hoop Dreams (1994),0.689473,53
Moon (2009),0.675468,31
My Left Foot (1989),0.66481,34
(500) Days of Summer (2009),0.662222,35
Rushmore (1998),0.651871,65
"African Queen, The (1951)",0.650443,40
This Is Spinal Tap (1984),0.643938,75


# Next - User based recommendation - On Demand

In [55]:
# Create user-item matrix
user_item_matrix = my_df.pivot_table(index='userId', columns='movieId', values='rating')


In [56]:
# Replace missing values with 0
user_item_matrix.fillna(0, inplace=True)


In [57]:
# Convert to sparse matrix
sparse_matrix = csr_matrix(user_item_matrix)


In [67]:
# Define a function to calculate similarity between users
def calculate_similarity(user1, user2):
    user1_array = user_item_matrix.loc[user1].values.reshape(1, -1)
    user2_array = user_item_matrix.loc[user2].values.reshape(1, -1)
    return cosine_similarity(user1_array, user2_array)[0][0]


In [68]:
# Define a function to get top-N recommendations for a user
def get_recommendations(user_id, N=10):
    # Find similar users
    similar_users = []
    for other_user in user_item_matrix.index:
        if other_user != user_id:
            similarity = calculate_similarity(user_id, other_user)
            similar_users.append((other_user, similarity))
    
    # Sort similar users by similarity
    similar_users.sort(key=lambda x: x[1], reverse=True)
    
    # Get top-N similar users
    top_similar_users = similar_users[:N]
    
    # Get movies rated by top-N similar users but not by the target user
    recommended_movies = []
    for similar_user, _ in top_similar_users:
        movies_rated_by_similar_user = user_item_matrix.columns[user_item_matrix.loc[similar_user] > 0]
        movies_not_rated_by_target_user = [movie for movie in movies_rated_by_similar_user if user_item_matrix.loc[user_id, movie] == 0]
        recommended_movies.extend(movies_not_rated_by_target_user)
    
    # Remove duplicates and sort by rating frequency
    recommended_movies = list(set(recommended_movies))
    recommended_movies.sort(key=lambda x: user_item_matrix[x].sum(), reverse=True)
    
    # Return top-N recommended movies
    return recommended_movies[:N]


In [69]:
# Test the recommendation system
user_id = 5  # replace with the desired user ID
recommended_movies = get_recommendations(user_id)
print("Recommended movies for user", user_id, ":", recommended_movies)


Recommended movies for user 5 : [318, 356, 296, 593, 260, 110, 527, 480, 1, 50]


In [73]:
# To get movie titles instead of IDs
recommended_movie_titles = my_df.loc[my_df['movieId'].isin(recommended_movies)]['title']
print("Recommended movie titles for user", user_id, ":", recommended_movie_titles)

Recommended movie titles for user 5 : 513      Forrest Gump (1994)
514      Forrest Gump (1994)
515      Forrest Gump (1994)
516      Forrest Gump (1994)
517      Forrest Gump (1994)
                ...         
35275       Toy Story (1995)
35276       Toy Story (1995)
35277       Toy Story (1995)
35278       Toy Story (1995)
35279       Toy Story (1995)
Name: title, Length: 2924, dtype: object
