In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.sparse import csr_matrix, save_npz
from sklearn.neighbors import NearestNeighbors
import joblib
from scipy.sparse import load_npz

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the movie table and the ratings table from the mounted Drive folder.
# NOTE(review): these are absolute Colab/Drive paths — the notebook will not
# run in another environment without editing these two lines.
movie_dataset = pd.read_csv("/content/drive/MyDrive/4cinephile/dataset/movie.csv")
rating_dataset = pd.read_csv("/content/drive/MyDrive/4cinephile/dataset/rating.csv")

In [None]:
# Preview the first rows of the movie table.
movie_dataset.head()

In [None]:
# Preview the first rows of the ratings table.
rating_dataset.head()

Exploratory Data Analysis

In [None]:
# Headline sizes of the two tables: row counts plus the number of distinct
# users and movies that appear in the ratings.
number_movies = len(movie_dataset)
number_ratings = len(rating_dataset)
number_of_unique_users = rating_dataset['userId'].nunique()
number_of_unique_movies = rating_dataset['movieId'].nunique()

dataset_summary = [
    ("Number of rows in movies dataset = {}", number_movies),
    ("Number of rows in ratings dataset = {}", number_ratings),
    ("Number of unique users in ratings dataset = {}", number_of_unique_users),
    ("Number of unique movies in ratings dataset = {}", number_of_unique_movies),
]
print("\n".join(label.format(total) for label, total in dataset_summary))

In [None]:
# Bar chart of how many times each distinct rating value was given.
sns.countplot(x='rating',data=rating_dataset)
plt.title("Movie ratings")
plt.show()

In [None]:
# Mean rating given by each user, indexed by userId.
mean_ratings_per_user = rating_dataset.groupby('userId')['rating'].mean()
print(mean_ratings_per_user)

# NOTE(review): this averages the per-user means, so every user counts equally
# regardless of how many ratings they gave. It is not the mean over all rating
# rows, despite the printed label — confirm which one is intended.
total_mean_ratings = mean_ratings_per_user.mean()
print("Overall mean rating : ",total_mean_ratings)

In [None]:
# Join every rating row to its movie's metadata on movieId. pd.merge defaults
# to an inner join, so only movieIds present in both tables are kept.
movie_ratings = pd.merge(movie_dataset,rating_dataset,on='movieId')
print(movie_ratings.head())

In [None]:
# Plain mean rating per movie, indexed by movieId.
mean_ratings = movie_ratings.groupby('movieId')[['rating']].mean()
print("The least rated movies are : ")
# idxmin yields a single movieId (the first one with the lowest mean), so only
# one movie is shown even though the label is plural.
lowest_rated = mean_ratings['rating'].idxmin()
# Prints every rating row belonging to that movie, not a one-line summary.
print(movie_ratings[movie_ratings['movieId']==lowest_rated])

print("The highly rated movies are : ")
# Same approach for the movie with the highest mean rating.
highest_rated = mean_ratings['rating'].idxmax()
print(movie_ratings[movie_ratings['movieId']==highest_rated])



Looking closely, Sonic Outlaws has the highest rating, but the issue is that this movie has only 3 ratings.

Consider one case: a movie is rated only once, but that rating is 5.
Another case: a movie is rated 1000 times and every rating is 4. That means it is more popular, yet it is not recommended because its rating is lower.
Since it is actually viewed far more, it is the one that should be recommended.

Thus there is an issue: ratings need to be considered with respect to the number of ratings a movie has received.

To solve this issue, we use
### Bayesian Average

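For the ratings of a single movie:

$$\text{Bayesian Average} = \frac{C \cdot m + \sum \text{ratings}}{C + n}$$

where $n$ is the number of ratings the movie has received, $C$ is the mean number of ratings per movie, and $m$ is the mean of the per-movie mean ratings.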


In [None]:
# Number of ratings ('count') and plain mean rating ('mean') for each movie.
movie_stats = movie_ratings.groupby('movieId')['rating'].agg(['count','mean'])
print(movie_stats.head())

In [None]:
# Prior used by bayesian_avg below:
#   C - mean number of ratings per movie (how strongly the prior is weighted)
#   m - mean of the per-movie mean ratings (the prior rating itself)
C = movie_stats['count'].mean()
m = movie_stats['mean'].mean()

def bayesian_avg(ratings):
    """Return the Bayesian-average rating for one movie's ratings.

    The movie's ratings are blended with ``C`` pseudo-ratings of value ``m``
    (both read from module level), so movies with few ratings are pulled
    towards ``m``. The result is rounded to three decimal places.

    ratings: object exposing ``.sum()`` and ``.count()`` (used here on the
        per-movie rating groups of a pandas groupby).
    """
    prior_weighted_total = C * m + ratings.sum()
    effective_count = C + ratings.count()
    return round(prior_weighted_total / effective_count, 3)

In [None]:
# Bayesian-average rating per movie, computed from the raw ratings table.
bayesian_rating_dataset = rating_dataset.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_rating_dataset.columns = ['movieId','bayesian_avg']
# NOTE(review): movie_stats is overwritten with the merged frame, so this cell
# is not idempotent. Re-run the cell that builds movie_stats before re-running
# this one, otherwise a second bayesian_avg column gets merged in.
movie_stats = pd.merge(movie_stats,bayesian_rating_dataset,on='movieId')
print(movie_stats.head())

In [None]:
# Attach movie titles. No `on=` is passed, so pandas joins on whatever columns
# the two frames have in common — presumably just movieId; verify.
movie_stats = movie_stats.merge(movie_dataset[['movieId', 'title']])
# Five best movies by Bayesian average, then the five worst.
print(movie_stats.sort_values('bayesian_avg', ascending=False).head())
print("--------------------------")
print(movie_stats.sort_values('bayesian_avg', ascending=True).head())

Split the genres into a list. If we look carefully at the dataset, the genres of each movie are stored as a single `|`-separated string.

In [None]:
# Turn the '|'-separated genre string of every movie into a list of genres.
# The isinstance guard keeps this cell idempotent: on a re-run the values are
# already lists (which have no .split) and are passed through untouched
# instead of raising AttributeError.
movie_dataset['genres'] = movie_dataset['genres'].apply(
    lambda x: x.split('|') if isinstance(x, str) else x
)
movie_dataset.head()

In [None]:
from collections import Counter

# Count how often each genre occurs across all movies. This iterates the
# elements of every 'genres' value, so it expects that column to already hold
# lists; on raw strings it would count single characters instead.
genre_frequency = Counter(g for genres in movie_dataset['genres'] for g in genres)
print(genre_frequency.most_common(5))

In [None]:
# Reshape the Counter into a two-column frame: a one-row frame (one column per
# genre) is transposed so genres become the index, then the index is turned
# back into a regular column.
df_genre = pd.DataFrame([genre_frequency]).T.reset_index()
df_genre.columns = ['genre', 'count']

# Bars ordered from the most to the least frequent genre; x labels are rotated
# so the genre names do not overlap.
sns.barplot(x='genre', y='count', data=df_genre.sort_values(by='count', ascending=False))
plt.xticks(rotation=90)
plt.show()

Building utility matrix

---



---

USERS - ROWS <br>
MOVIES - COLUMNS

In [None]:
def create_utility_matrix(df):
  """Build the sparse user x movie rating matrix and its id/index lookups.

  Args:
    df: DataFrame with 'userId', 'movieId' and 'rating' columns, one row per
      rating.

  Returns:
    A 5-tuple ``(mat, user_index, index_user, movie_index, index_movie)``:
      mat          csr_matrix of shape (m, n), m distinct users (rows) by
                   n distinct movies (columns), holding the ratings.
      user_index   userId  -> row position in ``mat``.
      index_user   row position -> userId.
      movie_index  movieId -> column position in ``mat``.
      index_movie  column position -> movieId.
    Positions follow the sorted order of the distinct ids.
  """
  # One np.unique call per id column yields both the sorted distinct ids and,
  # via return_inverse, each row's position among them. This avoids repeated
  # sorts of the column and a Python-level dict lookup for every rating row.
  user_ids, user_rows = np.unique(df["userId"].to_numpy(), return_inverse=True)
  movie_ids, movie_cols = np.unique(df["movieId"].to_numpy(), return_inverse=True)

  # m x n matrix: m users and n movies
  m = len(user_ids)
  n = len(movie_ids)

  user_index = dict(zip(user_ids, range(m)))
  index_user = dict(zip(range(m), user_ids))
  movie_index = dict(zip(movie_ids, range(n)))
  index_movie = dict(zip(range(n), movie_ids))

  mat = csr_matrix((df["rating"].to_numpy(), (user_rows, movie_cols)), shape=(m, n))
  return mat,user_index,index_user,movie_index,index_movie

# Build the utility matrix and the four id/index lookups from all ratings,
# then show its (users, movies) shape.
mat,user_index,index_user,movie_index,index_movie = create_utility_matrix(rating_dataset)
mat.shape

In [None]:
# Show the sparse matrix summary (its repr), not its contents.
mat

In [None]:
totalCells = mat.shape[0]*mat.shape[1]
# Number of stored entries, i.e. user/movie pairs that have a rating.
rated = mat.nnz
# NOTE(review): rated/totalCells is the fraction of cells that hold a rating
# (the matrix density). Sparsity in the usual sense would be 1 minus this
# value — confirm which one the name and the printed label should mean.
sparsity = rated/totalCells
#Shows stored_cells/Total_cells
print("Sparsity is : ",sparsity)

In [None]:
# Stored entries per row of the utility matrix, i.e. ratings given per user.
n_ratings_per_user = mat.getnnz(axis=1)
len(n_ratings_per_user)

In [None]:
# Stored entries per column of the utility matrix, i.e. ratings received per movie.
n_ratings_per_movie = mat.getnnz(axis=0)
len(n_ratings_per_movie)

In [None]:
# Smoothed distributions of how many ratings each user gave (left panel) and
# how many ratings each movie received (right panel).
fig, (ax_user, ax_movie) = plt.subplots(1, 2, figsize=(16, 4))

# fill=True is the current spelling of the filled-curve option; seaborn has
# deprecated the older shade= keyword in its favour.
sns.kdeplot(n_ratings_per_user, fill=True, ax=ax_user)
ax_user.set_xlim(0)  # counts cannot be negative, so clip the KDE tail at 0
ax_user.set_title("Number of Ratings Per User", fontsize=14)
ax_user.set_xlabel("number of ratings per user")
ax_user.set_ylabel("density")

sns.kdeplot(n_ratings_per_movie, fill=True, ax=ax_movie)
ax_movie.set_xlim(0)
ax_movie.set_title("Number of Ratings  Per Movie", fontsize=14)
ax_movie.set_xlabel("number of ratings per movie")
ax_movie.set_ylabel("density")

plt.show()

## **Model Export and Load**

In [None]:

def find_similar_movies(movie_id, mat, movie_index, index_movie, k=10, metric='cosine'):
    """Return the ids of the ``k`` movies most similar to ``movie_id``.

    A brute-force nearest-neighbours model is fitted on the transposed matrix
    (one row per movie) and queried with the row belonging to ``movie_id``.

    Side effect: the fitted model is written to ``./Recommendation_System.sav``
    with joblib on every call.

    Parameters
    ----------
    movie_id : id of the query movie; must be a key of ``movie_index``.
    mat : matrix with one column per movie; it is transposed before fitting.
    movie_index : mapping movieId -> column position in ``mat``.
    index_movie : mapping column position in ``mat`` -> movieId.
    k : number of similar movies to return.
    metric : distance metric handed to ``NearestNeighbors``.

    Returns
    -------
    list
        Up to ``k`` movie ids, nearest first, never containing ``movie_id``.

    Raises
    ------
    KeyError
        If ``movie_id`` is not a key of ``movie_index``.
    """
    item_matrix = mat.T  # rows are now movies

    movie_idx = movie_index[movie_id]
    movie_vec = item_matrix[movie_idx]
    if isinstance(movie_vec, np.ndarray):
        # A dense row comes back 1-D; kneighbors expects a 2-D (1, n) query.
        movie_vec = movie_vec.reshape(1, -1)

    # k + 1 neighbours are requested because the query movie is part of the
    # fitted data and is normally returned as its own nearest neighbour.
    kNN = NearestNeighbors(n_neighbors=k + 1, algorithm="brute", metric=metric)
    kNN.fit(item_matrix)
    joblib.dump(kNN, './Recommendation_System.sav')

    neighbour_idx = kNN.kneighbors(movie_vec, return_distance=False).ravel()
    # Drop the query movie by index rather than by position, then cap at k.
    # Reading only the first k results and popping one returned k - 1 ids.
    similar_ids = [index_movie[int(i)] for i in neighbour_idx if i != movie_idx]
    return similar_ids[:k]

In [None]:
# Persist everything the recommendation step needs: the movie table, the
# utility matrix and the four id/index lookups.
movie_dataset.to_parquet('./movie_database.parquet',engine='fastparquet',index=False)
save_npz('utility_matrix.npz', mat)
joblib.dump(user_index,'./user_index.pkl')
joblib.dump(index_user,'./index_user.pkl')
joblib.dump(movie_index,'./movie_index.pkl')
joblib.dump(index_movie,'./index_movie.pkl')

# Fit and persist the neighbour model here as well, so that loading
# './Recommendation_System.sav' works on a fresh Restart & Run All and does
# not depend on find_similar_movies having been called first.
# n_neighbors=11 matches find_similar_movies' default (k=10, plus the query
# movie itself) and get_recommendations' default k=11.
knn_model = NearestNeighbors(n_neighbors=11, algorithm="brute", metric="cosine")
knn_model.fit(mat.T)  # transpose: one row per movie
joblib.dump(knn_model, './Recommendation_System.sav')

In [None]:
# Reload the exported artifacts from the working directory, overwriting the
# in-memory objects of the same names.
# NOTE(review): joblib.load unpickles its input — only load files that this
# notebook (or another trusted source) produced.
movie_dataset = pd.read_parquet('./movie_database.parquet')
mat = load_npz('./utility_matrix.npz')
user_index = joblib.load('./user_index.pkl')
index_user = joblib.load('./index_user.pkl')
movie_index = joblib.load('./movie_index.pkl')
index_movie = joblib.load('./index_movie.pkl')
# './Recommendation_System.sav' must already exist on disk;
# find_similar_movies writes it as a side effect of fitting.
model = joblib.load('./Recommendation_System.sav')
# Title used for the example recommendation query.
movie_title = 'Toy Story (1995)'

In [None]:
def get_recommendations(movie_title,model, mat, k=11):
    """Return the titles of the movies most similar to ``movie_title``.

    Reads the module-level ``movie_dataset`` (title <-> movieId), and
    ``movie_index`` / ``index_movie`` (movieId <-> column position in ``mat``).

    Parameters
    ----------
    movie_title : title exactly as it appears in ``movie_dataset['title']``.
    model : fitted nearest-neighbours model exposing ``kneighbors``; expected
        to have been fitted on ``mat.T`` (one row per movie).
    mat : matrix with one column per movie.
    k : number of neighbours to query, counting the query movie itself.

    Returns
    -------
    list of str
        ``k - 1`` titles, nearest first, excluding the query movie.

    Raises
    ------
    KeyError
        If the title is not in ``movie_dataset`` or its movieId is not a key
        of ``movie_index``.
    """
    title_to_id = dict(zip(movie_dataset['title'], movie_dataset['movieId']))
    id_to_title = dict(zip(movie_dataset['movieId'], movie_dataset['title']))

    movie_id = title_to_id[movie_title]
    movie_idx = movie_index[movie_id]
    movie_vec = mat.T[movie_idx]

    # Ask for exactly k neighbours. Without n_neighbors the result length is
    # whatever the model was fitted with, and a larger k ran off its end.
    neighbour_idx = model.kneighbors(
        movie_vec, n_neighbors=k, return_distance=False
    ).ravel()

    # Exclude the query movie by index instead of assuming it comes first.
    nearest_movie_name = [
        id_to_title[index_movie[int(i)]] for i in neighbour_idx if i != movie_idx
    ]
    return nearest_movie_name[:k - 1]

In [None]:
# Example query: recommendations for the title set in movie_title, using the
# reloaded model and utility matrix.
get_recommendations(movie_title,model,mat)