<a href="https://colab.research.google.com/github/grgupta13/machine-learning/blob/main/Netflix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the data files read
# later in this notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load only the first two fields of every line as Cust_Id and Rating; the
# remaining field (time, per the original note) is not read.
df = pd.read_csv(
    "/content/drive/MyDrive/netflix/combined_data_1.txt.zip",
    header=None,
    names=['Cust_Id', 'Rating'],
    usecols=[0, 1],
)
df.head()

In [None]:
# Show the dtype pandas inferred for each loaded column.
df.dtypes

In [None]:
# Demonstration of how to change a column's dtype. Per the original note the
# column is already float, so this cast is not actually required.
df['Rating'] = df['Rating'].astype(float)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of rows per star value (rows with a NaN rating are not part of any group).
rating_groups = df.groupby('Rating')['Rating']
stars = rating_groups.agg(['count'])
stars.head()
# Original observation: there are 4499 movies, far fewer than the number of
# ratings, so each movie is rated by more than one customer.

In [None]:
# Number of movies in the file.
# Each movie is introduced by a header row whose Cust_Id is "<movie_id>:" and
# whose Rating is NaN, so counting the missing ratings counts the movies.
# Only the Rating column is inspected: counting NaNs over the whole frame would
# also count any missing Cust_Id value and inflate the result.
movie_count = df['Rating'].isna().sum()
movie_count


In [None]:
# Number of distinct customers.
# Cust_Id also contains the movie header labels ("1:", "2:", ...), so the number
# of header rows is subtracted from the number of distinct values in the column
# (475257 - 4499 in the original run).
header_rows = df['Rating'].isna().sum()
cust_count = df['Cust_Id'].nunique() - header_rows
cust_count

In [None]:
# Total number of ratings given by customers: every non-null Cust_Id entry,
# minus the movie header rows (which carry no rating).
non_null_ids = df['Cust_Id'].count()
rating_count = non_null_ids - movie_count

In [None]:
# Horizontal bar chart of how many times each star value was given; the title
# summarises the movie / customer / rating totals computed above.
ax = stars.plot(kind='barh', legend=False, figsize=(15, 10))
ax.set_title(f"total movies:{movie_count}, total customers: {cust_count}, total ratings given: {rating_count}")
ax.grid(True)

In [None]:
# The movie ids live in the Cust_Id column, on the rows whose Rating is NaN.
# Collect the df index of every such header row.
is_header = df['Rating'].isna()   # boolean Series: True where Rating is NaN
null_ = is_header[is_header]      # keep only the True entries
null_ = null_.reset_index()       # the old index becomes a regular "index" column
null_.head()

In [None]:
# On header rows Cust_Id has the form "<movie_id>:"; removing the colon leaves the id.
header_labels = df.iloc[null_["index"]]["Cust_Id"]
movie_ids = header_labels.str.replace(":",'').reset_index(drop=True)  # fresh 0..n-1 index
movie_ids.head()

In [None]:
# Build movie_np: one movie id per row of df, so it can be attached as a column.
# null_["index"] holds the position of every movie header row. All rows from one
# header up to (but not including) the next header belong to that header's
# movie, and the last movie runs to the end of df.
#
# The earlier loop grew the array with np.append once per movie, which copies
# the whole array on every iteration, and it numbered the movies 1, 2, 3, ...
# instead of reading the id stored in the header row. np.repeat builds the
# array in a single pass from the ids actually present in the data.
header_pos = null_["index"].to_numpy()
# rows per movie (header row included); len(df) closes the last block
block_lengths = np.diff(np.append(header_pos, len(df)))
# "<movie_id>:" -> movie_id
header_ids = df["Cust_Id"].iloc[header_pos].str.replace(":", "").astype(int).to_numpy()
movie_np = np.repeat(header_ids, block_lengths)
print(f"movie_np:{movie_np}")
print(f"length of movie_np: {len(movie_np)}")

In [None]:
# Scratch check of np.full on a single block: with header positions j=0 and
# i=548, i-j-1 excludes the header row itself and yields a (1, 547) array of id 1.
i, j = 548, 0
movie_id = 1
temp = np.full((1, i - j - 1), movie_id)
print(temp)
print(temp.shape)

In [None]:
# movie_np is aligned row-by-row with df, so it has to be attached before any
# rows are removed. Afterwards the rows containing NaN (the movie header rows,
# whose Rating is missing) are dropped in place.
len_b4_drop = len(df)
df["movie_id"] = movie_np
df.dropna(inplace = True)
print(f"df length after dropping null values:{ len(df)}")

In [None]:
# Cast both id columns to int, then print the resulting schema.
for id_col in ("movie_id", "Cust_Id"):
    df[id_col] = df[id_col].astype(int)
df.info()

In [None]:
# Per-movie summary: how many ratings each movie received and their mean.
ratings_by_movie = df.groupby("movie_id")['Rating']
movie_rating_summary = ratings_by_movie.agg(['count', 'mean'])
movie_rating_summary
# Values seen in the original run: movie 1 has 547 ratings averaging 3.74,
# movie 2 has 145 ratings averaging 3.55, and so on.

In [None]:
# Coerce the movie_id index labels of the summary to Python ints.
movie_rating_summary.index=movie_rating_summary.index.map(int)

In [None]:
# Row count for every (movie_id, star rating) pair, i.e. how many customers gave
# each star value to each movie.
# Author's observation from the original run: movie 1 got 5 stars from 145 customers.
df.groupby(["movie_id", 'Rating']).agg(['count'])


In [None]:
# Popularity threshold for movies: the 70th percentile of the per-movie rating
# counts, rounded to a whole number. 70% of the movies have a rating count at or
# below this value.
count_q70 = movie_rating_summary["count"].quantile(0.7)
benchmark = round(count_q70, 0)
benchmark


In [None]:
# Movies that qualify for the recommender: those rated at least `benchmark`
# times, i.e. the more popular titles.
# The threshold comes from `benchmark` rather than a pasted literal, and the
# comparison is >= so that this view is exactly the complement of
# `dropped_movies`, which removes counts strictly below the benchmark.
movie_rating_summary[movie_rating_summary["count"] >= benchmark]

In [None]:
# Ids of the movies whose rating count is below the benchmark; these get removed.
below_benchmark = movie_rating_summary["count"] < benchmark
dropped_movies = movie_rating_summary.loc[below_benchmark].index
dropped_movies

In [None]:
# Number of ratings given by each customer; the result is indexed by Cust_Id.
cust_rating_summary = df.groupby("Cust_Id")[["Rating"]].count()
cust_rating_summary

In [None]:
# Activity threshold for customers: the 70th percentile of the per-customer
# rating counts, rounded to a whole number.
cust_q70 = cust_rating_summary["Rating"].quantile(0.7)
cust_benchmark = round(cust_q70, 0)
cust_benchmark

In [None]:
# Customers who gave fewer ratings than the customer benchmark are removed.
# The cut-off is read from `cust_benchmark` (52 in the original run) instead of
# a pasted literal, so it keeps following the data if the input changes.
dropped_customers = cust_rating_summary[cust_rating_summary["Rating"] < cust_benchmark].index
dropped_customers


In [None]:
# Keep only the rows whose movie is not in dropped_movies and whose customer is
# not in dropped_customers.
keep_movie = ~df["movie_id"].isin(dropped_movies)
keep_customer = ~df["Cust_Id"].isin(dropped_customers)
df = df[keep_movie & keep_customer]
print('After the triming, the shape is: {}'.format(df.shape))

In [None]:
# Peek at the first rows of df.
df.head()

In [None]:
# Customer x movie matrix of ratings: one row per Cust_Id, one column per movie_id.
df_p = df.pivot(index='Cust_Id', columns='movie_id', values='Rating')
df_p.shape

In [None]:
# Peek at the first rows of the customer x movie matrix.
df_p.head()

In [None]:
# load movie titles
# Each line is expected to be "movie_id,year,title". Reading the file with
# read_csv(..., on_bad_lines='skip') silently throws away every line that has
# more than three comma-separated fields -- presumably titles that themselves
# contain commas (TODO confirm against the file). To keep those movies, split
# each line on the first two commas only and treat the rest as the title.
with open("/content/drive/MyDrive/netflix/movie_titles.csv", encoding="ISO-8859-1") as titles_file:
    title_rows = [line.rstrip("\r\n").split(",", 2) for line in titles_file if line.strip()]
movie_titles = pd.DataFrame(title_rows, columns=['movie_id', 'year', 'title'])
movie_titles['movie_id'] = movie_titles['movie_id'].astype(int)
# non-numeric years (e.g. "NULL") become NaN, as read_csv would have produced
movie_titles['year'] = pd.to_numeric(movie_titles['year'], errors='coerce')
movie_titles.head()

In [None]:
# Use movie_id as the index of the titles table.
movie_titles = movie_titles.set_index("movie_id")

In [None]:
# Peek at the first rows of the titles table.
movie_titles.head()

In [None]:
!pip install scikit-surprise

In [None]:
import math
import re
from scipy.sparse import csr_matrix
import seaborn as sns
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

In [None]:
# Reader describing the rating scale; (1, 5) is Surprise's default, spelled out here.
reader = Reader(rating_scale=(1, 5))

In [None]:
# load dataframe into Dataset Object
# df is still ordered movie by movie, so taking the first 100000 rows would feed
# the model ratings for only the first few movies. Draw a reproducible random
# sample of (at most) the same size instead, so the ratings span all movies.
ratings_sample = df[['Cust_Id','movie_id','Rating']].sample(n=min(100000, len(df)), random_state=42)
data = Dataset.load_from_df(ratings_sample, reader)



In [None]:
# 3-fold cross-validation of an SVD model with default settings, reporting
# RMSE and MAE.
svd = SVD()
cross_validate(
    svd,
    data,
    measures=['RMSE','MAE'],
    cv=3,
    verbose=True,
)

In [None]:
# Build a training set from all ratings held in `data` (no hold-out split).
trainset=data.build_full_trainset()


In [None]:

# Fit the SVD model on the full training set; it is used for predictions below.
svd.fit(trainset)

In [None]:
# recommend movies for customer id 712664
# Start from the movies this customer rated 5 stars, indexed by movie_id.
is_customer = df['Cust_Id'] == 712664
is_five_star = df['Rating'] == 5
df_712664 = df[is_customer & is_five_star].set_index('movie_id')
df_712664.head()

In [None]:
# Attach titles: both frames are indexed by movie_id, so an index join lines
# them up. Only the title and the rating are kept.
with_titles = df_712664.join(movie_titles)
df_712664 = with_titles[['title','Rating']]
df_712664

In [None]:
# Candidate list for this customer: a separate copy of the titles table with
# movie_id turned back into a regular column.
user_712664 = movie_titles.reset_index()
user_712664


In [None]:
# Remove the movies that did not satisfy the benchmark criteria.
is_dropped = user_712664['movie_id'].isin(dropped_movies)
user_712664 = user_712664[~is_dropped]
user_712664

In [None]:
# Predicted rating of customer 712664 for every remaining movie.
# user_712664 is the result of boolean filtering, so writing a new column into
# it with item assignment raises pandas' SettingWithCopyWarning; .assign()
# returns a new frame with the extra column instead.
user_712664 = user_712664.assign(
    estimate_score=user_712664['movie_id'].apply(lambda x: svd.predict(712664, x).est)
)
user_712664

In [None]:
# Order the candidates so the highest predicted scores come first.
user_712664 = user_712664.sort_values(by='estimate_score', ascending=False)
user_712664

In [None]:
# top 10 movies recommended by the system for user 712664
# Rank by predicted score, highest first. Sorting by movie_id before head(10)
# would list the ten smallest movie ids rather than the best recommendations.
user_712664.sort_values('estimate_score', ascending = False).head(10)

In [None]:
# Turn the movie_id index back into a regular column so it can be used in isin().
df_712664 = df_712664.reset_index()

In [None]:

# Predicted scores for the movies that appear in df_712664 (the customer's
# 5-star movies), ordered by movie_id.
in_five_star_list = user_712664['movie_id'].isin(df_712664['movie_id'])
user_712664[in_five_star_list].sort_values('movie_id')

In [None]:
# Display the movie_id column of df_712664.
df_712664['movie_id']