# **Collaborative Filtering Using Non-Negative Matrix Factorization**

****we'll create 3 dataframes ratingMatrix_df1, ratingMatrix_df2 and ratingMatrix****

In [1]:
import numpy as np
import pandas as pd

# **1. CREATE: ratingMatrix_df1**

In [3]:
# preprocessing netflix dataset

movie = pd.read_csv('netflix-movie-rating-dataset/Netflix_Dataset_Movie.csv')
rating = pd.read_csv('netflix-movie-rating-dataset/Netflix_Dataset_Rating.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'netflix-movie-rating-dataset/Netflix_Dataset_Movie.csv'

In [None]:
movie

In [None]:
# nothing to remove in rating df
rating

In [None]:
# remove movies which are released < 2000 and make computations easier & practical
print('shape of movie df BEFORE removing redundant movies: ', movie.shape)

In [None]:
movie = movie[movie['Year'] >= 2000]
print('shape of movie df AFTER removing redundant movies: ', movie.shape)

In [None]:
movie

In [None]:
# merge movie & rating df to form ratingMatrix_df1

ratingMatrix_df1 = pd.merge(movie, rating)

In [None]:
ratingMatrix_df1

In [None]:
# drop 'Movie_ID' & 'Year'

ratingMatrix_df1 = ratingMatrix_df1[['User_ID', 'Name', 'Rating']]
ratingMatrix_df1

In [None]:
# rename 'Name' -> 'Movie_ID'

ratingMatrix_df1 = ratingMatrix_df1.rename(columns={'Name': 'Movie_ID'})
ratingMatrix_df1

In [None]:
# drop duplicate 'Movie_ID'

ratingMatrix_df1.drop_duplicates(subset ="Movie_ID",keep = 'first', inplace = True)

In [None]:
ratingMatrix_df1

In [None]:
# serialize 'User_ID'

ratingMatrix_df1['User_ID'] = np.arange(len(ratingMatrix_df1))
ratingMatrix_df1

In [None]:
print("shape of ratingMatrix_df1: ", ratingMatrix_df1.shape)

# **2. CREATE: ratingMatrix_df2**

In [None]:
# preprocessing tmdb dataset

tmdb_credit = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
tmdb_movie = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

In [None]:
tmdb_movie

In [None]:
tmdb_credit

In [None]:
# merge tmdb_movie & tmdb_credit df to form ratingMatrix_df2

ratingMatrix_df2 = pd.merge(tmdb_movie, tmdb_credit)

In [None]:
ratingMatrix_df2.head(3)

In [None]:
ratingMatrix_df2.shape

In [None]:
# only keep 'movie_id' and 'title'

ratingMatrix_df2 = ratingMatrix_df2[['movie_id', 'title']]

In [None]:
ratingMatrix_df2

In [None]:
# change 'movie_id' -> 'User_ID' 
# change 'title -> 'Movie_ID'

ratingMatrix_df2 = ratingMatrix_df2.rename(columns={'title': 'Movie_ID'})
ratingMatrix_df2 = ratingMatrix_df2.rename(columns={'movie_id': 'User_ID'})

In [None]:
ratingMatrix_df2

In [None]:
# drop duplicates from 'Movie_ID'

ratingMatrix_df2.drop_duplicates(subset ="Movie_ID",keep = 'first', inplace = True)

In [None]:
ratingMatrix_df2

In [None]:
# serialize 'User_ID'

ratingMatrix_df2['User_ID'] = np.arange(len(ratingMatrix_df2))
ratingMatrix_df2

In [None]:
print("shape of ratingMatrix_df2: ", ratingMatrix_df2.shape)

# ****3. Review of ratingMatrix_df1 & ratingMatrix_df2****

In [None]:
ratingMatrix_df1

In [None]:
ratingMatrix_df2

In [None]:
print("shape of ratingMatrix_df1: ", ratingMatrix_df1.shape)
print("shape of ratingMatrix_df2: ", ratingMatrix_df2.shape)

# **4. merge ratingMatrix_df1 & ratingMatrix_df2 to form ratingMatrix_df**

In [None]:
ratingMatrix_df = pd.merge(ratingMatrix_df1, ratingMatrix_df2, how='outer')

In [None]:
# fill all NaN 'Rating' values with a dummy value 1

ratingMatrix_df.fillna(value=1, inplace=True)
ratingMatrix_df

In [None]:
ratingMatrix_df = pd.pivot_table(ratingMatrix_df, index='User_ID', columns='Movie_ID')

In [None]:
ratingMatrix_df

In [None]:
print("shape of ratingMatrix_df: ", ratingMatrix_df.shape)

# **5. Make ratingMatrix_df sparse**

In [None]:
# use loc and iloc to select rows/cols
# ratingMatrix_df.iloc[3][3] = 3
# df -> numpy array & vice versa
# array = ratingMatrix_df.to_numpy()
# array[0][1] = 1
# sample_df = pd.DataFrame(array)

In [None]:
# create a matrix of same shape as ratingMatrix_df of range -> [-18, 6]

m = np.random.randint(low=-18, high=6, size=(4800, 5060), dtype=int)

In [None]:
# lets check the format of ratingMatrix_df's columns

ratingMatrix_df.columns

In [None]:
# create a list containing all movie names

movieList=[]
for i in ratingMatrix_df.columns:
    movieList.append(i[1])

In [None]:
# store the values in ratingMatrix_df

ratingMatrix_df = pd.DataFrame(m, columns=movieList)

In [None]:
# make it sparse
ratingMatrix_df[ratingMatrix_df < 1] = 0

In [None]:
ratingMatrix_df