# **Collaborative Filtering Using Non-Negative Matrix Factorization**

****we'll create 3 dataframes ratingMatrix_df1, ratingMatrix_df2 and ratingMatrix****

In [1]:
import numpy as np
import pandas as pd

# **1. CREATE: ratingMatrix_df1**

In [2]:
# preprocessing netflix dataset

rating = pd.read_csv('Netflix_Dataset_Rating.csv')
movie = pd.read_csv('Netflix_Dataset_Movie.csv')

In [3]:
movie

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [4]:
# nothing to remove in rating df
rating

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3
...,...,...,...
17337453,520675,3,4496
17337454,1055714,5,4496
17337455,2643029,4,4496
17337456,1559566,3,4496


In [5]:
# remove movies which are released < 2000 and make computations easier & practical
print('shape of movie df BEFORE removing redundant movies: ', movie.shape)

shape of movie df BEFORE removing redundant movies:  (17770, 3)


In [6]:
movie = movie[movie['Year'] >= 2000]
print('shape of movie df AFTER removing redundant movies: ', movie.shape)

shape of movie df AFTER removing redundant movies:  (6947, 3)


In [7]:
movie

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
4,5,2004,The Rise and Fall of ECW
7,8,2004,What the #$*! Do We Know!?
9,10,2001,Fighter
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [8]:
# merge movie & rating df to form ratingMatrix_df1

ratingMatrix_df1 = pd.merge(movie, rating)

In [9]:
ratingMatrix_df1

Unnamed: 0,Movie_ID,Year,Name,User_ID,Rating
0,8,2004,What the #$*! Do We Know!?,824097,2
1,8,2004,What the #$*! Do We Know!?,785314,1
2,8,2004,What the #$*! Do We Know!?,243963,3
3,8,2004,What the #$*! Do We Know!?,1447783,4
4,8,2004,What the #$*! Do We Know!?,1912665,1
...,...,...,...,...,...
7817187,4493,2003,Ju-on: The Grudge,666284,4
7817188,4493,2003,Ju-on: The Grudge,617970,4
7817189,4493,2003,Ju-on: The Grudge,2398437,2
7817190,4493,2003,Ju-on: The Grudge,1362728,3


In [10]:
# drop 'Movie_ID' & 'Year'

ratingMatrix_df1 = ratingMatrix_df1[['User_ID', 'Name', 'Rating']]
ratingMatrix_df1

Unnamed: 0,User_ID,Name,Rating
0,824097,What the #$*! Do We Know!?,2
1,785314,What the #$*! Do We Know!?,1
2,243963,What the #$*! Do We Know!?,3
3,1447783,What the #$*! Do We Know!?,4
4,1912665,What the #$*! Do We Know!?,1
...,...,...,...
7817187,666284,Ju-on: The Grudge,4
7817188,617970,Ju-on: The Grudge,4
7817189,2398437,Ju-on: The Grudge,2
7817190,1362728,Ju-on: The Grudge,3


In [11]:
# rename 'Name' -> 'Movie_ID'

ratingMatrix_df1 = ratingMatrix_df1.rename(columns={'Name': 'Movie_ID'})
ratingMatrix_df1

Unnamed: 0,User_ID,Movie_ID,Rating
0,824097,What the #$*! Do We Know!?,2
1,785314,What the #$*! Do We Know!?,1
2,243963,What the #$*! Do We Know!?,3
3,1447783,What the #$*! Do We Know!?,4
4,1912665,What the #$*! Do We Know!?,1
...,...,...,...
7817187,666284,Ju-on: The Grudge,4
7817188,617970,Ju-on: The Grudge,4
7817189,2398437,Ju-on: The Grudge,2
7817190,1362728,Ju-on: The Grudge,3


In [12]:
# drop duplicate 'Movie_ID'

ratingMatrix_df1.drop_duplicates(subset ="Movie_ID",keep = 'first', inplace = True)

In [13]:
ratingMatrix_df1

Unnamed: 0,User_ID,Movie_ID,Rating
0,824097,What the #$*! Do We Know!?,2
9379,2187374,7 Seconds,4
13544,712664,Never Die Alone,3
17806,1392773,Lilo and Stitch,4
50203,2473170,Something's Gotta Give,5
...,...,...,...
7702977,185150,Love Actually,3
7769434,364518,Wonder Boys,3
7797431,998229,Ned Kelly,4
7804938,66568,Club Dread,2


In [14]:
# serialize 'User_ID'

ratingMatrix_df1['User_ID'] = np.arange(len(ratingMatrix_df1))
ratingMatrix_df1

Unnamed: 0,User_ID,Movie_ID,Rating
0,0,What the #$*! Do We Know!?,2
9379,1,7 Seconds,4
13544,2,Never Die Alone,3
17806,3,Lilo and Stitch,4
50203,4,Something's Gotta Give,5
...,...,...,...
7702977,464,Love Actually,3
7769434,465,Wonder Boys,3
7797431,466,Ned Kelly,4
7804938,467,Club Dread,2


In [15]:
print("shape of ratingMatrix_df1: ", ratingMatrix_df1.shape)

shape of ratingMatrix_df1:  (469, 3)


# **2. CREATE: ratingMatrix_df2**

In [16]:
# preprocessing tmdb dataset

tmdb_credit = pd.read_csv('tmdb_5000_credits.csv')
tmdb_movie = pd.read_csv('tmdb_5000_movies.csv')

In [46]:
tmdb_movie.shape

(4803, 20)

In [45]:
tmdb_movie.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [18]:
tmdb_credit

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...
4798,9367,El Mariachi,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4799,72766,Newlyweds,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4800,231617,"Signed, Sealed, Delivered","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4801,126186,Shanghai Calling,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


In [19]:
# merge tmdb_movie & tmdb_credit df to form ratingMatrix_df2

ratingMatrix_df2 = pd.merge(tmdb_movie, tmdb_credit)

In [47]:
ratingMatrix_df2.head(2)

Unnamed: 0,User_ID,Movie_ID
0,0,Avatar
1,1,Pirates of the Caribbean: At World's End


In [21]:
ratingMatrix_df2.shape

(4809, 23)

In [22]:
# only keep 'movie_id' and 'title'

ratingMatrix_df2 = ratingMatrix_df2[['movie_id', 'title']]

In [23]:
ratingMatrix_df2

Unnamed: 0,movie_id,title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter
...,...,...
4804,9367,El Mariachi
4805,72766,Newlyweds
4806,231617,"Signed, Sealed, Delivered"
4807,126186,Shanghai Calling


In [24]:
# change 'movie_id' -> 'User_ID' 
# change 'title -> 'Movie_ID'

ratingMatrix_df2 = ratingMatrix_df2.rename(columns={'title': 'Movie_ID'})
ratingMatrix_df2 = ratingMatrix_df2.rename(columns={'movie_id': 'User_ID'})

In [25]:
ratingMatrix_df2

Unnamed: 0,User_ID,Movie_ID
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter
...,...,...
4804,9367,El Mariachi
4805,72766,Newlyweds
4806,231617,"Signed, Sealed, Delivered"
4807,126186,Shanghai Calling


In [26]:
# drop duplicates from 'Movie_ID'

ratingMatrix_df2.drop_duplicates(subset ="Movie_ID",keep = 'first', inplace = True)

In [27]:
ratingMatrix_df2

Unnamed: 0,User_ID,Movie_ID
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter
...,...,...
4804,9367,El Mariachi
4805,72766,Newlyweds
4806,231617,"Signed, Sealed, Delivered"
4807,126186,Shanghai Calling


In [28]:
# serialize 'User_ID'

ratingMatrix_df2['User_ID'] = np.arange(len(ratingMatrix_df2))
ratingMatrix_df2

Unnamed: 0,User_ID,Movie_ID
0,0,Avatar
1,1,Pirates of the Caribbean: At World's End
2,2,Spectre
3,3,The Dark Knight Rises
4,4,John Carter
...,...,...
4804,4795,El Mariachi
4805,4796,Newlyweds
4806,4797,"Signed, Sealed, Delivered"
4807,4798,Shanghai Calling


In [29]:
print("shape of ratingMatrix_df2: ", ratingMatrix_df2.shape)

shape of ratingMatrix_df2:  (4800, 2)


# ****3. Review of ratingMatrix_df1 & ratingMatrix_df2****

In [30]:
ratingMatrix_df1

Unnamed: 0,User_ID,Movie_ID,Rating
0,0,What the #$*! Do We Know!?,2
9379,1,7 Seconds,4
13544,2,Never Die Alone,3
17806,3,Lilo and Stitch,4
50203,4,Something's Gotta Give,5
...,...,...,...
7702977,464,Love Actually,3
7769434,465,Wonder Boys,3
7797431,466,Ned Kelly,4
7804938,467,Club Dread,2


In [31]:
ratingMatrix_df2

Unnamed: 0,User_ID,Movie_ID
0,0,Avatar
1,1,Pirates of the Caribbean: At World's End
2,2,Spectre
3,3,The Dark Knight Rises
4,4,John Carter
...,...,...
4804,4795,El Mariachi
4805,4796,Newlyweds
4806,4797,"Signed, Sealed, Delivered"
4807,4798,Shanghai Calling


In [32]:
print("shape of ratingMatrix_df1: ", ratingMatrix_df1.shape)
print("shape of ratingMatrix_df2: ", ratingMatrix_df2.shape)

shape of ratingMatrix_df1:  (469, 3)
shape of ratingMatrix_df2:  (4800, 2)


# **4. merge ratingMatrix_df1 & ratingMatrix_df2 to form ratingMatrix_df**

In [33]:
ratingMatrix_df = pd.merge(ratingMatrix_df1, ratingMatrix_df2, how='outer')

In [34]:
# fill all NaN 'Rating' values with a dummy value 1

ratingMatrix_df.fillna(value=1, inplace=True)
ratingMatrix_df

Unnamed: 0,User_ID,Movie_ID,Rating
0,0,What the #$*! Do We Know!?,2.0
1,1,7 Seconds,4.0
2,2,Never Die Alone,3.0
3,3,Lilo and Stitch,4.0
4,4,Something's Gotta Give,5.0
...,...,...,...
5264,4795,El Mariachi,1.0
5265,4796,Newlyweds,1.0
5266,4797,"Signed, Sealed, Delivered",1.0
5267,4798,Shanghai Calling,1.0


In [35]:
ratingMatrix_df = pd.pivot_table(ratingMatrix_df, index='User_ID', columns='Movie_ID')

In [36]:
ratingMatrix_df

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Movie_ID,#Horror,(500) Days of Summer,10 Cloverfield Lane,10 Days in a Madhouse,10 Things I Hate About You,101 Dalmatians II: Patch's London Adventure,102 Dalmatians,10th & Wolf,11:14,12 Angry Men,...,Zoolander,Zoolander 2,Zoom,Zulu,[REC],[REC]²,eXistenZ,xXx,xXx: State of the Union,Æon Flux
User_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,,,,,,,,,,,...,,,,,,,,,,
4796,,,,,,,,,,,...,,,,,,,,,,
4797,,,,,,,,,,,...,,,,,,,,,,
4798,,,,,,,,,,,...,,,,,,,,,,


In [37]:
print("shape of ratingMatrix_df: ", ratingMatrix_df.shape)

shape of ratingMatrix_df:  (4800, 5060)


# **5. Make ratingMatrix_df sparse**

In [38]:
# use loc and iloc to select rows/cols
# ratingMatrix_df.iloc[3][3] = 3
# df -> numpy array & vice versa
# array = ratingMatrix_df.to_numpy()
# array[0][1] = 1
# sample_df = pd.DataFrame(array)

In [39]:
# create a matrix of same shape as ratingMatrix_df of range -> [-18, 6]

m = np.random.randint(low=-18, high=6, size=(4800, 5060), dtype=int)

In [40]:
# lets check the format of ratingMatrix_df's columns

ratingMatrix_df.columns

MultiIndex([('Rating',                                     '#Horror'),
            ('Rating',                        '(500) Days of Summer'),
            ('Rating',                         '10 Cloverfield Lane'),
            ('Rating',                       '10 Days in a Madhouse'),
            ('Rating',                  '10 Things I Hate About You'),
            ('Rating', '101 Dalmatians II: Patch's London Adventure'),
            ('Rating',                              '102 Dalmatians'),
            ('Rating',                                 '10th & Wolf'),
            ('Rating',                                       '11:14'),
            ('Rating',                                '12 Angry Men'),
            ...
            ('Rating',                                   'Zoolander'),
            ('Rating',                                 'Zoolander 2'),
            ('Rating',                                        'Zoom'),
            ('Rating',                                       

In [41]:
# create a list containing all movie names

movieList=[]
for i in ratingMatrix_df.columns:
    movieList.append(i[1])

In [42]:
# store the values in ratingMatrix_df

ratingMatrix_df = pd.DataFrame(m, columns=movieList)

In [43]:
# make it sparse
ratingMatrix_df[ratingMatrix_df < 1] = 0

In [44]:
ratingMatrix_df

Unnamed: 0,#Horror,(500) Days of Summer,10 Cloverfield Lane,10 Days in a Madhouse,10 Things I Hate About You,101 Dalmatians II: Patch's London Adventure,102 Dalmatians,10th & Wolf,11:14,12 Angry Men,...,Zoolander,Zoolander 2,Zoom,Zulu,[REC],[REC]²,eXistenZ,xXx,xXx: State of the Union,Æon Flux
0,0,0,3,0,2,0,0,0,0,0,...,0,0,0,0,0,2,0,2,1,0
1,5,0,0,0,0,3,0,0,5,0,...,0,1,0,0,0,0,0,5,0,0
2,4,0,0,0,0,0,0,0,2,4,...,0,0,3,0,0,0,0,0,0,0
3,0,3,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,2,0,0
4,0,5,0,0,0,0,0,3,0,0,...,0,0,0,1,0,0,3,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5,0
4796,0,0,4,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4797,3,1,5,0,0,0,4,0,0,0,...,0,0,0,0,0,5,0,3,0,4
4798,0,3,0,2,0,0,0,0,2,0,...,0,0,0,0,0,0,0,5,0,0
