# Movie Recommender

http://nbviewer.jupyter.org/github/khanhnamle1994/movielens/blob/master/Content_Based_and_Collaborative_Filtering_Models.ipynb

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

%matplotlib inline

In [2]:
sns.set_style('whitegrid')
sns.set_context('talk')

In [3]:
ratings_df = pd.read_csv('../../Demos/data/ml-1m/ratings.dat',
                         sep='::',
                         names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
                         skipinitialspace=True)

users_df = pd.read_csv('../../Demos/data/ml-1m/users.dat',
                       sep='::',
                       names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
                       skipinitialspace=True)

movies_df = pd.read_csv('../../Demos/data/ml-1m/movies.dat',
                        sep='::',
                        names=['MovieID', 'Title', 'Genres'],
                        skipinitialspace=True)

print("Rating_DF")
print("---------")
print("Shape = {}".format(ratings_df.shape))
display(ratings_df.head(10))
print('\n')

print("Users_DF")
print("--------")
print("Shape = {}".format(users_df.shape))
display(users_df.head(10))
print('\n')

print("Movies_DF")
print("---------")
print("Shape = {}".format(movies_df.shape))
display(movies_df.head(10))

  after removing the cwd from sys.path.


Rating_DF
---------
Shape = (1000209, 4)


  if __name__ == '__main__':
  


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368




Users_DF
--------
Shape = (6040, 5)


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,6810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370




Movies_DF
---------
Shape = (3883, 3)


Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# Added new columns and fixed Genres
movies_df['Year'] = movies_df['Title'].apply(lambda x: x[x.rfind('(') + 1: x.rfind(')')])
movies_df['Title'] = movies_df['Title'].apply(lambda x: x[:-6].strip())
movies_df['Genres'] = movies_df['Genres'].str.split('|')  # Break up the big genre string into a string array
movies_df['Genres'] = movies_df['Genres'].fillna('').astype('str')  # Convert genres to string value

print(movies_df.shape)
display(movies_df.head(10))

(3883, 4)


Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story,"['Animation', ""Children's"", 'Comedy']",1995
1,2,Jumanji,"['Adventure', ""Children's"", 'Fantasy']",1995
2,3,Grumpier Old Men,"['Comedy', 'Romance']",1995
3,4,Waiting to Exhale,"['Comedy', 'Drama']",1995
4,5,Father of the Bride Part II,['Comedy'],1995
5,6,Heat,"['Action', 'Crime', 'Thriller']",1995
6,7,Sabrina,"['Comedy', 'Romance']",1995
7,8,Tom and Huck,"['Adventure', ""Children's""]",1995
8,9,Sudden Death,['Action'],1995
9,10,GoldenEye,"['Action', 'Adventure', 'Thriller']",1995


In [5]:
print("Rating_DF")
print("---------")
display(ratings_df.dtypes)
print('\n')

print("Users_DF")
print("--------")
display(users_df.dtypes)
print('\n')

print("Movies_DF")
print("---------")
display(movies_df.dtypes)

Rating_DF
---------


UserID       int64
MovieID      int64
Rating       int64
Timestamp    int64
dtype: object



Users_DF
--------


UserID         int64
Gender        object
Age            int64
Occupation     int64
Zip-code      object
dtype: object



Movies_DF
---------


MovieID     int64
Title      object
Genres     object
Year       object
dtype: object

In [6]:
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0,
    stop_words='english'
)

tfidf_matrix = tfidf_vect.fit_transform(movies_df['Genres'])

print("Shape TfIdf Matrix = {}".format(tfidf_matrix.shape))

Shape TfIdf Matrix = (3883, 127)


In [8]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print("Shape Cosine_Sim = {}".format(cosine_sim.shape))
display(cosine_sim[:4, :4])

Shape Cosine_Sim = (3883, 3883)


array([[1.        , 0.14193614, 0.09010857, 0.1056164 ],
       [0.14193614, 1.        , 0.        , 0.        ],
       [0.09010857, 0.        , 1.        , 0.1719888 ],
       [0.1056164 , 0.        , 0.1719888 , 1.        ]])

In [9]:
# Build a 1-dimensional array with movie titles
titles = movies_df['Title']
indices = pd.Series(movies_df.index, index=movies_df['Title'])

display(indices.head(10))

Title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [10]:
def genre_recommendations(title):
    """
    Get movie recommendations based on the
    cosine similarity score of movie genres.
    """
    idx = indices[title]
    # print(idx)
    sim_scores = list(enumerate(cosine_sim[idx]))
    # print(sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1: 21]
    movie_indices = [i[0] for i in sim_scores]

    return titles.iloc[movie_indices]

In [11]:
genre_recommendations('Toy Story').head(20)

1050                      Aladdin and the King of Thieves
2072                                    American Tail, An
2073                  American Tail: Fievel Goes West, An
2285                                   Rugrats Movie, The
2286                                        Bug's Life, A
3045                                          Toy Story 2
3542                                       Saludos Amigos
3682                                          Chicken Run
3685              Adventures of Rocky and Bullwinkle, The
236                                        Goofy Movie, A
12                                                  Balto
241                                      Gumby: The Movie
310                                    Swan Princess, The
592                                             Pinocchio
612                                       Aristocats, The
700                                      Oliver & Company
876     Land Before Time III: The Time of the Great Gi...
1010          

In [12]:
genre_recommendations('Good Will Hunting').head(20)

25                                           Othello
26                                      Now and Then
29     Shanghai Triad (Yao a yao yao dao waipo qiao)
30                                   Dangerous Minds
35                                  Dead Man Walking
39                          Cry, the Beloved Country
42                                       Restoration
52                                          Lamerica
54                                           Georgia
56                             Home for the Holidays
61                                Mr. Holland's Opus
66                                          Two Bits
77                               Crossing Guard, The
79             White Balloon, The (Badkonake Sefid )
81                          Antonia's Line (Antonia)
82          Once Upon a Time... When We Were Colored
89                       Journey of August King, The
92                                   Beautiful Girls
95                                  Hate (Hain

In [13]:
genre_recommendations('Saving Private Ryan').head(20)

461            Heaven & Earth
1204        Full Metal Jacket
1214     Boat, The (Das Boot)
1222                    Glory
1545                G.I. Jane
1959      Saving Private Ryan
2358       Thin Red Line, The
2993         Longest Day, The
3559            Flying Tigers
3574    Fighting Seabees, The
3585    Guns of Navarone, The
3684             Patriot, The
40                Richard III
153            Beyond Rangoon
332         Walking Dead, The
523          Schindler's List
641        Courage Under Fire
967          Nothing Personal
979           Michael Collins
1074                  Platoon
Name: Title, dtype: object