In [1]:
#Description : Build a movie recommendation engine using python

In [3]:
#import the libraries
import pandas as  pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
from google.colab import files
uploaded =files.upload()

Saving IMDB-Movie-Data.csv to IMDB-Movie-Data.csv


In [44]:
df=pd.read_csv('IMDB-Movie-Data.csv')
df['Movie_id']=df.index
features=['Title','Description','Director','Actors','Rating','Votes','Movie_id']

In [45]:
df.shape

(1000, 13)

In [46]:
#check for any missing values in important columns
df[features].isnull().values.any()

False

In [47]:
#create a function to combine the values of the important columns into a single string
def get_important_features(data):
  important_fea=[]
  for i in range(0,data.shape[0]):
    important_fea.append(data['Actors'][i]+' '+data['Director'][i]+' '+data['Description'][i]+' '+data['Title'])

  return important_fea

In [48]:
#Create a column to hold the combined features
df['Important_f']=get_important_features(df)[1]

#show the data
df.head(3)
df['Important_f']

0      Noomi Rapace, Logan Marshall-Green, Michael Fa...
1      Noomi Rapace, Logan Marshall-Green, Michael Fa...
2      Noomi Rapace, Logan Marshall-Green, Michael Fa...
3      Noomi Rapace, Logan Marshall-Green, Michael Fa...
4      Noomi Rapace, Logan Marshall-Green, Michael Fa...
                             ...                        
995    Noomi Rapace, Logan Marshall-Green, Michael Fa...
996    Noomi Rapace, Logan Marshall-Green, Michael Fa...
997    Noomi Rapace, Logan Marshall-Green, Michael Fa...
998    Noomi Rapace, Logan Marshall-Green, Michael Fa...
999    Noomi Rapace, Logan Marshall-Green, Michael Fa...
Name: Important_f, Length: 1000, dtype: object

In [49]:
#  convert the text to a matrix to token counts
cm=CountVectorizer().fit_transform(df['Important_f'])

In [50]:
#Get the cosine similarity matrix from the count matrix
cs=cosine_similarity(cm)
#Print the cosine similarity matrix
print(cs)

[[1.         0.93895296 0.93895296 ... 0.92710507 0.9258201  0.9258201 ]
 [0.93895296 1.         0.97142857 ... 0.93541435 0.95784149 0.95784149]
 [0.93895296 0.97142857 1.         ... 0.93541435 0.95784149 0.95784149]
 ...
 [0.92710507 0.93541435 0.93541435 ... 1.         0.92233098 0.92233098]
 [0.9258201  0.95784149 0.95784149 ... 0.92233098 1.         0.94444444]
 [0.9258201  0.95784149 0.95784149 ... 0.92233098 0.94444444 1.        ]]


In [51]:
#Get the shape of the cosine similarity matrix
cs.shape

(1000, 1000)

In [56]:
#Get the title of the movie that the user likes
title='The Amazing Spider-Man'
movie_id=df[df.Title==title]['Movie_id'].values[0]
print(movie_id)

368


In [59]:
#Create a list of enumerations for the similarity score [(movie_id,similarity_score),(),....]
scores=list(enumerate(cs[movie_id]))

In [61]:
scores

[(0, 0.9271050693011068),
 (1, 0.9354143466934851),
 (2, 0.9354143466934851),
 (3, 0.9354143466934851),
 (4, 0.9223309842157772),
 (5, 0.936783914552392),
 (6, 0.8861469461982086),
 (7, 0.9354143466934851),
 (8, 0.9271050693011068),
 (9, 0.9354143466934851),
 (10, 0.8680370799067416),
 (11, 0.9223309842157772),
 (12, 0.9223309842157772),
 (13, 0.9354143466934851),
 (14, 0.9354143466934851),
 (15, 0.9162613621237827),
 (16, 0.9223309842157772),
 (17, 0.9223309842157772),
 (18, 0.9354143466934851),
 (19, 0.9354143466934851),
 (20, 0.9354143466934851),
 (21, 0.9250000000000003),
 (22, 0.9114654303753004),
 (23, 0.9354143466934851),
 (24, 0.9097816785925632),
 (25, 0.9097816785925632),
 (26, 0.936783914552392),
 (27, 0.9223309842157772),
 (28, 0.9223309842157772),
 (29, 0.9223309842157772),
 (30, 0.9223309842157772),
 (31, 0.9223309842157772),
 (32, 0.9223309842157772),
 (33, 0.9354143466934851),
 (34, 0.9136498769008702),
 (35, 0.8977310580745104),
 (36, 0.9354143466934851),
 (37, 0.92233

In [64]:
sorted_scores=sorted(scores,key = lambda x:x[1],reverse=True)
sorted_scores[1:]

[(368, 1.0000000000000002),
 (344, 0.9750356118852501),
 (104, 0.9621023987294837),
 (593, 0.9621023987294837),
 (728, 0.9500000000000003),
 (816, 0.9490707529566212),
 (43, 0.949029975678768),
 (64, 0.949029975678768),
 (70, 0.949029975678768),
 (76, 0.949029975678768),
 (89, 0.949029975678768),
 (92, 0.949029975678768),
 (97, 0.949029975678768),
 (99, 0.949029975678768),
 (102, 0.949029975678768),
 (106, 0.949029975678768),
 (129, 0.949029975678768),
 (132, 0.949029975678768),
 (150, 0.949029975678768),
 (178, 0.949029975678768),
 (226, 0.949029975678768),
 (239, 0.949029975678768),
 (249, 0.949029975678768),
 (254, 0.949029975678768),
 (255, 0.949029975678768),
 (293, 0.949029975678768),
 (295, 0.949029975678768),
 (298, 0.949029975678768),
 (300, 0.949029975678768),
 (312, 0.949029975678768),
 (317, 0.949029975678768),
 (318, 0.949029975678768),
 (352, 0.949029975678768),
 (379, 0.949029975678768),
 (388, 0.949029975678768),
 (391, 0.949029975678768),
 (405, 0.949029975678768),
 (4

In [67]:
#Create a loop to print the first 7 similar movies
j=0
print ('The 7 most recommended movies to', title ,'are:\n')
for item in sorted_scores:
  movie_title=df[df.Movie_id==item[0]]['Title'].values[0]
  print(j+1,movie_title)
  j+=1
  if j>6:
    break

The 7 most recommended movies to The Amazing Spider-Man are:

1 The Amazing Spider-Man 2
2 The Amazing Spider-Man
3 Spider-Man 3
4 The Man from U.N.C.L.E.
5 She's the Man
6 The A-Team
7 I.T.
