In [67]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


In [68]:
#Load the data

data = pd.read_csv('C:/Users/User/Desktop/Projects/Movie recommender/IMDB-Movie-Data.csv')
data.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [69]:
#count of the number of rows/movies in the data set and number of columns
data.shape

(1000, 12)

In [70]:
#Create a list of important columns for the recommendation engine
columns = ['Actors' , 'Director' , 'Genre', 'Title']
#Show data
data[columns].head()

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split
3,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet,"Animation,Comedy,Family",Sing
4,"Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer,"Action,Adventure,Fantasy",Suicide Squad


In [71]:
# Check for missing value
data[columns].isna().sum()

Actors      0
Director    0
Genre       0
Title       0
dtype: int64

In [72]:
print(data['Actors'][0]+' '+data['Director'][0]+ ' '+data['Genre'][0])

Chris Pratt, Vin Diesel, Bradley Cooper, Zoe Saldana James Gunn Action,Adventure,Sci-Fi


In [73]:
#Create a function to combine the values of important into a single string

def get_important_features(data):
    important_features = []
    for i in range(0, data.shape[0]):
        important_features.append(data['Actors'][i]+' '+data['Director'][i]+ ' '+data['Genre'][i]+
                                 ' '+data['Title'][i])
    return important_features


In [74]:
#Create a column to hold the combined strings

data['important_features'] = get_important_features(data)

# first 5 rows of data
data.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,"Matthew McConaughey,Reese Witherspoon, Seth Ma..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,"Will Smith, Jared Leto, Margot Robbie, Viola D..."


In [75]:
#Convert the text to a matrix of token counts
cm = CountVectorizer().fit_transform(data['important_features'])

### Cosine Similarity
Cosine Similarity is a measurement that quantifies the similarity between two or more vectors, commonly used as a similarity measurement technique.

The cosine similarity is the cosine of the angle between vectors.TIt is described mathematically as the division between the dot product of vectors and the product of the euclidean norms or magnitude of each vector.

Further understanding here: https://towardsdatascience.com/understanding-cosine-similarity-and-its-application-fd42f585296a

In [76]:
#Get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)
#Print cosine similarity matrix
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


- Each column and row is a movie observation
- Matrix contains values from 0 to 1. 
- 1 indicates perfect similarity to movie. (most likely same movie)

In [77]:
# shape of cosine similarity matrix
cs.shape

(1000, 1000)

In [78]:
#Create running index movie_id in dataset as a identifier
data['Movie_id'] = range(0,1000)
data.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,important_features,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,"Will Smith, Jared Leto, Margot Robbie, Viola D...",4


In [79]:
#Get title of movie that user likes
title = 'The Amazing Spider-Man'

#Find the movies id
movie_id = data[data.Title == title]['Movie_id'].values[0]

In [80]:
#Create a list of enumerations for the similarity score [(movie_id, similarity score)]
scores = list(enumerate(cs[movie_id]))

In [81]:
# Currently showing list of all movies with respective similarity scores in respect to
# what user has watched (The Amazing Spider-Man)
scores

[(0, 0.1767766952966369),
 (1, 0.0625),
 (2, 0.0),
 (3, 0.0),
 (4, 0.12909944487358055),
 (5, 0.1875),
 (6, 0.12126781251816648),
 (7, 0.0),
 (8, 0.18190171877724973),
 (9, 0.0668153104781061),
 (10, 0.05590169943749474),
 (11, 0.0),
 (12, 0.125),
 (13, 0.0668153104781061),
 (14, 0.06454972243679027),
 (15, 0.12126781251816648),
 (16, 0.12909944487358055),
 (17, 0.06454972243679027),
 (18, 0.0),
 (19, 0.0),
 (20, 0.06454972243679027),
 (21, 0.06454972243679027),
 (22, 0.0625),
 (23, 0.06454972243679027),
 (24, 0.12126781251816648),
 (25, 0.0),
 (26, 0.2004459314343183),
 (27, 0.0),
 (28, 0.0),
 (29, 0.12909944487358055),
 (30, 0.0668153104781061),
 (31, 0.0),
 (32, 0.125),
 (33, 0.12909944487358055),
 (34, 0.11470786693528087),
 (35, 0.11470786693528087),
 (36, 0.06454972243679027),
 (37, 0.12126781251816648),
 (38, 0.1875),
 (39, 0.0),
 (40, 0.06454972243679027),
 (41, 0.0),
 (42, 0.0625),
 (43, 0.057353933467640436),
 (44, 0.0),
 (45, 0.16770509831248423),
 (46, 0.0),
 (47, 0.0668153

In [82]:
#Sort the list
# [(movie_id, similiarity score)] 
# "key = lambda x:x[1]" - sorting by similarity score
# "key = lambda x:x[0]" - sorts by movie_id
# reverse=True - sorts in descending order, highest score first.
sorted_scores = sorted(scores, key = lambda x:x[1], reverse=True)

# starts the list from second observation in list
# as first element is similarity score with the movie itself. 
sorted_scores = sorted_scores[1:]

In [83]:
print(sorted_scores)

[(253, 0.7071067811865475), (149, 0.2672612419124244), (239, 0.2581988897471611), (344, 0.2581988897471611), (104, 0.25), (821, 0.23570226039551587), (78, 0.22360679774997896), (558, 0.2182178902359924), (739, 0.21320071635561041), (314, 0.20851441405707477), (767, 0.20412414523193154), (26, 0.2004459314343183), (55, 0.2004459314343183), (92, 0.2004459314343183), (363, 0.2004459314343183), (718, 0.2004459314343183), (176, 0.2), (313, 0.19611613513818404), (179, 0.19364916731037082), (303, 0.19364916731037082), (324, 0.19364916731037082), (379, 0.19364916731037082), (600, 0.19364916731037082), (694, 0.19364916731037082), (728, 0.19364916731037082), (5, 0.1875), (38, 0.1875), (294, 0.1875), (345, 0.1875), (389, 0.1875), (432, 0.1875), (529, 0.1875), (537, 0.1875), (581, 0.1875), (758, 0.1875), (770, 0.1875), (969, 0.1875), (8, 0.18190171877724973), (65, 0.18190171877724973), (107, 0.18190171877724973), (203, 0.18190171877724973), (214, 0.18190171877724973), (388, 0.18190171877724973), (3

In [87]:
#Create a loop to print the first 7 similar movies
j = 0 
print('The 7 most recommended movies to', title, 'are:\n')
for item in sorted_scores:
    movie_title = data[data.Movie_id == item[0]]['Title'].values(0)
    print(j+1, movie_title)
    j = j+1
    if j>6:
        break
        


The 7 most recommended movies to The Amazing Spider-Man are:



TypeError: 'numpy.ndarray' object is not callable

In [86]:
print(data[data.Movie_id == item[0]]['Title'].values(0))

TypeError: 'numpy.ndarray' object is not callable

253