<a href="https://colab.research.google.com/github/guilhermelaviola/MovieRecommendation/blob/main/MovieRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing all the necessary libraries:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [3]:
# Importing the 'credits' and 'movies' datasets:
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

In [11]:
# Displaying credits table shape and dataset:
print('Credits dataframe: ', credits.shape)
print(credits.head(10))

Credits dataframe:  (1154, 4)
   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   
5       559                              Spider-Man 3   
6     38757                                   Tangled   
7     99861                   Avengers: Age of Ultron   
8       767    Harry Potter and the Half-Blood Prince   
9    209112        Batman v Superman: Dawn of Justice   

                                                cast  \
0  [{"cast_id": 242, "character": "Jake Sully", "...   
1  [{"cast_id": 4, "character": "Captain Jack Spa...   
2  [{"cast_id": 1, "character": "James Bond", "cr...   
3  [{"cast_id": 2, "character": "Bruce Wayne / Ba...   
4  [{"cast_id": 5, "character": "John Carter", "c...   
5  [{"

In [12]:
# Displaying movies table shape and dataset:
print('Movies dataframe: ', movies.shape)
print(movies.head(10))

Movies dataframe:  (4803, 20)
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
5  258000000  [{"id": 14, "name": "Fantasy"}, {"id": 28, "na...   
6  260000000  [{"id": 16, "name": "Animation"}, {"id": 10751...   
7  280000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
8  250000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
9  250000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                            homepage      id  \
0                        http://www.avatarmovie.com/   19995   
1       http://disney.go.com/disneypictures/pirates/     285   
2        http://www.sonypictures.com/mov

In [22]:
# Creating a 'credits_column_renamed' table based on the 'credits' table, with the column
# 'movie_id' renamed as 'id':
credits_column_renamed = credits.rename(index = str, columns = {'movie_id' : 'id'})

In [29]:
# Merging 'movies' and 'credits_column_renamed' datasets and
# displaying their shape and first 10 entries:
movies_merge = movies.merge(credits_column_renamed, on = 'id')
print('movies_merge dataframe:', movies_merge.shape)
print(movies_merge.head())

movies_merge dataframe: (1154, 23)
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...       

In [30]:
movies_cleaned = movies_merge.drop(columns = ['homepage',
                                              'title_x',
                                              'title_y',
                                              'status',
                                              'production_countries'])
print(movies_cleaned.head())
print(movies_cleaned.info())
print(movies_cleaned.head(1)['overview'])

      budget                                             genres      id  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   19995   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...     285   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...  206647   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   49026   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "name": "spy"}, {"id": 818, "name...                en   
3  [{"id": 849, "name": "dc comics"}, {"id": 853,...                en   
4  [{"id": 818, "name": "based on novel"}, {"id":...                en   

                             original_title  \
0                                    Avatar   
1  Pirates

In [33]:
tfv = TfidfVectorizer(min_df = 3,
                      max_features = None,
                      strip_accents = 'unicode',
                      analyzer = 'word',
                      token_pattern = r'\w{1,}',
                      ngram_range = (1, 3),
                      stop_words = 'english')

In [34]:
# Fitting the TF_IDF on the 'overview' text:
tfv_matrix = tfv.fit_transform(movies_cleaned['overview'])
print(tfv_matrix)
print(tfv_matrix.shape)

  (0, 23)	0.3055248847504639
  (0, 502)	0.2869398510888036
  (0, 103)	0.21633149942688157
  (0, 2076)	0.279874149216392
  (0, 1898)	0.2869398510888036
  (0, 1057)	0.24810282432163674
  (0, 2766)	0.25915680863305984
  (0, 1743)	0.18295171711647473
  (0, 2864)	0.25915680863305984
  (0, 1760)	0.279874149216392
  (0, 771)	0.2869398510888036
  (0, 1670)	0.2737535598557087
  (0, 439)	0.24198223496095347
  (0, 22)	0.3055248847504639
  (1, 848)	0.4135338272151473
  (1, 828)	0.25219555187214754
  (1, 1233)	0.4135338272151473
  (1, 1568)	0.1987330147207551
  (1, 540)	0.2951725221227923
  (1, 688)	0.32195702092930417
  (1, 275)	0.4135338272151473
  (1, 1600)	0.30721644753756894
  (1, 398)	0.3141764264592021
  (2, 2374)	0.23511634966680003
  (2, 321)	0.24196397274578993
  :	:
  (1152, 2411)	0.21647363600781597
  (1152, 955)	0.19013751732895173
  (1152, 1232)	0.1596658687695317
  (1152, 203)	0.14894079762998041
  (1152, 780)	0.21647363600781597
  (1152, 975)	0.11959145944772114
  (1152, 2409)	0.126

In [35]:
# Computing the Sigmoid Kernel
sigmoid = sigmoid_kernel(tfv_matrix, tfv_matrix)
print(sigmoid[0])

[0.7617299  0.76159416 0.76159416 ... 0.76159416 0.76159416 0.76159416]


In [45]:
# Reverse mapping of indexes and movie titles:
indexes = pd.Series(movies_cleaned.index,
                    index = movies_cleaned['original_title']).drop_duplicates()
print(indexes)
# print(indexes['Newlyweds'])
# print(sigmoid[4799])
print(list(enumerate(sigmoid[indexes['Newlyweds']])))
print(sorted(list(enumerate(sigmoid[indexes['Newlyweds']])),
             key = lambda x : x[1], reverse = True))


original_title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
American Hustle                             1149
The Proposal                                1150
Double Jeopardy                             1151
Back to the Future Part II                  1152
Lucy                                        1153
Length: 1154, dtype: int64


KeyError: ignored

In [43]:
def give_recommendations(title, sigmoid = sigmoid):
  # Getting the index corresponding to original_title:
  index = indexes[title]

  # Getting the pairwise similarity scores:
  sigmoid_scores = list(enumerate(sigmoid[index]))

  # Sorting the movies:
  sigmoid_scores = sorted(sigmoid_scores, key = lambda x : x[1], reverse = True)

  # Scores of the 10 most similar movies:
  sigmoid_scores = sigmoid_scores[1:11]

  # Movie indexes:
  movie_indexes = [i[0] for i in sigmoid_scores]

  # Top 10 most similar movies:
  return movies_cleaned['original_title'].iloc[movie_indexes]