In [54]:
# A MACHINE LEARNING TECHNIQUE WHICH SUGGESTS MOVIES RELEVANT TO A GIVEN SEARCHED MOVIE

In [55]:
# IMPORT DEPENDENCES

In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib

%matplotlib inline

In [57]:
df = pd.read_csv('movies.csv')

In [58]:
# Display the first 3 movies

In [59]:
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes


In [60]:
# Display key information associated with the given data

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [62]:
# selecting the relevant features for recommendation

selected_features = ['genres','keywords','tagline','cast','director']

In [63]:
# Check for NULL values

In [64]:
df[selected_features].isnull().sum()

genres       28
keywords    412
tagline     844
cast         43
director     30
dtype: int64

In [65]:
# Fill in null values

In [66]:
for feature in selected_features:
    df[feature] = df[feature].fillna('')

In [67]:
# Confirm absence of NULL values

In [68]:
df[selected_features].isnull().sum()

genres      0
keywords    0
tagline     0
cast        0
director    0
dtype: int64

In [69]:
# Combine selected features

In [70]:
combined_features = df['genres']+''+df['keywords']+''+df['tagline']+''+df['cast']+''+df['director']

In [71]:
# Display first 5 lines of the combined features

In [72]:
combined_features.head()

0    Action Adventure Fantasy Science Fictioncultur...
1    Adventure Fantasy Actionocean drug abuse exoti...
2    Action Adventure Crimespy based on novel secre...
3    Action Crime Drama Thrillerdc comics crime fig...
4    Action Adventure Science Fictionbased on novel...
dtype: object

In [73]:
# we convert the text data into feature vectors to ease movie data pattern recognition

In [74]:
vectorizer = TfidfVectorizer()

In [75]:
featured_vectors = vectorizer.fit_transform(combined_features)

In [76]:
# Display featured vectors

In [77]:
print(featured_vectors)

  (0, 3583)	0.16941894714909375
  (0, 20729)	0.2738578969459924
  (0, 16614)	0.15696073877453268
  (0, 14550)	0.2236809363648068
  (0, 23267)	0.16128139780622516
  (0, 26547)	0.19638671147741732
  (0, 22464)	0.20207089568711745
  (0, 21487)	0.21550128478931546
  (0, 27540)	0.19771357974524176
  (0, 27182)	0.23480088356130552
  (0, 21521)	0.1562356665943368
  (0, 18558)	0.2612168335104853
  (0, 18164)	0.08690831799482265
  (0, 27161)	0.1261988657937151
  (0, 24197)	0.07518543993419265
  (0, 22778)	0.2738578969459924
  (0, 4676)	0.2452909770424961
  (0, 26334)	0.13059723071916296
  (0, 22916)	0.33668756406929184
  (0, 10193)	0.16532432420812987
  (0, 4456)	0.21799675215510664
  (0, 9304)	0.2738578969459924
  (0, 21836)	0.09966592997173944
  (0, 8936)	0.11806131645084653
  (0, 444)	0.09109355212252294
  :	:
  (4801, 5449)	0.31510541707008866
  (4801, 22170)	0.31510541707008866
  (4801, 27361)	0.31510541707008866
  (4801, 27513)	0.3005604008026175
  (4801, 8121)	0.2611505149930138
  (4801,

In [78]:
# GET THE SIMILARITY SCORES USING COSINE SIMILARITY 
# NB: Cosine similarity measures the cosine of the angle btw 2 vectors projected
# in multi-dimensional space and tends to be more similar with small angle measures between 2 contextual angles

In [79]:
similarity = cosine_similarity(featured_vectors)

In [80]:
# Display similarity scores

In [81]:
print(similarity)

[[1.         0.06865296 0.01492221 ... 0.         0.         0.        ]
 [0.06865296 1.         0.02799128 ... 0.01243107 0.         0.        ]
 [0.01492221 0.02799128 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.01243107 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [82]:
# Display dimension of similarity score

In [83]:
similarity.shape

(4803, 4803)

In [84]:
# Define a function which suggests movies

In [85]:
def recommend_movies(movie_name=input('Enter your favorite movie name: ')):
    # list all movie titles given in the dataset
    titles = df['title'].tolist()
    
    # Find the close match for the movie name entered by the user as difflib would compare the sequences already
    # established by the cosine simiarity
    close_match = difflib.get_close_matches(movie_name,titles)
    closest_match = close_match[0]
    
    # finding the index of the movie with the closest match
    movie_index = df[df['title'] == closest_match]['index'].values[0]
    
    # Get a list of similar movies
    # Also add a counter to the similarity_scores on the movie index by implimenting the enumerate function with a list
    similarity_scores = list(enumerate(similarity[movie_index]))
    
    # sort the movies based on their similarity scores
    sorted_similarity_scores = sorted(similarity_scores, key=lambda x : x[1], reverse=True)
        
    print('Movies suggested to you: \n')
    i = 1
    for movie in sorted_similarity_scores:
        index = movie[0]
        title_from_index = df[df['index'] == index]['title'].values[0]
        if i <= 10:
            print(f'{i}. {title_from_index}')
            i+=1

Enter your favorite movie name: rush hour


In [86]:
# Call function and print movie suggestions

In [87]:
movie_collections = recommend_movies()
print(movie_collections)

Movies suggested to you: 

1. Rush Hour 3
2. Rush Hour 2
3. The White Countess
4. What Dreams May Come
5. Minority Report
6. Extremely Loud & Incredibly Close
7. Money Talks
8. The Spy Next Door
9. Robin Hood
10. Flash Gordon
None


In [88]:
# Import pickle to have it saved and reused by anyone

In [89]:
import pickle

In [90]:
file_name = "suggest_movies.sav"

In [91]:
# Save the model

In [92]:
pickle.dump(recommend_movies, open(file_name, 'wb'))

In [93]:
# Reuse the model

In [94]:
loaded_model = pickle.load(open('suggest_movies.sav', 'rb'))

In [None]:
prediction = loaded_model('thor')

Movies suggested to you: 

1. Thor
2. Thor: The Dark World
3. The Avengers
4. Cinderella
5. Captain America: The Winter Soldier
6. Avengers: Age of Ultron
7. Jack Ryan: Shadow Recruit
8. Deadpool
9. Iron Man 2
10. Spider-Man 2
