<a href="https://colab.research.google.com/github/cisco00/population_prediction/blob/master/Recommendation_model_base_on_Overview_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import warnings
warnings.filterwarnings('ignore')

### Loading libraries

In [29]:
import os, types
import pandas as pd

import joblib
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval

### Function for loading dataset

In [30]:
movies = pd.read_csv('movies1.csv', low_memory=False)
metadata = movies
metadata.head()


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",10/30/1995,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/15/1995,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/10/1995,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173


### Columns

In [31]:
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

### dealing with missing values

In [32]:
metadata.isna().sum()

adult                       0
belongs_to_collection    4306
budget                      0
genres                      0
homepage                 4836
id                          0
imdb_id                     0
original_language           0
original_title              0
overview                   21
popularity                  0
poster_path                21
production_companies        0
production_countries        0
release_date                4
revenue                     0
runtime                     6
spoken_languages            0
status                      6
tagline                  1302
title                       0
video                       0
vote_average                0
vote_count                  0
dtype: int64

In [33]:
for column in metadata:
    if metadata[column].isna().any():
        metadata[column]=metadata[column].fillna(metadata[column].mode()[0])
    else:
        metadata[column]=metadata[column].fillna(metadata[column].mean)

In [34]:
metadata.isna().sum()

adult                    0
belongs_to_collection    0
budget                   0
genres                   0
homepage                 0
id                       0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
revenue                  0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
video                    0
vote_average             0
vote_count               0
dtype: int64

### Using overview to build the model 

In [35]:
metadata['overview']

0       Led by Woody, Andy's toys live happily in his ...
1       When siblings Judy and Peter discover an encha...
2       A family wedding reignites the ancient feud be...
3       Cheated on, mistreated and stepped on, the wom...
4       Just when George Banks has recovered from his ...
                              ...                        
5152    In late nineteenth century Charante, Protestan...
5153    The intriguing relationship between three desp...
5154    A compilation of 60s films on the bomb and wha...
5155    Charley Davis, against the wishes of his mothe...
5156    Based on the HG Wells story. The world is deli...
Name: overview, Length: 5157, dtype: object

In [36]:
metadata['length_overview'] = metadata['overview'].apply(lambda X: len(str(X).split(' ')))

In [37]:
metadata

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,length_overview
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",10/30/1995,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Based on a true story.,Toy Story,False,7.7,5415,50
1,False,"{'id': 645, 'name': 'James Bond Collection', '...",65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://phantasm.com,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/15/1995,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413,67
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",http://phantasm.com,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92,56
3,False,"{'id': 645, 'name': 'James Bond Collection', '...",16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://phantasm.com,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",12/22/1995,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34,45
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",http://phantasm.com,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/10/1995,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5152,False,"{'id': 645, 'name': 'James Bond Collection', '...",0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://phantasm.com,64310,tt0216689,fr,Les Destinées sentimentales,"In late nineteenth century Charante, Protestan...",0.117912,/gQbevurGxdYPJSx2NauH4Qi8EZB.jpg,[],"[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",7/12/2000,0,180.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,Based on a true story.,Les Destinées sentimentales,False,4.8,3,21
5153,False,"{'id': 645, 'name': 'James Bond Collection', '...",0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",http://phantasm.com,25501,tt0098994,en,"After Dark, My Sweet",The intriguing relationship between three desp...,7.349189,/3hjcHNtWn9T6jVGXgNXyCsMWBdj.jpg,"[{'name': 'Avenue Pictures Productions', 'id':...","[{'iso_3166_1': 'US', 'name': 'United States o...",8/24/1990,0,114.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,All they risked was everything.,"After Dark, My Sweet",False,6.5,17,20
5154,False,"{'id': 645, 'name': 'James Bond Collection', '...",0,"[{'id': 99, 'name': 'Documentary'}, {'id': 36,...",http://phantasm.com,26851,tt0083590,en,The Atomic Cafe,A compilation of 60s films on the bomb and wha...,1.706144,/wHqKPJypv0GMeQTrbwetLndD8Sh.jpg,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",3/17/1982,0,86.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Based on a true story.,The Atomic Cafe,False,7.3,17,36
5155,False,"{'id': 645, 'name': 'James Bond Collection', '...",0,"[{'id': 18, 'name': 'Drama'}]",http://phantasm.com,17487,tt0039204,en,Body and Soul,"Charley Davis, against the wishes of his mothe...",1.990877,/eTYxWM0AO4bdU2DzPavGXw1Xppb.jpg,"[{'name': 'Enterprise Productions', 'id': 3333}]","[{'iso_3166_1': 'US', 'name': 'United States o...",11/9/1947,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,All for you...,Body and Soul,False,6.3,25,46


In [38]:
countervect = CountVectorizer(stop_words='english')
count = countervect.fit_transform(metadata['overview'])

In [39]:
count.shape

(5157, 22645)

In [40]:
countervect.get_feature_names()[5000:5020]

['curse',
 'cursed',
 'curses',
 'cursor',
 'curt',
 'curtail',
 'curtain',
 'curtains',
 'curtin',
 'curtis',
 'curvier',
 'curzio',
 'curzon',
 'cusack',
 'cushing',
 'cusp',
 'custer',
 'custodian',
 'custody',
 'customer']

In [41]:
cosin_sim = linear_kernel(count, count)

In [42]:
cosin_sim[:5]

array([[49.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1., 44.,  2., ...,  0.,  0.,  3.],
       [ 0.,  2., 34., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

In [43]:
tfidf_vect = TfidfVectorizer(stop_words = 'english')

In [44]:
tfidf_matrix = tfidf_vect.fit_transform(metadata['overview'])

In [45]:
tfidf_matrix.shape

(5157, 22645)

In [46]:
tfidf_vect.get_feature_names()[5000:5020]

['curse',
 'cursed',
 'curses',
 'cursor',
 'curt',
 'curtail',
 'curtain',
 'curtains',
 'curtin',
 'curtis',
 'curvier',
 'curzio',
 'curzon',
 'cusack',
 'cushing',
 'cusp',
 'custer',
 'custodian',
 'custody',
 'customer']

In [53]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosin_sim[1]

array([ 1., 44.,  2., ...,  0.,  0.,  3.])

In [54]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

In [55]:
cosin_sim[:5]

array([[49.,  1.,  0., ...,  0.,  0.,  0.],
       [ 1., 44.,  2., ...,  0.,  0.,  3.],
       [ 0.,  2., 34., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

In [56]:
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [57]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [59]:
get_recommendations('')

2486              eXistenZ
1506    The Innocent Sleep
3872    Dungeons & Dragons
5132            Panic Room
3056      Any Given Sunday
8             Sudden Death
4083             Manhunter
363               Maverick
2249        Glen or Glenda
1951           BASEketball
Name: title, dtype: object