In [7]:
import warnings
warnings.filterwarnings('ignore')

****Loading libraries****

In [8]:
import os, types
import pandas as pd
from botocore.client import Config
import neptune
import joblib
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
from sklearn.model_selection import train_test_split

****Loading Dataset****

In [9]:
recom_model_mov = pd.read_csv('../input/the-movies-dataset/movies_metadata.csv', low_memory=True)
recom_model_key = pd.read_csv('../input/the-movies-dataset/keywords.csv', low_memory=True)
recom_model_credits = pd.read_csv('../input/the-movies-dataset/credits.csv', low_memory=True)

recom_model_mov = recom_model_mov.drop([19730, 29503, 35587])

recom_model_mov['id'] = recom_model_mov['id'].astype(int)
recom_model_key['id'] = recom_model_key['id'].astype(int)
recom_model_credits['id'] =recom_model_credits['id'].astype(int)

**Merging Dataframe tables by id**

In [10]:
recom_model_mov = recom_model_mov.merge(recom_model_credits, on='id')
recom_model_mov = recom_model_mov.merge(recom_model_key, on='id')

In [11]:
recom_model_mov.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


****Checking for missing values****

In [12]:
recom_model_mov.isna().sum()

adult                        0
belongs_to_collection    42054
budget                       0
genres                       0
homepage                 38619
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   995
popularity                   4
poster_path                399
production_companies         4
production_countries         4
release_date                88
revenue                      4
runtime                    268
spoken_languages             4
status                      86
tagline                  25845
title                        4
video                        4
vote_average                 4
vote_count                   4
cast                         0
crew                         0
keywords                     0
dtype: int64

****Function for treating missing values****

In [13]:
for column in recom_model_mov:
    if recom_model_mov[column].isna().any():
        recom_model_mov[column]=recom_model_mov[column].fillna(recom_model_mov[column].mode()[0])
    else:
        recom_model_mov[column]=recom_model_mov[column].fillna(recom_model_mov[column].mean)

****Checking if there is no missing values****

In [14]:
recom_model_mov.isna().sum()

adult                    0
belongs_to_collection    0
budget                   0
genres                   0
homepage                 0
id                       0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
revenue                  0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
video                    0
vote_average             0
vote_count               0
cast                     0
crew                     0
keywords                 0
dtype: int64

****getting columns****

In [15]:
recom_model_mov.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')

<!-- ****Using selected to build the model****  -->

****Selected features****

In [16]:
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    recom_model_mov[feature] = recom_model_mov[feature].apply(literal_eval)

In [17]:
recom_model_mov.to_csv('recommendation_data')

****Function for Extraction of Required Information****

In [18]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [19]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

****Applying the function to get the names of directors and list of movie there were able to direct****

In [20]:
# Define new director, cast, genres and keywords features that are in a suitable form.
recom_model_mov['director'] = recom_model_mov['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    recom_model_mov[feature] = recom_model_mov[feature].apply(get_list)

In [21]:
recom_model_mov[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


****Function for Cleaning the dataset****

In [22]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [23]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    recom_model_mov[feature] = recom_model_mov[feature].apply(clean_data)

In [24]:
def create_soup(x):
    return " ".join(x['keywords'])+ " "+" ".join(x['director']+" "+" ".join(x['genres']))

In [25]:
# Create a new soup feature
recom_model_mov['soup'] = recom_model_mov.apply(create_soup, axis=1)

In [26]:
recom_model_mov[['soup']].head()

Unnamed: 0,soup
0,jealousy toy boy j o h n l a s s e t e r a n...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger h o w ...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence c h a r l e s s ...


In [27]:
tfidf_count = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_count.fit_transform(recom_model_mov['soup']) 

In [28]:
tfidf_matrix.shape

(46628, 9878)

In [29]:
cosine_similarity_matric = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [30]:
metadata = recom_model_mov.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [31]:
def get_recommendations(title, cosine_sim=cosine_similarity_matric):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]


In [32]:
get_recommendations('The Godfather', cosine_similarity_matric)

6493                         Avanti!
14548               Fun Is Beautiful
18221          Suddenly, Last Winter
28683         In the Name of the Law
35564               The Broken Tower
13859                 Shall We Kiss?
2492     Lovers of the Arctic Circle
6837                       Aparajito
8337                          Closer
9024                       Ghost Dad
Name: title, dtype: object

****Loading dataset****

In [34]:
from collections import defaultdict

from surprise import SVD
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls


recom_mod = pd.read_csv('recommendation_data')
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(recom_mod):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

# Precision and recall can then be averaged over all users
print(sum(prec for prec in precisions.values()) / len(precisions))
print(sum(rec for rec in recalls.values()) / len(recalls))
