In [34]:
import json
import os
import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [35]:
# Load the two TMDB CSV files: the movie metadata table and the credits table.
movies = pd.read_csv("../input/tmdb-movie-metadata/tmdb_5000_movies.csv")
credit = pd.read_csv("../input/tmdb-movie-metadata/tmdb_5000_credits.csv")

In [36]:
# Inspect the column names of the credits table.
credit.columns

In [37]:
# Look at the raw 'cast' value of the first row to see how it is encoded.
credit['cast'].iloc[0]

In [38]:
# Rename the key column to 'id' so the credits table can be merged on it.
credit = credit.rename(columns={'movie_id': 'id'})

In [39]:
## Combine the movie metadata with the credits table, matching rows on the movie id
movies = pd.merge(movies, credit, on='id')


In [40]:
# Preview the first row of the merged frame.
movies.head(1)

# Demographic Filtering
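We'll use IMDB's weighted rating formula to calculate the weighted rating of the given movies.\
The formula is as follows:

$$WR = \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$$

where $v$ is the number of votes for the movie, $m$ is the minimum number of votes required to be listed in the chart, $R$ is the average rating of the movie, and $C$ is the mean vote across the whole dataset.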


In [41]:
## Calculate the mean of the vote averages across the dataset
c = movies['vote_average'].mean()
print("Mean of votes: {}".format(c))

In [42]:
## The minimum number of votes required to be listed in the chart.
## m is the 0.9 quantile of vote_count, so a qualifying movie has
## at least as many votes as 90% of the movies in the list.

m = movies['vote_count'].quantile(0.9)
print(" for a movie to feature in the charts, it must have more than {} votes".format(int(m)))

In [43]:
## Keep only the movies whose vote count reaches the threshold m.
## .copy() makes q_movies an independent frame rather than a slice of `movies`,
## so adding or sorting columns on it does not raise SettingWithCopyWarning.
q_movies = movies.loc[movies['vote_count'] >= m].copy()
print(q_movies.shape)

In [44]:
## Weighted rating (WR) of a movie

def weighted_rating(x,c=c, m=m):
    """Return the weighted rating for ``x``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide ``'vote_count'`` and ``'vote_average'``.
    c : number
        Value the rating is pulled towards; defaults to the module-level ``c``
        as it was when this function was defined.
    m : number
        Weight given to ``c``; defaults to the module-level ``m`` as it was
        when this function was defined.

    Returns
    -------
    ``votes/(votes+m) * rating + m/(votes+m) * c``
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m

    own_share = votes / total * rating
    prior_share = m / total * c
    return own_share + prior_share

In [45]:
## Add the weighted rating as a new column 'wr'.
## weighted_rating only does column arithmetic, so it is evaluated on the whole
## frame at once instead of row by row. assign() returns a new frame, which
## avoids writing into a filtered slice.
q_movies = q_movies.assign(wr=weighted_rating(q_movies))

In [46]:
## Order the movies by WR score, best first
q_movies = q_movies.sort_values('wr', ascending=False)

In [47]:
# Show the first 10 rows with their title, vote statistics and weighted rating.
q_movies[['title_x', 'vote_count', 'vote_average', 'wr']].head(10)

# Content Based Filtering 

In [48]:
## Extract the values of every "name" field from a raw credits string
def get_cast(movie):
    """Return all ``"name"`` values found in ``movie``, joined by single spaces.

    Parameters
    ----------
    movie : str
        Raw text containing entries of the form `` "name": "<value>"``.

    Returns
    -------
    str
        The captured values in order of appearance, or ``""`` when none match.

    Notes
    -----
    The value is captured up to the next double quote, so names made of
    several words or containing digits, accents or punctuation are kept.
    """
    names = re.findall(r' "name": "([^"]*)"', movie)
    return " ".join(names)


In [49]:
## Get keyword and genre names out of the raw dictionary strings in the features
def get_items(movie):
    """Return the ``"name"`` values that close an entry in ``movie``, space-joined.

    Parameters
    ----------
    movie : str
        Raw text containing entries that end with `` "name": "<value>"}``.

    Returns
    -------
    str
        The captured values in order of appearance, or ``""`` when none match.

    Notes
    -----
    The value is captured up to the next double quote, so multi-word names
    such as "Science Fiction" and names containing digits are kept.
    """
    names = re.findall(r' "name": "([^"]*)"}', movie)
    return " ".join(names)

# Run get_items on every value of the 'genres' column and print the result.
genres = movies['genres'].apply(get_items)
print(genres)
    

In [50]:
# Run get_items on every value of the 'keywords' column and print the result.
keywords = movies['keywords'].apply(get_items)
print(keywords)
    

In [51]:
def get_list(x):
    """Return the ``'name'`` of at most the first three entries of ``x``.

    ``x`` is expected to be a list of mappings that each have a ``'name'`` key.
    Anything that is not a list (missing or malformed data) yields ``[]``.
    """
    # Guard clause: only real lists carry usable entries.
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep no more than three of them.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [52]:
# get_list returns [] for anything that is not a list, so the raw strings have
# to be decoded first (assumes the column holds JSON text - TODO confirm).
# The result is named `cast`, so it is built from the 'cast' column, not 'crew'.
cast = movies['cast'].apply(
    lambda raw: get_list(json.loads(raw)) if isinstance(raw, str) else []
)
print(cast)

In [53]:
# Gather the text features used for content-based filtering into one frame.
movies_feats = pd.DataFrame({
    'genres': genres,
    'keywords': keywords,
    'overview': movies['overview'],
    'title': movies['title_x'],
})

In [54]:
# Put the columns in reading order. .copy() detaches the result from the frame
# it was selected from, so later column assignments do not raise
# SettingWithCopyWarning.
movies_feats = movies_feats[['title', 'overview','genres', 'keywords']].copy()

In [55]:
# Preview the first rows of the feature frame.
movies_feats.head()

In [56]:
## Function that cleans the data
def clean_data(x):
    """Lower-case text and collapse runs of whitespace into a single space.

    Parameters
    ----------
    x : str, list of str, or anything else
        The value to normalise.

    Returns
    -------
    str or list of str
        The cleaned string, or a list with each element cleaned. Any value
        that is neither a string nor a list (e.g. NaN, None) becomes ``''``.
    """
    if isinstance(x, list):
        return [re.sub(r"\s\s+", " ", item).lower() for item in x]
    if isinstance(x, str):
        # str.replace treats its argument literally, so a regex substitution
        # is needed to actually collapse consecutive whitespace characters.
        return re.sub(r"\s\s+", " ", x).lower()
    # Missing or non-text values are mapped to an empty string.
    return ''

In [57]:
# Run clean_data over every value of every column, producing a new frame.
movies_feats = movies_feats.assign(
    **{feature: movies_feats[feature].apply(clean_data) for feature in movies_feats.columns}
)

In [58]:
# Display the cleaned feature frame.
movies_feats

In [59]:
def create_soup(x):
    """Build one text blob from the title, overview, genres and keywords of ``x``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide ``'title'``, ``'overview'``, ``'genres'`` and ``'keywords'``;
        each value is a string or a list of strings.

    Returns
    -------
    str
        The non-empty fields joined by single spaces, so the last word of one
        field is never glued to the first word of the next.
    """
    fields = (x['title'], x['overview'], x['genres'], x['keywords'])
    # Lists are flattened with spaces; strings are used unchanged.
    parts = [field if isinstance(field, str) else " ".join(field) for field in fields]
    return " ".join(part for part in parts if part)
# Build the 'soup' column by applying create_soup to every row.
movies_feats['soup'] = movies_feats.apply(create_soup, axis=1)

In [60]:
# Display the combined text of each movie.
movies_feats['soup']

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
# Count unigrams and bigrams of the soup text, ignoring English stop words.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))
count_matrix = vectorizer.fit_transform(movies_feats['soup'])

In [63]:
# Pairwise cosine similarity between all rows of count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the rows are compared against themselves.
cosine_sim = cosine_similarity(count_matrix)

In [64]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Label to look up in the module-level ``indices`` Series; it must match
        an index label exactly.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i]`` holds the scores of
        movie ``i`` against every movie.

    Returns
    -------
    pandas.Series
        ``df2['title']`` of the 10 highest-scoring other movies, best first.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # Several movies can share a title, in which case the lookup yields a
    # Series rather than a single position; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its similarity score. The queried movie is
    # left out explicitly: with ties it is not guaranteed to be sorted first.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores (stable, highest first)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Keep the 10 most similar movies and extract their positions
    movie_indices = [position for position, _ in sim_scores[:10]]

    # Return the top 10 most similar movies
    return df2['title'].iloc[movie_indices]

In [65]:
# Positionally indexed frame: row i here matches row i of cosine_sim.
df2 = movies_feats.reset_index()
# Title -> row position lookup. Only the first occurrence of a repeated title
# is kept, so indices[title] always yields a single position.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [66]:
# Display the cleaned titles, i.e. the labels accepted by get_recommendations.
movies_feats['title']

In [67]:
# Titles were lower-cased by clean_data, so the lookup key is lower-case too.
get_recommendations('avatar', cosine_sim)