# Imports

In [2]:
import os
import pandas as pd
import numpy as np
import math
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Content Based Recommender System

## Data Process

In [3]:
Data = pd.read_csv(r"archive\movies_metadata.csv")
Data["genres"] = Data["genres"].apply(lambda x: " ".join([List["name"] for List in ast.literal_eval(x)]))
Data["vote_count"] = Data["vote_count"].fillna(0) #Fill na's for movies with no votes

Links = pd.read_csv(r"archive\links_small.csv")

KeyWords = pd.read_csv(r"archive\keywords.csv")
KeyWords["keywords"] = KeyWords["keywords"].apply(lambda x: " ".join([List["name"] for List in ast.literal_eval(x)]))

Data = pd.read_csv(r"archive\movies_metadata.csv")
Data = Data.drop([19730, 29503, 35587])

Credits = pd.read_csv(r"archive\credits.csv")
Credits["characters"] = Credits["cast"].head().apply(lambda x: " ".join([List["character"] for List in ast.literal_eval(x)[0:10]]))
Credits["director"] = Credits["crew"].apply(lambda x: " ".join([List["name"] for List in ast.literal_eval(x)[0:10] if List["job"] == "Director"]))

Data["tmdbId"] = Data["id"].astype("int")
Data["id"] = Data["id"].astype("int")
Data = pd.merge(Data, Links, on='tmdbId')
Data = pd.merge(Data, KeyWords, on="id")
Data = pd.merge(Data, Credits, on ="id")


Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.



### Start with overview and tagline

In [5]:
Data["overview"] = Data["overview"].fillna('')
Data["tagline"] = Data["tagline"].fillna('')
Data["keywords"] = Data["keywords"].fillna('')
Data["characters"] = Data["characters"].fillna('')
Data["director"] = Data["director"].fillna('')

Data["Description"] = Data["overview"] + Data["keywords"] +  Data["keywords"] + Data["tagline"] + Data["characters"] + Data["characters"] + Data["director"] + Data["director"] + Data["director"]

In [6]:
def GetSimilarMovies(Data, Movie):
    Temp = Data.copy()
    Idx = Temp.index[Temp["title"] == Movie].values.tolist()[0]
    
    Vectoriser = TfidfVectorizer(analyzer="word", stop_words="english")
    Vectoriser.fit(Temp["Description"])
    
    Mat = Vectoriser.transform(Temp["Description"])
    Similarities = linear_kernel(Mat, Mat)
    
    Similars = [i for i in sorted( [(x,i) for (i,x) in enumerate(Similarities[Idx])], reverse=True )[1:40]]
    Index = [i[1] for i in Similars]
    
    Temp = Temp.loc[Index]
    
    MinVotes = pd.qcut(Temp["vote_count"], 4, duplicates="drop").values.value_counts().index[-1].left
    Temp = Temp[Temp["vote_count"] >= MinVotes]
    
    RatingWeight = Temp["vote_count"] / (Temp["vote_count"] + MinVotes)
    AverageWeight = MinVotes / (Temp["vote_count"] + MinVotes)
    
    Temp["weighted_ratings"] = RatingWeight*Temp["vote_average"] + Temp["vote_average"].mean()*AverageWeight
    
    return Temp.loc[Index].sort_values("weighted_ratings", ascending = False).head(10)[["title", "vote_count", "vote_average", "release_date"]]

In [7]:
Data[Data["title"].str.contains("Superman")]["title"]

2131                              Superman
2132                           Superman II
2133                          Superman III
2134      Superman IV: The Quest for Peace
6521                      Superman Returns
7724                Waiting for 'Superman'
7811                     All Star Superman
7979             Superman and the Mole-Men
9024    Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [8]:
GetSimilarMovies(Data, "The Dark Knight")



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike



Unnamed: 0,title,vote_count,vote_average,release_date
699,The Godfather,6024.0,8.5,1972-03-14
48,The Usual Suspects,3334.0,8.1,1995-07-19
8031,The Dark Knight Rises,9263.0,7.6,2012-07-16
6218,Batman Begins,7511.0,7.5,2005-06-10
7583,Kick-Ass,4747.0,7.1,2010-03-22
3049,X-Men,4172.0,6.8,2000-07-13
5538,Spider-Man 2,4432.0,6.7,2004-06-25
8419,Man of Steel,6462.0,6.5,2013-06-12
9004,Suicide Squad,7717.0,5.9,2016-08-02
9024,Batman v Superman: Dawn of Justice,7189.0,5.7,2016-03-23
