In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from ydata_profiling import ProfileReport
from movie_recommender_system import paths
import pandas as pd

In [3]:
movie_data_path = paths.data_raw_dir() / "tmdb_5000_movie_dataset/movies.csv"
credits_data_path = paths.data_raw_dir() / "tmdb_5000_movie_dataset/credits.csv"

In [4]:
df_movies = pd.read_csv(movie_data_path)
profile_movies = ProfileReport(df_movies, title="Profiling Report")
#profile_movies.to_file(paths.reports_dir() / "00_raw_movies.html")

In [5]:
df_credits = pd.read_csv(credits_data_path)
profile_credits = ProfileReport(df_credits, title="Profiling Report")
#profile_credits.to_file(paths.reports_dir() / "00_raw_credits.html")

In [6]:
df = df_movies.merge(df_credits, on="title")

df.to_csv(paths.data_processed_dir() / "dataset.csv", index=False)

In [7]:
list(df.columns)

['budget',
 'genres',
 'homepage',
 'id',
 'keywords',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'vote_average',
 'vote_count',
 'movie_id',
 'cast',
 'crew']

In [13]:
import ast
import nltk
from nltk.stem import PorterStemmer

def extract_data_from_dict_string(string_dict, key) -> list:
    value_list = []
    for item in ast.literal_eval(string_dict):
        value_list.append(item[key])
    return value_list

def extract_cast(string_dict: str, key: str, count: int) -> list:
    value_list = []
    counter = 0
    for item in ast.literal_eval(string_dict):
        if counter == count:
            break
        else:
            value_list.append(item[key])
            counter += 1
    return value_list

def extract_crew(string_dict: str, key: str, crew_jobs: list) -> list:
    value_list = []
    for item in ast.literal_eval(string_dict):
        if item['job'] in crew_jobs:
            value_list.append(item[key])
    return value_list

def remove_space(string_list: list) -> list:
    return [x.replace(" ", "") for x in string_list]

def stems(text):
    ps = PorterStemmer()
    T = []
    for i in text.split():
        T.append(ps.stem(i))
    
    return " ".join(T)

def processing_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df = df.dropna()
    # list string columns
    df["genres"] = df["genres"].apply(lambda x: extract_data_from_dict_string(x, "name"))
    df["keywords"] = df["keywords"].apply(lambda x: extract_data_from_dict_string(x, "name"))
    df["production_companies"] = df["production_companies"].apply(lambda x: extract_data_from_dict_string(x, "name"))
    df["production_countries"] = df["production_countries"].apply(lambda x: extract_data_from_dict_string(x, "name"))

    df["genres"] = df["genres"].apply(lambda x: remove_space(x))
    df["keywords"] = df["keywords"].apply(lambda x: remove_space(x))
    df["production_companies"] = df["production_companies"].apply(lambda x: remove_space(x))
    df["production_countries"] = df["production_countries"].apply(lambda x: remove_space(x))

    cast_count = 3
    df.loc[:, 'cast'] = df['cast'].apply(lambda x: extract_cast(x, 'name', cast_count))
    crew_jobs_list = ['Director']
    df.loc[:, 'crew'] = df['crew'].apply(lambda x: extract_crew(x, 'name', crew_jobs_list))

    df.loc[:, 'cast'] = df['cast'].apply(lambda x: remove_space(x))
    df.loc[:, 'crew'] = df['crew'].apply(lambda x: remove_space(x))


    df.loc[:, 'genres'] = df['genres'].apply(lambda x: remove_space(x))

    df.loc[:, 'overview'] = df['overview'].apply(lambda x: x.split())

    df["tags"] = df["overview"] + df["genres"] + df["keywords"] + df["production_companies"] + df["production_countries"] + df["cast"] + df["crew"]
    df = df[['movie_id', 'title', 'tags']]

    df["tags"] = df["tags"].apply(lambda x: " ".join(x).lower())
    df["tags"] = df["tags"].apply(stems)

    return df

In [14]:
df_processed = processing_features(df)

In [15]:
df_processed

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4779,2292,Clerks,conveni and video store clerk dant and randal ...
4787,255266,Dry Spell,sasha tri to get her soon-to-b ex husband kyle...
4797,157185,Tin Can Man,"recent dump by hi girlfirend for anoth man, wo..."
4802,14337,Primer,friends/fledgl entrepreneur invent a devic in ...


In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class MovieRecommender(BaseEstimator, TransformerMixin):
    def __init__(self, max_features=5000, stop_words='english'):
        self.max_features = max_features
        self.stop_words = stop_words
        self.cv = CountVectorizer(max_features=self.max_features, stop_words=self.stop_words)
        self.vector = None
        self.similarity = None
        self.X = None

    def fit(self, X, y=None):
        self.X = X
        self.vector = self.cv.fit_transform(X['tags']).toarray()
        self.similarity = cosine_similarity(self.vector)
        return self

    def predict(self, movie, n=5):
        index = self.X[self.X['title'] == movie].index[0]
        distances = sorted(list(enumerate(self.similarity[index])), reverse=True, key = lambda x: x[1])
        recommended_movies = []
        for i in distances[1:n+1]:
            recommended_movies.append(self.X.iloc[i[0]].title)
        return recommended_movies

In [21]:
recommender = MovieRecommender()
recommender.fit(df_processed)

In [22]:
print(recommender.predict('Spider-Man 2'))

['Transformers', 'Transformers: Revenge of the Fallen', 'Transformers: Dark of the Moon', 'Lucy', 'Shooter']
