In [1]:
# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd


# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Feature engine
from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
)

# Sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Custom imports

# Built-in library
import itertools
import re
import json
from typing import Union, Optional, Any
import logging
import warnings

# pandas display settings: raise the row / column / cell-width limits so that
# wide frames and long text cells are rendered without truncation.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Black code formatter (Optional)
%load_ext lab_black
# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
# Configure warnings and other settings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings("ignore")
# Apply seaborn's default plot styling.
sns.set()


def load_data(*, filename: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print its shape.

    Params:
        filename (str): Path of the CSV file to read.

    Returns:
        df (pd.DataFrame): The contents of the file.
    """
    loaded = pd.read_csv(filename)
    frame_shape = loaded.shape
    print(f"Shape of df: {frame_shape}\n")
    return loaded

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [5]:
# Load the TMDB 5000 movies CSV (relative path) into `movie_db`.
filename = "../../data/tmdb_5000_movies.csv"
movie_db = load_data(filename=filename)

# Preview the first two rows.
movie_db.head(2)

Shape of df: (4803, 20)



Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 878, ""name"": ""Science Fiction""}]",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""space war""}, {""id"": 3388, ""name"": ""space colony""}, {""id"": 3679, ""name"": ""society""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 9685, ""name"": ""futuristic""}, {""id"": 9840, ""name"": ""romance""}, {""id"": 9882, ""name"": ""space""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10148, ""name"": ""tribe""}, {""id"": 10158, ""name"": ""alien planet""}, {""id"": 10987, ""name"": ""cgi""}, {""id"": 11399, ""name"": ""marine""}, {""id"": 13065, ""name"": ""soldier""}, {""id"": 14643, ""name"": ""battle""}, {""id"": 14720, ""name"": ""love affair""}, {""id"": 165...",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century Fox Film Corporation"", ""id"": 306}, {""name"": ""Dune Entertainment"", ""id"": 444}, {""name"": ""Lightstorm Entertainment"", ""id"": 574}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}]",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic island""}, {""id"": 1319, ""name"": ""east india trading company""}, {""id"": 2038, ""name"": ""love of one's life""}, {""id"": 2052, ""name"": ""traitor""}, {""id"": 2580, ""name"": ""shipwreck""}, {""id"": 2660, ""name"": ""strong woman""}, {""id"": 3799, ""name"": ""ship""}, {""id"": 5740, ""name"": ""alliance""}, {""id"": 5941, ""name"": ""calypso""}, {""id"": 6155, ""name"": ""afterlife""}, {""id"": 6211, ""name"": ""fighter""}, {""id"": 12988, ""name"": ""pirate""}, {""id"": 157186, ""name"": ""swashbuckler""}, {""id"": 179430, ""name"": ""aftercreditsstinger""}]",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems.",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Jerry Bruckheimer Films"", ""id"": 130}, {""name"": ""Second Mate Productions"", ""id"": 19936}]","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [9]:
def extract_values(input_: str) -> str:
    """Extract the "name" values from a JSON-encoded list of dicts.

    Whitespace inside each name is removed so that a multi-word value
    (e.g. "Science Fiction") becomes a single token ("ScienceFiction").
    The tokens are then joined with single spaces.

    Params:
        input_ (str): A JSON string encoding a list of objects, each of which
            is expected to carry a "name" key.

    Returns:
        result (str): The space-separated tokens. Entries whose "name" is
            missing, null, not a string, or blank are skipped, so the result
            is an empty string when no usable name is found.

    Raises:
        json.JSONDecodeError: If `input_` is not valid JSON.
    """
    records = json.loads(input_)
    tokens = []
    for record in records:
        name = record.get("name")
        # A missing or null "name" yields None; skip it rather than failing
        # on `None.split()`.
        if not isinstance(name, str):
            continue
        # Removing all whitespace covers both the one-word and the
        # multi-word case; blank names collapse to "" and are dropped.
        token = "".join(name.split())
        if token:
            tokens.append(token)
    result = " ".join(tokens)
    return result


def concat_columns(col_a: str, col_b: str) -> str:
    """Join two values into a single string separated by one space.

    Params:
        col_a (str): The value placed first.
        col_b (str): The value placed second.

    Returns:
        result (str): `col_a`, a space, then `col_b`.
    """
    return "{} {}".format(col_a, col_b)

In [7]:
# List the column names of the loaded frame.
movie_db.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [8]:
# Keep the two JSON-encoded text columns that feed the model, plus the title.
columns = ["genres", "keywords", "title"]
train_df = movie_db[columns]
train_df.head(2)

Unnamed: 0,genres,keywords,title
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 878, ""name"": ""Science Fiction""}]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""space war""}, {""id"": 3388, ""name"": ""space colony""}, {""id"": 3679, ""name"": ""society""}, {""id"": 3801, ""name"": ""space travel""}, {""id"": 9685, ""name"": ""futuristic""}, {""id"": 9840, ""name"": ""romance""}, {""id"": 9882, ""name"": ""space""}, {""id"": 9951, ""name"": ""alien""}, {""id"": 10148, ""name"": ""tribe""}, {""id"": 10158, ""name"": ""alien planet""}, {""id"": 10987, ""name"": ""cgi""}, {""id"": 11399, ""name"": ""marine""}, {""id"": 13065, ""name"": ""soldier""}, {""id"": 14643, ""name"": ""battle""}, {""id"": 14720, ""name"": ""love affair""}, {""id"": 165...",Avatar
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic island""}, {""id"": 1319, ""name"": ""east india trading company""}, {""id"": 2038, ""name"": ""love of one's life""}, {""id"": 2052, ""name"": ""traitor""}, {""id"": 2580, ""name"": ""shipwreck""}, {""id"": 2660, ""name"": ""strong woman""}, {""id"": 3799, ""name"": ""ship""}, {""id"": 5740, ""name"": ""alliance""}, {""id"": 5941, ""name"": ""calypso""}, {""id"": 6155, ""name"": ""afterlife""}, {""id"": 6211, ""name"": ""fighter""}, {""id"": 12988, ""name"": ""pirate""}, {""id"": 157186, ""name"": ""swashbuckler""}, {""id"": 179430, ""name"": ""aftercreditsstinger""}]",Pirates of the Caribbean: At World's End


In [17]:
# Preprocess data
# Build `train_df` from `movie_db` rather than from `train_df` itself so the
# cell is idempotent: re-running it always parses the raw JSON strings instead
# of feeding already-extracted text back into `json.loads`.
train_df = movie_db[columns].assign(
    genres=lambda frame: frame["genres"].apply(extract_values),
    keywords=lambda frame: frame["keywords"].apply(extract_values),
)

train_df.head(2)

Unnamed: 0,genres,keywords,title
0,Action Adventure Fantasy ScienceFiction,cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d,Avatar
1,Adventure Fantasy Action,ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger,Pirates of the Caribbean: At World's End


In [25]:
# Combine the genre and keyword tokens of every row into one text feature.
combined_text = train_df.apply(
    lambda row: concat_columns(row["genres"], row["keywords"]), axis=1
)
train_df = train_df.assign(preprocessed_feature=combined_text)

train_df.head(2)

Unnamed: 0,genres,keywords,title,preprocessed_feature
0,Action Adventure Fantasy ScienceFiction,cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d,Avatar,Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d
1,Adventure Fantasy Action,ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger


In [28]:
# Vectorise the combined text with TF-IDF: English stop words are removed and
# the vocabulary is capped at 4,000 terms.
tfidf = TfidfVectorizer(stop_words="english", max_features=4_000)
X_tr = tfidf.fit_transform(train_df["preprocessed_feature"])

# One row per movie, one column per vocabulary term.
X_tr.shape

(4803, 4000)

In [34]:
# Draw 5 titles with a fixed seed so the same sample is shown on every run.
train_df["title"].sample(n=5, random_state=4)

3857                  Wicked Blood
3740    The Greatest Show on Earth
2329                    Prom Night
2222        Code Name: The Cleaner
2114          Return to Never Land
Name: title, dtype: object

In [37]:
# Create a pandas Series that maps each movie title (the Series index) to the
# DataFrame index of that movie (the Series values).
# NOTE(review): if a title occurs more than once, `.loc[title]` returns
# several values instead of one -- verify that titles are unique.
index_series = pd.Series(train_df.index, index=train_df["title"])
index_series.head()

title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
dtype: int64

In [44]:
# Look up the DataFrame index of the queried movie by its title.
idx = index_series.loc["John Carter"]
idx




In [75]:
# Dense view of the TF-IDF row selected by `idx`.
X_tr[idx].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [81]:
def preprocess_data(*, db: pd.DataFrame) -> pd.DataFrame:
    """Prepare the movie data for vectorisation.

    Selects the "genres", "keywords" and "title" columns, replaces the two
    JSON-encoded columns with their extracted tokens (see `extract_values`)
    and appends a "preprocessed_feature" column holding the genre and keyword
    tokens joined by a space (see `concat_columns`).

    Params:
        db (pd.DataFrame): Frame with "genres", "keywords" and "title"
            columns. It is not modified.

    Returns:
        pd.DataFrame: A new frame with the columns "genres", "keywords",
            "title" and "preprocessed_feature" (in that order).

    Raises:
        KeyError: If any of the three required columns is missing from `db`.
    """
    columns = ["genres", "keywords", "title"]
    # Selecting with a list of labels returns a new frame and `assign`
    # returns another one, so the caller's frame is never mutated and no
    # up-front copy of the whole input is needed.
    selected = db[columns]
    cleaned = selected.assign(
        genres=lambda x: x["genres"].apply(extract_values),
        keywords=lambda x: x["keywords"].apply(extract_values),
    )

    return cleaned.assign(
        preprocessed_feature=lambda x: x.apply(
            lambda row: concat_columns(row["genres"], row["keywords"]),
            axis="columns",
        )
    )


def get_tfidf(*, cleaned_data: pd.DataFrame, max_features: int = 4_000) -> Any:
    """Compute the TF-IDF matrix of the preprocessed text feature.

    Params:
        cleaned_data (pd.DataFrame): Frame holding the text to vectorise. The
            "preprocessed_feature" column is used when present; otherwise the
            last column is used (the original behaviour).
        max_features (int): Upper bound on the vocabulary size passed to
            `TfidfVectorizer`. Defaults to 4,000.

    Returns:
        The value returned by `TfidfVectorizer.fit_transform`: a SciPy sparse
        matrix with one row per input row, not a dense `np.ndarray`.
    """
    # Prefer the column by name so the result does not silently depend on
    # column order; fall back to the last column for other frames.
    feature = (
        "preprocessed_feature"
        if "preprocessed_feature" in cleaned_data.columns
        else cleaned_data.columns[-1]
    )
    tfidf = TfidfVectorizer(stop_words="english", max_features=max_features)
    X_tr = tfidf.fit_transform(cleaned_data[feature])
    return X_tr


def get_movie_index_series(*, db: pd.DataFrame) -> pd.Series:
    """Build a title -> row-index lookup.

    The returned pandas Series is indexed by the values of `db["title"]` and
    its values are the corresponding entries of `db.index`.
    """
    row_labels = db.index
    titles = db["title"]
    return pd.Series(row_labels, index=titles)


def select_n_movies(
    *, index_series: pd.Series, similarity: np.ndarray, n: int = 5
) -> list:
    """Return the titles ranked 2nd to (n + 1)-th by descending similarity.

    `similarity` is ordered positionally like `index_series`. The top-ranked
    position is skipped -- presumably because it is the queried movie itself
    (TODO confirm against the caller).
    """
    # Negating the scores turns the ascending argsort into a descending rank.
    descending_positions = (-similarity).argsort()
    chosen_positions = descending_positions[1 : n + 1]
    chosen_titles = index_series.iloc[chosen_positions].index
    return list(chosen_titles)

In [82]:
def movie_recommender(*, db: pd.DataFrame, movie_title: str) -> dict:
    """Recommend the 5 movies most similar to `movie_title`.

    The data is preprocessed, vectorised with TF-IDF and ranked by cosine
    similarity to the queried movie. Preprocessing and TF-IDF are recomputed
    on every call.

    NOTE(review): this notebook defines `movie_recommender` twice; keep the
    two definitions in sync or delete one of them.

    Params:
        db (pd.DataFrame): Frame with "genres", "keywords" and "title"
            columns.
        movie_title (str): Title of the movie to find neighbours for. When the
            title occurs more than once, its first occurrence is used.

    Returns:
        dict: `{"recommended_movies": [title, ...]}` with up to 5 titles,
            most similar first. The queried movie itself is never included.

    Raises:
        KeyError: If `movie_title` is not present in `db["title"]`.
    """
    processed_data = preprocess_data(db=db)
    X_tfidf = get_tfidf(cleaned_data=processed_data)
    # Obtain Series containing movie index and movie titles
    index_series = get_movie_index_series(db=processed_data)

    # Work with row *positions*: the TF-IDF matrix and `select_n_movies` are
    # positional, whereas the Series values are index labels that only equal
    # positions for a default RangeIndex. Taking the first match also keeps
    # the query a single row when a title is duplicated.
    positions = np.flatnonzero(index_series.index == movie_title)
    if positions.size == 0:
        raise KeyError(f"movie title not found in db: {movie_title!r}")
    query_pos = int(positions[0])

    # Calculate TFIDF of queried movie
    query_tfidf = X_tfidf[query_pos]
    # Calculate similarity and flatten it (i.e convert to 1-D)
    similarity = cosine_similarity(X=query_tfidf, Y=X_tfidf).flatten()
    # `select_n_movies` drops the top-ranked entry. Force the queried movie
    # into that slot so it is the one dropped even when other rows tie with
    # it (identical features) or when its vector is all zeros.
    similarity[query_pos] = np.inf
    rec_movies = select_n_movies(index_series=index_series, similarity=similarity, n=5)
    return {"recommended_movies": rec_movies}

In [48]:
# TF-IDF row selected by `idx` (left in sparse form).
query_tfidf = X_tr[idx]
query_tfidf

<1x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [54]:
# Calculate the cosine similarity between the queried row and every row of
# the TF-IDF matrix.
similarity = cosine_similarity(X=query_tfidf, Y=X_tr)

# Flatten the similarity (i.e. convert it to 1-D)
similarity = similarity.flatten()
similarity

array([0.1508818 , 0.02030212, 0.07185168, ..., 0.        , 0.        ,
       0.        ])

In [59]:
# Get the positions that would sort the similarity scores.
# argsort is ascending, so the scores are negated to rank them in descending order.
(-similarity).argsort()

array([   4, 3904,  752, ..., 2378, 2364, 4802])

In [79]:
# Titles ranked 2nd to 6th by similarity; rank 1 is skipped.
index_series.iloc[(-similarity).argsort()[1:6]]

title
Spaced Invaders          3904
My Favorite Martian       752
The Last Days on Mars    2964
Guiana 1838              4384
Mission to Mars           373
dtype: int64

In [84]:
def movie_recommender(*, db: pd.DataFrame, movie_title: str) -> dict:
    """Recommend the 5 movies most similar to `movie_title`.

    The data is preprocessed, vectorised with TF-IDF and ranked by cosine
    similarity to the queried movie. Preprocessing and TF-IDF are recomputed
    on every call.

    NOTE(review): this notebook defines `movie_recommender` twice; keep the
    two definitions in sync or delete one of them.

    Params:
        db (pd.DataFrame): Frame with "genres", "keywords" and "title"
            columns.
        movie_title (str): Title of the movie to find neighbours for. When the
            title occurs more than once, its first occurrence is used.

    Returns:
        dict: `{"recommended_movies": [title, ...]}` with up to 5 titles,
            most similar first. The queried movie itself is never included.

    Raises:
        KeyError: If `movie_title` is not present in `db["title"]`.
    """
    processed_data = preprocess_data(db=db)
    X_tfidf = get_tfidf(cleaned_data=processed_data)
    # Obtain Series containing movie index and movie titles
    index_series = get_movie_index_series(db=processed_data)

    # Work with row *positions*: the TF-IDF matrix and `select_n_movies` are
    # positional, whereas the Series values are index labels that only equal
    # positions for a default RangeIndex. Taking the first match also keeps
    # the query a single row when a title is duplicated.
    positions = np.flatnonzero(index_series.index == movie_title)
    if positions.size == 0:
        raise KeyError(f"movie title not found in db: {movie_title!r}")
    query_pos = int(positions[0])

    # Calculate TFIDF of queried movie
    query_tfidf = X_tfidf[query_pos]
    # Calculate similarity and flatten it (i.e convert to 1-D)
    similarity = cosine_similarity(X=query_tfidf, Y=X_tfidf).flatten()
    # `select_n_movies` drops the top-ranked entry. Force the queried movie
    # into that slot so it is the one dropped even when other rows tie with
    # it (identical features) or when its vector is all zeros.
    similarity[query_pos] = np.inf
    rec_movies = select_n_movies(index_series=index_series, similarity=similarity, n=5)
    return {"recommended_movies": rec_movies}

In [85]:
# Run the full recommendation pipeline for one title.
movie_title = "John Carter"
movie_recommender(db=movie_db, movie_title=movie_title)

{'recommended_movies': ['Spaced Invaders',
  'My Favorite Martian',
  'The Last Days on Mars',
  'Guiana 1838',
  'Mission to Mars']}