In [None]:
import toolbox as tb
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the main movie table; "Unnamed: 0" is dropped right away
# (presumably the index column written by an earlier to_csv — confirm).
movies_df = pd.read_csv("../wth_actors_name/data/main_movie.csv").drop(columns="Unnamed: 0")

In [None]:
# Discard the columns that are not used by the rest of this notebook.
movies_df = movies_df.drop(columns=["titleType", "endYear", "originalTitle", "awards", "budget", "isAdult"])

### Dealing with NaN and converting to numerical dtypes

In [None]:
# Dealing with NaN values
# Keep only rows that have a vote count and whose runtime is not the "\\N"
# string (presumably the raw data's marker for a missing runtime — confirm).
has_votes = movies_df["numVotes"].notnull()
has_runtime = movies_df["runtimeMinutes"] != "\\N"

# .copy() makes the filtered frame independent of the original, so the column
# assignment below is not a write on a slice (avoids SettingWithCopyWarning).
movies_df = movies_df[has_votes & has_runtime].copy()

# A missing nomination count is treated as zero nominations.
movies_df["totalNoms"] = movies_df["totalNoms"].fillna(0)

In [None]:
# Converting to int or float dtypes
movies_df = movies_df.astype({"numVotes": int, "runtimeMinutes": int, "totalNoms": int})

# worldwideGross holds text with "$" and "," formatting, or NaN when unknown:
# strip the formatting from the strings and leave every other value untouched.
wwg_list = [
    int(gross.replace("$", "").replace(",", "")) if isinstance(gross, str) else gross
    for gross in movies_df["worldwideGross"]
]
movies_df["worldwideGross"] = wwg_list

# Impute the unknown grosses with the median of the known ones.
wwg_median = movies_df["worldwideGross"].median()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form may act on a temporary copy and leave the
# frame unchanged.
movies_df["worldwideGross"] = movies_df["worldwideGross"].fillna(wwg_median).astype(int)

# Renumber the rows 0..n-1 after the filtering done earlier; drop=True discards
# the old labels instead of adding an "index" column that then has to be removed.
movies_df = movies_df.reset_index(drop=True)

### One-hot encode all non-numerical features

In [None]:
def create_features_ohe(df, column):
    """One-hot encode a column whose cells are comma-separated strings.

    For every distinct value found in ``df[column]`` a new 0/1 integer column
    named ``f"{column}_{value.lower()}"`` is added to ``df`` **in place**; a row
    gets 1 in each column whose value appears in its cell. The source column
    itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. Any index is accepted; rows are matched by position.
    column : str
        Name of the column holding strings such as ``"Action,Drama"``. Cells
        that are not strings (e.g. NaN) are treated as having no values.

    Returns
    -------
    None
        The new columns are written directly into ``df``.
    """
    # Split every cell into its values. Surrounding whitespace is stripped so
    # that "USA, UK" and "UK,USA" produce the same columns, and empty tokens
    # (e.g. from a trailing comma) are ignored.
    values_per_row = []
    for cell in df[column]:
        if isinstance(cell, str):
            tokens = [token.strip() for token in cell.split(",")]
            values_per_row.append([token for token in tokens if token])
        else:
            values_per_row.append([])

    # Distinct values, sorted, then mapped to column names. dict.fromkeys
    # removes names that collide after lower-casing while keeping the order.
    distinct_values = sorted({value for values in values_per_row for value in values})
    column_names = list(dict.fromkeys(f"{column}_{value.lower()}" for value in distinct_values))
    column_position = {name: pos for pos, name in enumerate(column_names)}

    # Fill a dense 0/1 matrix by row *position*; the previous df.loc[i, ...]
    # writes treated the position as an index label and appended rows whenever
    # the index was not exactly 0..n-1.
    flags = np.zeros((len(values_per_row), len(column_names)), dtype=np.int64)
    for row_pos, values in enumerate(values_per_row):
        for value in values:
            flags[row_pos, column_position[f"{column}_{value.lower()}"]] = 1

    # Assigning an ndarray to a column is positional, so any index works.
    for name, pos in column_position.items():
        df[name] = flags[:, pos]

    return

**Features to OneHotEncode:** "genres", "directors", "actors", "writers", "country", "language"

In [None]:
# One-hot encode each multi-valued text column in turn; every call adds its
# new columns to movies_df in place.
for multi_valued_column in ("genres", "directors", "writers", "actors", "country", "language"):
    create_features_ohe(movies_df, multi_valued_column)

In [None]:
# The raw text columns are no longer needed once their one-hot columns exist.
movies_df = movies_df.drop(columns=["genres", "directors", "writers", "actors", "country", "language"])

In [None]:
# Persist the encoded table. The index is written too, which is why readers of
# this file drop an "Unnamed: 0" column.
movies_df.to_csv("../wth_actors_name/data/ohe_movie.csv")

## Feature Analysis

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of startYear: box plot first, then histogram, each as its own figure.
for draw_plot in (plt.boxplot, plt.hist):
    draw_plot(movies_df["startYear"])
    plt.show()

In [None]:
# Distribution of worldwideGross: box plot first, then histogram, each as its own figure.
for draw_plot in (plt.boxplot, plt.hist):
    draw_plot(movies_df["worldwideGross"])
    plt.show()

In [None]:
# Distribution of numVotes: box plot first, then histogram.
plt.boxplot(movies_df["numVotes"])
plt.show()
plt.hist(movies_df["numVotes"])
# plt.show must be *called*; the bare attribute only evaluated to the function
# object and never explicitly rendered the histogram.
plt.show()

In [None]:
# Distribution of runtimeMinutes: box plot first, then histogram, each as its own figure.
for draw_plot in (plt.boxplot, plt.hist):
    draw_plot(movies_df["runtimeMinutes"])
    plt.show()

### Feature scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

In [None]:
# Reload the one-hot-encoded table from disk; "Unnamed: 0" is the index column
# that to_csv stored alongside the data.
ohe_df = pd.read_csv("../wth_actors_name/data/ohe_movie.csv").drop(columns=["Unnamed: 0"])

In [None]:
# Min-max scale every column from the third one onwards, leaving the first two
# columns untouched.
feature_columns = ohe_df.columns[2:]
scaler = MinMaxScaler()
ohe_df[feature_columns] = scaler.fit_transform(ohe_df[feature_columns])

In [None]:
# Persist the scaled table (index included) for the modelling sections below.
ohe_df.to_csv("../wth_actors_name/data/ohe_df_scaled.csv")

**Note: Consider using multiple KNNs. You have to separate the datasets into movie-directors-genre, movie-writers-genre, movie-actors-genre and so forth. Maybe some extra features for each dataset.**

**Consider bootstrapping models in order of importance of the features**

### KNN Model (Unsupervised)

In [None]:
# Load the scaled table and drop the stored index column together with three
# award columns that are left out of the model.
ohe_df_scaled = pd.read_csv("../wth_actors_name/data/ohe_df_scaled.csv").drop(columns=["Unnamed: 0", "oscarsNom", "goldenGlobesWon", "goldenGlobesNom"])

In [None]:
# Feature matrix: every column after the first two.
X = ohe_df_scaled.iloc[:, 2:]

In [None]:
# Unsupervised nearest-neighbour index over the full feature matrix; 8 is the
# default neighbour count used when a query does not specify one.
nn_model = NearestNeighbors(n_neighbors=8)
nn_model.fit(X)

Movie Inputted:
0 -> The Dark Knight

In [None]:
# Show the row(s) for the query movie to find its index label.
ohe_df_scaled[ohe_df_scaled["primaryTitle"] == "The Dark Knight" ]

In [None]:
# Query with the row labelled 0 and ask for 10 neighbours (this overrides the
# n_neighbors=8 given when the model was created). Returns (distances, positions).
nn_model.kneighbors(X.loc[[0]], 10)

**Recommendations:**<br>
29 -> Batman Begins<br>
18 -> The Dark Knight Rises<br>
418 -> The Prestige<br>
3 -> Inception<br>
85 -> Interstellar<br>
1040 -> Lucky Number Slevin<br>
250 -> Anomalisa<br>
26 -> Paan Singh Tomar<br>
380 -> Se7en

Features order of importance:
- 1. genre
- 1. director
- 1. actors


Try bootstrapping with two models and check how good the results are.

## Bootstrapping models

In [2]:
def select_features(ohe_df, features):
    """
    Build a reduced frame containing only the requested feature groups.

    The result always starts with the identifier columns "tconst" and
    "primaryTitle". Requested numeric columns follow in a fixed order
    (startYear, runtimeMinutes, averageRating, numVotes, worldwideGross,
    oscarsWon, totalWins, totalNoms), then the one-hot columns of each
    requested group in the order genres, writers, actors, countries,
    languages, directors.

    Parameters
    ----------
    ohe_df : pandas.DataFrame
        One-hot-encoded movie frame; must contain "tconst", "primaryTitle" and
        every requested numeric column.
    features : collection of str
        Names of the wanted features. Group names are "genres", "writers",
        "actors", "directors", "countries" (columns prefixed "country_") and
        "languages" (columns prefixed "language_"). Unknown names, including
        "tconst"/"primaryTitle", are ignored.

    Returns
    -------
    pandas.DataFrame
        ``ohe_df`` restricted to the selected columns.
    """
    # (name accepted in `features`, prefix of the matching one-hot columns),
    # listed in output order. "languages" previously searched for the text
    # "languages", which never occurs in the "language_*" column names, so the
    # group was always empty.
    one_hot_groups = (
        ("genres", "genres_"),
        ("writers", "writers_"),
        ("actors", "actors_"),
        ("countries", "country_"),
        ("languages", "language_"),
        ("directors", "directors_"),
    )
    numeric_columns = (
        "startYear",
        "runtimeMinutes",
        "averageRating",
        "numVotes",
        "worldwideGross",
        "oscarsWon",
        "totalWins",
        "totalNoms",
    )

    all_features = ["tconst", "primaryTitle"]
    all_features += [name for name in numeric_columns if name in features]

    columns = list(ohe_df.columns)
    for group_name, prefix in one_hot_groups:
        if group_name in features:
            # Match on the prefix only: a substring search would also pick up
            # columns of other groups whose value merely contains the word
            # (e.g. a person's name containing "country").
            all_features += [col for col in columns if col.startswith(prefix)]

    return ohe_df[all_features]


def print_recommendations(df, model, input_movie, amount):
    """Print the ``amount`` nearest neighbours of ``input_movie``.

    One line is printed per neighbour, closest first, in the form
    ``"<tconst>: <primaryTitle>  :  <distance>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "tconst" and "primaryTitle" columns whose rows are in the
        same order as the data ``model`` was fitted on.
    model : object
        Fitted model exposing ``kneighbors(X, n_neighbors)`` and returning
        ``(distances, positions)`` arrays, e.g. sklearn's NearestNeighbors.
    input_movie : array-like
        A single query row (shape ``(1, n_features)``).
    amount : int
        Number of neighbours to print.

    Returns
    -------
    None
    """
    distances, positions = model.kneighbors(input_movie, amount)

    # Only one query row is passed in, so the first result row is the answer.
    for position, distance in zip(positions.tolist()[0], distances.tolist()[0]):
        # kneighbors reports row *positions* in the fitted data, so look the
        # row up with iloc; label-based .loc is only right for a 0..n-1 index.
        movie = df.iloc[position]
        print(f'{movie["tconst"]}: {movie["primaryTitle"]}  :  {distance}')

    return


def get_movie_id(movieName, csv_path="../wth_actors_name/data/main_movie.csv"):
    """Return the ``tconst`` identifier of the first movie titled ``movieName``.

    Parameters
    ----------
    movieName : str
        Exact value to look for in the "primaryTitle" column.
    csv_path : str, optional
        CSV file to search; defaults to the notebook's main movie table.

    Returns
    -------
    str
        The "tconst" of the first matching row.

    Raises
    ------
    IndexError
        If no row has that title.
    """
    # Only the two columns involved in the lookup are read from disk.
    movies = pd.read_csv(csv_path, usecols=["tconst", "primaryTitle"])
    matches = movies.loc[movies["primaryTitle"] == movieName, "tconst"]

    if matches.empty:
        # Same exception type as the former list(...)[0], with a message that
        # says what was actually missing.
        raise IndexError(f"no movie titled {movieName!r} found in {csv_path}")

    return matches.iloc[0]

def get_movie_index(df, movieName):
    """Return the index label of the first row of ``df`` titled ``movieName``.

    Raises IndexError when no row has that "primaryTitle".
    """
    title_matches = df["primaryTitle"] == movieName
    matching_rows = df.loc[title_matches]
    return matching_rows.index[0]

**MODEL 1**:<br>
Features: Genres, Directors, Countries

In [8]:
# Read the scaled table that the "Feature scaling" section of this notebook
# writes ("ohe_df_scaled.csv"). The previous name, "ohe_movie_scaled.csv", is
# never produced by this notebook, so a fresh run-all could not find it.
# The stored index column and three unused award columns are dropped.
ohe_df_scaled = pd.read_csv("../wth_actors_name/data/ohe_df_scaled.csv").drop(columns=["Unnamed: 0", "oscarsNom", "goldenGlobesWon", "goldenGlobesNom"])

In [9]:
# Title of the movie to get recommendations for; used by both models below.
movie_name = "Pulp Fiction"

In [10]:
# Model 1 works on the genre, director and country one-hot columns only.
df_1 = select_features(ohe_df_scaled, ["genres", "directors", "countries"])
# The identifier columns are not features, so they are removed from the matrix.
X_1 = df_1.drop(columns=["tconst", "primaryTitle"])

In [11]:
# Index label of the query movie inside df_1 (and therefore X_1).
movie_index = get_movie_index(df_1, movie_name)

In [12]:
# First-stage nearest-neighbour model, fitted on the model-1 feature matrix.
model_1 = NearestNeighbors()
model_1.fit(X_1)

NearestNeighbors()

In [13]:
# Rank every movie by its distance to the query movie. Only one query row is
# passed in, so the first row of each result array is taken.
query_row = X_1.loc[[movie_index]]
distance, index = (result.tolist()[0] for result in model_1.kneighbors(query_row, X_1.shape[0]))

# Keep the 101 closest entries (the query movie included) plus any further
# ones that are tied with the 101st distance.
last_dist = distance[100]
chosen_ones = []
for neighbour, dist in zip(index, distance):
    if dist > last_dist:
        break
    chosen_ones.append(neighbour)
    

In [14]:
# Number of movies that passed the first filter (ties at the cut-off included).
len(chosen_ones)

132

In [15]:
# List every movie kept by the first filter together with its distance.
print_recommendations(df_1, model_1, X_1.loc[[movie_index]], len(chosen_ones))

tt0110912: Pulp Fiction  :  0.0
tt0105236: Reservoir Dogs  :  1.0
tt3460252: The Hateful Eight  :  1.0
tt0086250: Scarface  :  1.4142135623730951
tt0071562: The Godfather: Part II  :  1.4142135623730951
tt1853728: Django Unchained  :  1.4142135623730951
tt0266697: Kill Bill: Vol. 1  :  1.4142135623730951
tt0075314: Taxi Driver  :  1.4142135623730951
tt0106469: Blood In, Blood Out  :  1.4142135623730951
tt0068646: The Godfather  :  1.4142135623730951
tt0117381: Primal Fear  :  1.7320508075688772
tt0085794: The King of Comedy  :  1.7320508075688772
tt0405296: A Scanner Darkly  :  1.7320508075688772
tt0114558: Strange Days  :  1.7320508075688772
tt0410097: Hustle & Flow  :  1.7320508075688772
tt0469494: There Will Be Blood  :  1.7320508075688772
tt1990314: Robot & Frank  :  1.7320508075688772
tt0094226: The Untouchables  :  1.7320508075688772
tt0106519: Carlito's Way  :  1.7320508075688772
tt0186151: Frequency  :  1.7320508075688772
tt0443706: Zodiac  :  1.7320508075688772
tt0169547: Amer

In [16]:
# Collect the full rows of the movies kept by model 1, preserving their ranking.
# All rows are selected in a single call: growing a frame with
# DataFrame.append inside a loop is quadratic, and append was removed in
# pandas 2.0. chosen_ones holds row positions reported by kneighbors, hence iloc.
# reset_index() keeps the original row labels in an "index" column.
first_filter = ohe_df_scaled.iloc[chosen_ones].reset_index()

**MODEL 2:**<br>
Features: Genres, Actors, Directors, Writers, numVotes, averageRating

In [17]:
# Preview the first rows of the model-1 selection.
first_filter.head(8)

Unnamed: 0,index,tconst,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,worldwideGross,oscarsWon,totalWins,...,language_turkish,language_ukrainian,language_ungwatsi,language_urdu,language_vietnamese,language_wolof,language_xhosa,language_yiddish,language_yoruba,language_zulu
0,377,tt0110912,Pulp Fiction,0.48,0.392727,0.948718,0.779396,0.076463,0.090909,0.233108,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,388,tt0105236,Reservoir Dogs,0.44,0.192727,0.871795,0.391865,0.001033,0.0,0.040541,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,729,tt3460252,The Hateful Eight,0.9,0.443636,0.807692,0.221118,0.05592,0.090909,0.138514,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,409,tt0106469,"Blood In, Blood Out",0.46,0.487273,0.833333,0.01228,0.001607,0.0,0.003378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,46,tt0266697,Kill Bill: Vol. 1,0.66,0.236364,0.846154,0.426775,0.064658,0.0,0.097973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,397,tt0075314,Taxi Driver,0.12,0.247273,0.858974,0.309559,0.010165,0.0,0.070946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,422,tt1853728,Django Unchained,0.84,0.432727,0.884615,0.57993,0.152036,0.181818,0.189189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,376,tt0071562,The Godfather: Part II,0.08,0.567273,0.961538,0.482317,0.017169,0.545455,0.037162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Model 2 re-ranks the model-1 selection using a richer feature set.
model_2_features = [
    "tconst",
    "primaryTitle",
    "genres",
    "actors",
    "directors",
    "writers",
    "numVotes",
    "averageRating",
]
df_2 = select_features(first_filter, model_2_features)

# The identifier columns are not features, so they are removed from the matrix.
identifier_columns = ["tconst", "primaryTitle"]
X_2 = df_2.drop(columns=identifier_columns)

In [19]:
# Index label of the query movie inside the filtered frame df_2 (and X_2).
movie_index = get_movie_index(df_2, movie_name)

In [20]:
# Second-stage nearest-neighbour model, fitted only on the model-1 selection.
model_2 = NearestNeighbors()
model_2.fit(X_2)

NearestNeighbors()

In [84]:
# Rank every movie of the model-1 selection by distance to the query movie.
# Only one query row is passed in, so the first row of each result is taken.
query_row = X_2.loc[[movie_index]]
distance, index = (result.tolist()[0] for result in model_2.kneighbors(query_row, X_2.shape[0]))

In [85]:
# Keep the 31 closest entries (the query movie included) plus any further ones
# that are tied with the 31st distance.
last_dist = distance[30]
chosen_ones = []
for neighbour, dist in zip(index, distance):
    if dist > last_dist:
        break
    chosen_ones.append(neighbour)
    

In [86]:
# Number of movies that passed the second filter (ties at the cut-off included).
len(chosen_ones)

31

In [87]:
# Final recommendations: the 10 nearest movies according to model 2
# (the query movie itself comes first, at distance 0).
print_recommendations(df_2, model_2, X_2.loc[[movie_index]], 10)

tt0468569: The Dark Knight  :  0.0
tt1345836: The Dark Knight Rises  :  3.8883117481044263
tt0372784: Batman Begins  :  4.023822501622813
tt0482571: The Prestige  :  4.819690888598056
tt1375666: Inception  :  5.001068597383764
tt0110413: Léon: The Professional  :  5.02957430813033
tt5013056: Dunkirk  :  5.0575494639552865
tt0425210: Lucky Number Slevin  :  5.075385300247714
tt0816692: Interstellar  :  5.110322105477069
tt0114369: Se7en  :  5.209247369959478


PROBLEM: Movies with less information can be more similar than movies with more. Movies with more information have more opportunities to be different than movies with little information, where everything that is missing is filled with 0.

A good example of this is: ["tt5317732", "The Little Prince"]

**For better recommendations, we need more movies in the database. In the case of "The Dark Knight", after 5 movies there are already only 2 things in common**