In [1]:
import toolbox as tb
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the raw movie table. "Unnamed: 0" is discarded straight away
# (presumably the row index written by an earlier to_csv -- TODO confirm).
movies_df = pd.read_csv("../wth_actors_name/data/main_movie.csv").drop(columns="Unnamed: 0")

In [None]:
# Remove the columns that none of the cleaning / encoding cells below read.
movies_df = movies_df.drop(columns=["titleType", "endYear", "originalTitle", "awards", "budget", "isAdult"])

### Dealing with NaN and converting to numerical dtypes

In [None]:
# Dealing with NaN values
# Keep only rows that have a vote count and whose runtime is not the "\\N"
# placeholder string.
has_votes = movies_df["numVotes"].notnull()
has_runtime = movies_df["runtimeMinutes"] != "\\N"

# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignment below writes to a real frame instead of a slice
# (this is what triggers pandas' SettingWithCopyWarning otherwise).
movies_df = movies_df[has_votes & has_runtime].copy()

# A missing nomination count is treated as zero nominations.
movies_df["totalNoms"] = movies_df["totalNoms"].fillna(0)

In [None]:
# Converting to int or float dtypes
movies_df = movies_df.astype({"numVotes": int, "runtimeMinutes": int, "totalNoms": int})


def _parse_gross(value):
    """Turn a string such as "$1,234,567" into an int; pass anything else (NaN) through."""
    if isinstance(value, str):
        return int(value.replace("$", "").replace(",", ""))
    return value


# Parsed values are ints, gaps stay NaN; to_numeric gives a numeric Series
# so that median()/fillna() behave numerically.
gross = pd.to_numeric(movies_df["worldwideGross"].map(_parse_gross))

# Fill the gaps with the median and assign the result back to the frame.
# (Calling fillna(..., inplace=True) on movies_df["worldwideGross"] mutates a
# temporary column object and is not guaranteed to reach the frame.)
movies_df["worldwideGross"] = gross.fillna(gross.median()).astype(int)

# Rows were filtered out earlier; renumber 0..n-1 and discard the old labels.
movies_df = movies_df.reset_index(drop=True)

### One-hot encode all non-numerical features

In [None]:
def create_features_ohe(df, column):
    """One-hot encode a column of comma-separated strings, in place.

    For every distinct value found in ``df[column]`` a new integer column named
    ``f"{column}_{value.lower()}"`` is added to ``df``. It holds 1 on the rows
    whose cell lists that value and 0 everywhere else. Cells that are not
    strings (e.g. NaN) contribute no values.

    Args:
        df: DataFrame to extend. It is modified in place.
        column: Name of the column holding the comma-separated values.

    Returns:
        None.
    """
    # Split each cell once; a non-string cell becomes an empty list.
    split_rows = [cell.split(",") if isinstance(cell, str) else [] for cell in df[column]]

    # One zero-filled array per output column, created in sorted order of the
    # raw values so the column order is deterministic. Values that differ only
    # by case share a column because the column name is lower-cased.
    row_count = len(split_rows)
    encoded = {}
    for feature in sorted({feature for row in split_rows for feature in row}):
        encoded.setdefault(f"{column}_{feature.lower()}", np.zeros(row_count, dtype=np.int64))

    # Fill by row *position*, so the result is correct whatever index df has.
    # (Label-based df.loc[i, ...] with an enumerate counter appends new rows
    # when the index is not exactly 0..n-1.)
    for position, row in enumerate(split_rows):
        for feature in row:
            encoded[f"{column}_{feature.lower()}"][position] = 1

    for name, values in encoded.items():
        df[name] = values

    return

**Features to OneHotEncode:** "genres", "directors", "actors", "writers", "country", "language"

In [None]:
# Encode each multi-valued column in turn; the order of the tuple fixes the
# order in which the new column groups are appended to movies_df.
for multi_valued_column in ("genres", "directors", "writers", "actors", "country", "language"):
    create_features_ohe(movies_df, multi_valued_column)

In [None]:
# The raw comma-separated columns are no longer needed once their one-hot
# columns have been added by create_features_ohe.
movies_df = movies_df.drop(columns=["genres", "directors", "writers", "actors", "country", "language"])

In [None]:
# Persist the encoded table. The index is written as well (to_csv default);
# the cell that reloads this file drops it again as "Unnamed: 0".
movies_df.to_csv("../wth_actors_name/data/ohe_movie.csv")

## Feature Analysis

In [None]:
import matplotlib.pyplot as plt

In [None]:
# startYear: box plot first, then histogram, each shown as its own figure.
for draw in (plt.boxplot, plt.hist):
    draw(movies_df["startYear"])
    plt.show()

In [None]:
# worldwideGross: box plot first, then histogram, each shown as its own figure.
for draw in (plt.boxplot, plt.hist):
    draw(movies_df["worldwideGross"])
    plt.show()

In [None]:
# numVotes: box plot first, then histogram, each shown as its own figure.
plt.boxplot(movies_df["numVotes"])
plt.show()
plt.hist(movies_df["numVotes"])
# plt.show must be *called*; a bare `plt.show` only evaluates to the function
# object and leaves the figure to implicit rendering.
plt.show()

In [None]:
# runtimeMinutes: box plot first, then histogram, each shown as its own figure.
for draw in (plt.boxplot, plt.hist):
    draw(movies_df["runtimeMinutes"])
    plt.show()

### Feature scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

In [None]:
# Reload the encoded table from disk; "Unnamed: 0" is the index column that
# to_csv wrote alongside the data.
ohe_df = pd.read_csv("../wth_actors_name/data/ohe_movie.csv").drop(columns=["Unnamed: 0"])

In [None]:
# Min-max scale every column after the first two (the first two are left
# untouched -- presumably tconst and primaryTitle; confirm against the CSV).
feature_columns = ohe_df.columns[2:]
scaler = MinMaxScaler().fit(ohe_df[feature_columns])
ohe_df[feature_columns] = scaler.transform(ohe_df[feature_columns])

In [None]:
# Persist the scaled table. The index is written too and is dropped as
# "Unnamed: 0" by the cells that read this file back.
ohe_df.to_csv("../wth_actors_name/data/ohe_df_scaled.csv")

**Note: Consider using multiple KNNs. You would have to separate the dataset into movie-directors-genre, movie-writers-genre, movie-actors-genre and so forth, maybe with some extra features for each dataset.**

**Consider bootstrapping models in order of importance of the features**

### KNN Model (Unsupervised)

In [None]:
# Load the scaled table, dropping the saved index column and three award
# columns that are not used as model features in this notebook.
ohe_df_scaled = pd.read_csv("../wth_actors_name/data/ohe_df_scaled.csv").drop(columns=["Unnamed: 0", "oscarsNom", "goldenGlobesWon", "goldenGlobesNom"])

In [None]:
# Feature matrix: every column from the third one onwards.
X = ohe_df_scaled.iloc[:, 2:]

In [None]:
# Unsupervised nearest-neighbour index over all features; 8 is the default
# neighbour count used when a query does not pass its own.
nn_model = NearestNeighbors(n_neighbors=8)
nn_model.fit(X)

Movie Inputted:
0 -> The Dark Knight

In [None]:
# Show the row(s) for the query title so its index label can be read off.
ohe_df_scaled[ohe_df_scaled["primaryTitle"] == "The Dark Knight" ]

In [None]:
# Query with the row labelled 0. The explicit 10 overrides the n_neighbors=8
# given at construction; the result is a (distances, indices) pair.
nn_model.kneighbors(X.loc[[0]], 10)

**Recommendations:**<br>
29 -> Batman Begins<br>
18 -> The Dark Knight Rises<br>
418 -> The Prestige<br>
3 -> Inception<br>
85 -> Interstellar<br>
1040 -> Lucky Number Slevin<br>
250 -> Anomalisa<br>
26 -> Paan Singh Tomar<br>
380 -> Se7en

Features order of importance:
1. genre
2. director
3. actors


Try bootstrapping with two models and check how good the results are.

## Bootstrapping models

In [185]:
def select_features(ohe_df, features):
    """Return the subset of ``ohe_df`` columns named by ``features``.

    The result always starts with "tconst" and "primaryTitle", followed by the
    requested scalar columns and then the requested one-hot column groups.

    Args:
        ohe_df: One-hot encoded DataFrame. It must contain "tconst",
            "primaryTitle" and every scalar column that is requested.
        features: Collection of feature keys. Recognised keys are the scalar
            column names "startYear", "runtimeMinutes", "averageRating",
            "numVotes", "worldwideGross", "oscarsWon", "totalWins",
            "totalNoms", and the group keys "genres", "writers", "actors",
            "countries", "languages", "directors". Other entries are ignored.

    Returns:
        ``ohe_df`` restricted to the selected columns.

    Raises:
        KeyError: If a selected column is missing from ``ohe_df``.
    """
    # Scalar columns are copied by name, in this fixed order.
    scalar_columns = (
        "startYear",
        "runtimeMinutes",
        "averageRating",
        "numVotes",
        "worldwideGross",
        "oscarsWon",
        "totalWins",
        "totalNoms",
    )

    # Group key -> column-name prefix, in the order the groups are appended.
    # The one-hot columns are named "<source column>_<value>", and the source
    # columns for the last two keys are singular ("country", "language").
    # Searching for the substring "languages" therefore never matched anything.
    encoded_groups = (
        ("genres", "genres_"),
        ("writers", "writers_"),
        ("actors", "actors_"),
        ("countries", "country_"),
        ("languages", "language_"),
        ("directors", "directors_"),
    )

    all_features = ["tconst", "primaryTitle"]
    all_features += [name for name in scalar_columns if name in features]

    for key, prefix in encoded_groups:
        if key in features:
            # Prefix match (not substring match) so that a value which happens
            # to contain a group word, e.g. "actors_country joe", is not
            # pulled into the wrong group.
            all_features += [name for name in ohe_df.columns if name.startswith(prefix)]

    return ohe_df[all_features]


def print_recommendations(df, model, input_movie, amount):
    """Print the ``amount`` nearest neighbours of ``input_movie``.

    One line is printed per neighbour, in the order the model returns them,
    formatted as ``"<tconst>: <primaryTitle>  :  <distance>"``.

    Args:
        df: DataFrame with "tconst" and "primaryTitle" columns whose rows are
            in the same order as the rows ``model`` was fitted on.
        model: Fitted object exposing ``kneighbors(X, n_neighbors)`` that
            returns ``(distances, indices)``, e.g. sklearn's NearestNeighbors.
        input_movie: Single-row feature frame/array to query with.
        amount: Number of neighbours to print.

    Returns:
        None.
    """
    distances, positions = model.kneighbors(input_movie, amount)

    # Only one query row is passed, so only the first result row is used.
    distances = distances.tolist()[0]
    positions = positions.tolist()[0]

    for position, distance in zip(positions, distances):
        # kneighbors reports row positions within the fitted matrix, not index
        # labels, so the row is fetched with iloc rather than loc.
        movie = df.iloc[position]
        print(movie["tconst"] + ": " + movie["primaryTitle"] + "  :  " + str(distance))

    return


def get_movie_id(movieName, csv_path="../wth_actors_name/data/main_movie.csv"):
    """Return the ``tconst`` of the first movie whose title equals ``movieName``.

    Args:
        movieName: Exact "primaryTitle" to look for.
        csv_path: CSV file to search; it must have "primaryTitle" and "tconst"
            columns. Defaults to the notebook's main movie table.

    Returns:
        The "tconst" value of the first matching row.

    Raises:
        IndexError: If no row has that title.
    """
    movies = pd.read_csv(csv_path)
    matching_ids = movies.loc[movies["primaryTitle"] == movieName, "tconst"].tolist()

    if not matching_ids:
        # Same exception type as the bare [0] lookup, but with a usable message.
        raise IndexError(f"no movie titled {movieName!r} in {csv_path}")

    return matching_ids[0]

def get_movie_index(df, movieName):
    """Return the index label of the first row of ``df`` titled ``movieName``.

    Args:
        df: DataFrame with a "primaryTitle" column.
        movieName: Exact title to look for.

    Returns:
        The index label (not the position) of the first matching row.

    Raises:
        IndexError: If no row has that title.
    """
    matching_labels = df.index[(df["primaryTitle"] == movieName).to_numpy()]

    if len(matching_labels) == 0:
        # Same exception type as the bare [0] lookup, but it names the title.
        raise IndexError(f"no movie titled {movieName!r} in the given DataFrame")

    return matching_labels[0]

**MODEL 1**:<br>
Features: Genres, Directors, Countries

In [90]:
# Reload the scaled table for the two-stage experiment, dropping the saved
# index column and the three award columns not used as features here.
ohe_df_scaled = pd.read_csv("../wth_actors_name/data/ohe_df_scaled.csv").drop(columns=["Unnamed: 0", "oscarsNom", "goldenGlobesWon", "goldenGlobesNom"])

In [278]:
# Title of the query movie; it must match a "primaryTitle" value exactly.
movie_name = "Iron Man"

In [279]:
# Look the title up in the full table: df_1 is only created in the next cell,
# so referencing it here raises NameError on a fresh kernel. df_1 is a column
# subset of ohe_df_scaled with the same rows and index, so the label is the same.
movie_index = get_movie_index(ohe_df_scaled, movie_name)

In [280]:
# First-stage frame: identifiers plus the genre, director and country columns.
df_1 = select_features(ohe_df_scaled, ["genres", "directors", "countries"])
# The model is fitted on the features only; the identifier columns stay in df_1.
X_1 = df_1.drop(columns=["tconst", "primaryTitle"])

In [281]:
# First-stage nearest-neighbour index with sklearn's default settings.
model_1 = NearestNeighbors()
model_1.fit(X_1)

NearestNeighbors()

In [282]:
# Ask for as many neighbours as there are rows, i.e. rank the whole table by
# distance to the query movie, and unwrap the single result row of each array.
distance, index = (
    result.tolist()[0]
    for result in model_1.kneighbors(X_1.loc[[movie_index]], X_1.shape[0])
)

# Cutoff is the distance at rank 100; every neighbour up to and including
# that distance is kept, so ties with the cutoff are retained.
last_dist = distance[100]
chosen_ones = []

for neighbour, dist in zip(index, distance):
    if dist > last_dist:
        break
    chosen_ones.append(neighbour)
    

In [283]:
# How many movies passed the first-stage distance cutoff (ties included).
len(chosen_ones)

115

In [284]:
# List exactly the movies kept by the cutoff, with their distances.
print_recommendations(df_1, model_1, X_1.loc[[movie_index]], len(chosen_ones))

tt0371746: Iron Man  :  0.0
tt1228705: Iron Man 2  :  1.0
tt1631867: Edge of Tomorrow  :  1.4142135623730951
tt0290334: X2: X-Men United  :  1.4142135623730951
tt1825683: Black Panther  :  1.7320508075688772
tt0102975: Star Trek VI: The Undiscovered Country  :  1.7320508075688772
tt0084726: Star Trek II: The Wrath of Khan  :  1.7320508075688772
tt0120201: Starship Troopers  :  1.7320508075688772
tt0848228: The Avengers  :  1.7320508075688772
tt3748528: Rogue One: A Star Wars Story  :  1.7320508075688772
tt1408101: Star Trek Into Darkness  :  1.7320508075688772
tt1392170: The Hunger Games  :  1.7320508075688772
tt1877832: X-Men: Days of Future Past  :  1.7320508075688772
tt0120903: X-Men  :  1.7320508075688772
tt0145487: Spider-Man  :  1.7320508075688772
tt0316654: Spider-Man 2  :  1.7320508075688772
tt0437086: Alita: Battle Angel  :  1.7320508075688772
tt0107290: Jurassic Park  :  1.7320508075688772
tt2395427: Avengers: Age of Ultron  :  1.7320508075688772
tt2250912: Spider-Man: Homeco

In [285]:
# Keep only the first-stage survivors, in ranked order. chosen_ones holds row
# positions returned by kneighbors, so they are selected with iloc in a single
# call; growing a frame with DataFrame.append in a loop is quadratic, and
# append no longer exists in pandas 2.x.
# reset_index() renumbers the rows 0..n-1 and keeps the previous labels in an
# "index" column.
first_filter = ohe_df_scaled.iloc[chosen_ones].reset_index()

**MODEL 2:**<br>
Features: Genres, Actors, Directors, Writers, numVotes, averageRating

In [286]:
# Peek at the first eight first-stage survivors.
first_filter.head(8)

Unnamed: 0,index,tconst,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,worldwideGross,oscarsWon,totalWins,...,language_turkish,language_ukrainian,language_ungwatsi,language_urdu,language_vietnamese,language_wolof,language_xhosa,language_yiddish,language_yoruba,language_zulu
0,81,tt0371746,Iron Man,0.76,0.290909,0.820513,0.400818,0.209377,0.0,0.074324,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,940,tt1228705,Iron Man 2,0.8,0.283636,0.705128,0.308308,0.223008,0.0,0.023649,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,857,tt0290334,X2: X-Men United,0.66,0.32,0.75641,0.217917,0.145725,0.0,0.02027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,77,tt1631867,Edge of Tomorrow,0.88,0.243636,0.820513,0.255819,0.13244,0.0,0.037162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,897,tt0102975,Star Trek VI: The Undiscovered Country,0.42,0.232727,0.730769,0.030027,0.03463,0.0,0.006757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,871,tt0316654,Spider-Man 2,0.68,0.294545,0.74359,0.231871,0.281999,0.090909,0.081081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,875,tt0145487,Spider-Man,0.64,0.272727,0.74359,0.29308,0.294883,0.0,0.054054,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,867,tt1825683,Black Panther,0.96,0.32,0.74359,0.267517,0.481663,0.272727,0.371622,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [287]:
# Second-stage frame, built from the first-stage survivors only.
id_columns = ["tconst", "primaryTitle"]
second_stage_features = [
    "tconst",
    "primaryTitle",
    "genres",
    "actors",
    "directors",
    "writers",
    "numVotes",
    "averageRating",
]
df_2 = select_features(first_filter, second_stage_features)
# The model is fitted on the features only; the identifiers stay in df_2.
X_2 = df_2.drop(columns=id_columns)

In [288]:
# first_filter was re-indexed, so the query movie's label has to be looked up
# again in the second-stage frame.
movie_index = get_movie_index(df_2, movie_name)

In [289]:
# Second-stage nearest-neighbour index with sklearn's default settings.
model_2 = NearestNeighbors()
model_2.fit(X_2)

NearestNeighbors()

In [290]:
# Rank every first-stage survivor by distance to the query movie and unwrap
# the single result row of each returned array into a plain list.
distance, index = (
    result.tolist()[0]
    for result in model_2.kneighbors(X_2.loc[[movie_index]], X_2.shape[0])
)

In [291]:
# Cutoff is the distance at rank 30; every neighbour up to and including that
# distance is kept, so ties with the cutoff are retained.
last_dist = distance[30]
chosen_ones = []

for neighbour, dist in zip(index, distance):
    if dist > last_dist:
        break
    chosen_ones.append(neighbour)
    

In [292]:
# How many movies passed the second-stage distance cutoff (ties included).
len(chosen_ones)

31

In [293]:
# Print every movie kept by the cutoff, as the Model 1 cell does, instead of a
# hard-coded 30 that silently drops the last of the chosen movies.
print_recommendations(df_2, model_2, X_2.loc[[movie_index]], len(chosen_ones))

tt0371746: Iron Man  :  0.0
tt1228705: Iron Man 2  :  4.361407066084062
tt1300854: Iron Man 3  :  5.100654902860778
tt0848228: The Avengers  :  5.293274316911743
tt2395427: Avengers: Age of Ultron  :  5.386352578044009
tt1825683: Black Panther  :  5.4793873825905814
tt1877832: X-Men: Days of Future Past  :  5.569049032575962
tt0145487: Spider-Man  :  5.569337885249515
tt0379786: Serenity  :  5.5748315385806215
tt2883512: Chef  :  5.5773064920927435
tt1375666: Inception  :  5.5897630743564415
tt0107290: Jurassic Park  :  5.656995776693403
tt2250912: Spider-Man: Homecoming  :  5.660117107571811
tt0116629: Independence Day  :  5.660771853633505
tt0119116: The Fifth Element  :  5.661029983379471
tt0093773: Predator  :  5.662066124644245
tt1677720: Ready Player One  :  5.662296955382175
tt6320628: Spider-Man: Far from Home  :  5.663113422336069
tt0120201: Starship Troopers  :  5.664797141736027
tt0111282: Stargate  :  5.667180930053253
tt0082340: Escape from New York  :  5.668185855818586
t

PROBLEM: Movies with little information can look more similar to each other than movies with a lot of information. A movie with many recorded features has more opportunities to differ from another movie, whereas every missing feature is filled with 0 and therefore matches any other movie that is also missing it.

A good example of this is: ["tt5317732", "The Little Prince"]

**For better recommendations, we need more movies in the database. In the case of "The Dark Knight", after 5 movies there are already only 2 things in common**