In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from ast import literal_eval
import datetime
import time  # for timing
import joblib # for saving objects
import random # for random number generation


# TESTING skips the interactive prompt and uses a fixed dataset name.
# REGENERATE forces cached intermediate results to be rebuilt.
TESTING = False
REGENERATE = False

if TESTING:
    dataset_name = 'netflix'
else:
    dataset_name = input("Enter dataset name: ")

# Collect the candidate dataset files; the one whose name is closest to
# `dataset_name` (Levenshtein distance) is selected further down.
#
# Only the top level of "data" is scanned: the chosen file is opened as
# "data/<file name>", so a CSV sitting in a sub-directory could win the
# comparison but could never be loaded. Sorting makes the candidate order
# (and therefore tie-breaking between equally close names) deterministic.
dataset_files = []
if os.path.isdir("data"):
    dataset_files = sorted(
        entry for entry in os.listdir("data")
        if entry.endswith(".csv") and os.path.isfile(os.path.join("data", entry))
    )


def calculate_levenshtein(str1, str2):
    """Return the Levenshtein (edit) distance between ``str1`` and ``str2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``str1`` into ``str2``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The edit distance as a non-negative integer.
    """
    # Row 0: turning the empty prefix of str1 into str2[:j] takes j insertions.
    table = [list(range(len(str2) + 1))]

    for row_number, left_char in enumerate(str1, start=1):
        above = table[-1]
        # Column 0: turning str1[:row_number] into "" takes row_number deletions.
        current = [row_number]
        for col_number, right_char in enumerate(str2, start=1):
            if left_char == right_char:
                # Matching characters cost nothing extra.
                cost = above[col_number - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                cost = 1 + min(above[col_number],
                               current[col_number - 1],
                               above[col_number - 1])
            current.append(cost)
        table.append(current)

    return table[-1][-1]


# Find the closest file
# Fail early with an explicit message: without any candidate, min() below
# would raise an unhelpful "min() arg is an empty sequence".
if not dataset_files:
    raise FileNotFoundError('No ".csv" dataset files were found in the "data" directory')

scores = []

for file in dataset_files:
    print(file)
    # Compare against the part of the file name that precedes "_data".
    scores.append(calculate_levenshtein(dataset_name, file.split("_data")[0]))

# The first file with the smallest distance wins.
closest_file_name = dataset_files[scores.index(min(scores))]
print("Closest file: " + closest_file_name)

# Load the dataset
df = pd.read_csv(os.path.join("data", closest_file_name))

df = df.drop(columns=["imdb_id"])

# Mean of the two ratings; NaN whenever either rating is missing.
df["score_avg"] = (df["imdb_score"] + df["tmdb_score"]) / 2

In [None]:
class Progressbar:
    """Single-line console progress bar with a remaining-time estimate.

    Args:
        maxValue: Total number of steps the bar represents.
        startTime: Timestamp (seconds, as returned by ``time.time()``) from
            which elapsed time is measured. Defaults to the moment the
            instance is created.
    """

    def __init__(self, maxValue, startTime=None):
        # The default must be resolved here, per instance. A `time.time()`
        # default in the signature is evaluated only once, when the class is
        # defined, which makes every time-left estimate count from then.
        self.startTime = time.time() if startTime is None else startTime
        self.maxValue = maxValue

    def update(self, index):
        """Redraw the bar for the zero-based step ``index``.

        The bar, the percentage and the time estimate treat ``index + 1``
        steps as completed; the counter shows ``index`` itself, right-aligned
        to the width of ``maxValue``. The line starts with a carriage return
        and has no trailing newline, so consecutive calls overwrite each other.
        """
        completed = index + 1
        valueLength = len(str(self.maxValue))
        # str.rjust returns a new string and takes the total target width.
        indexString = str(index).rjust(valueLength)

        bar = '#' * int(completed * 50 / self.maxValue)
        percentage = completed * 100 / self.maxValue

        # Linear extrapolation: average time per completed step so far,
        # multiplied by the number of steps still to go.
        elapsed = time.time() - self.startTime
        timeLeft = datetime.timedelta(seconds=(self.maxValue - completed) * elapsed / completed)

        print("\rProgress: [{0:50s}] {1:.1f}%".format(bar, percentage)
              + "    " + indexString + "/" + str(self.maxValue) + "    " +
              "Time left: {}".format(str(timeLeft)), end="", flush=True)


In [None]:
# The "genres" column stores each list as its string representation; parse
# every cell exactly once and reuse the parsed lists below.
parsed_genres = df["genres"].apply(literal_eval)

# All distinct genres in order of first appearance (dict keys keep insertion
# order and de-duplicate in one pass).
unique_genres = list(dict.fromkeys(
    genre for genres in parsed_genres for genre in genres))

# Position of each genre inside unique_genres, for constant-time encoding.
genre_positions = {genre: position for position, genre in enumerate(unique_genres)}

# Each title gets the list of integer codes of its genres.
df["genres_index"] = parsed_genres.apply(
    lambda genres: [genre_positions[genre] for genre in genres])

print(unique_genres)


In [None]:
# The "production_countries" column stores each list as its string
# representation; parse every cell exactly once and reuse the parsed lists.
parsed_production_countries = df["production_countries"].apply(literal_eval)

# All distinct countries in order of first appearance (dict keys keep
# insertion order and de-duplicate in one pass).
unique_production_countries = list(dict.fromkeys(
    country for countries in parsed_production_countries for country in countries))

# Position of each country inside unique_production_countries, for
# constant-time encoding.
production_country_positions = {
    country: position for position, country in enumerate(unique_production_countries)}

# Each title gets the list of integer codes of its production countries.
df["production_countries_index"] = parsed_production_countries.apply(
    lambda countries: [production_country_positions[country] for country in countries])

print(unique_production_countries)


In [None]:
# Distinct values of the "type" column in order of first appearance
# (dict keys keep insertion order and de-duplicate in one pass).
unique_types = list(dict.fromkeys(df["type"]))

# Position of each type inside unique_types, for constant-time encoding.
type_positions = {type_name: position for position, type_name in enumerate(unique_types)}

# Integer code of every title's type.
df["type_index"] = df["type"].apply(lambda x: type_positions[x])

print(unique_types)


In [None]:
# Distinct values of the "age_certification" column in order of first
# appearance (dict keys keep insertion order and de-duplicate in one pass).
# NOTE(review): missing values are not filtered out here, so a missing
# certification presumably ends up as a category of its own - confirm that
# this is intended.
unique_age_certifications = list(dict.fromkeys(df["age_certification"]))

# Position of each certification inside unique_age_certifications, for
# constant-time encoding.
age_certification_positions = {
    certification: position
    for position, certification in enumerate(unique_age_certifications)}

# Integer code of every title's age certification.
df["age_certification_index"] = df["age_certification"].apply(
    lambda x: age_certification_positions[x])

print(unique_age_certifications)

In [None]:
# Expand every title into one row per (genre, production country) pair, so
# that "genres_index" and "production_countries_index" hold a single code
# instead of a list. The raw columns that were already encoded are left out.
excluded_columns = ["genres", "production_countries", "type", "age_certification"]
splitted_columns = [column for column in df.columns if column not in excluded_columns]

# Pass the start time explicitly so the estimate is measured from this cell.
progressBar = Progressbar(len(df), startTime=time.time())

save_name = "saved/splitted_data_" + \
    closest_file_name.split(".")[0].split("_")[0] + ".df"

if REGENERATE or not os.path.exists(save_name):
    # Create the cache directory up front: discovering that it is missing
    # only when dumping would throw away the whole (slow) computation.
    os.makedirs(os.path.dirname(save_name), exist_ok=True)

    # Collect plain records and build the frame once at the end; appending to
    # a DataFrame row by row re-allocates it on every insertion.
    records = []
    # Drive the progress bar with the row position rather than the index
    # label, so it stays correct even if `df` has been re-ordered.
    for position, (_, row) in enumerate(df.iterrows()):

        progressBar.update(position)

        base_record = row[splitted_columns].to_dict()

        for genre_index in row["genres_index"]:
            for production_country_index in row["production_countries_index"]:
                records.append({
                    **base_record,
                    "genres_index": genre_index,
                    "production_countries_index": production_country_index,
                })

    # Column dtypes are inferred by pandas from the collected values.
    df_splitted = pd.DataFrame(records, columns=splitted_columns)

    joblib.dump(df_splitted, save_name)
else:
    # joblib.load unpickles the file, which can execute arbitrary code:
    # only load cache files that were written by this notebook.
    # The cache is keyed by dataset name only; set REGENERATE = True after
    # changing the data or the encoding cells.
    df_splitted = joblib.load(save_name)

df_splitted

In [None]:
# Display the three titles with the highest IMDb score.
# The sorted result is a separate frame: sorting `df` in place would
# permanently re-order it, so re-running earlier cells would then see
# different data than on a fresh kernel.
top_imdb_titles = df.sort_values(by=["imdb_score"], ascending=False).head(3)

top_imdb_titles

## Scores by Type

In [None]:
# IMDb score against TMDB score, one point per title, coloured by type code.
sns.scatterplot(data=df, x="imdb_score", y="tmdb_score", hue="type_index")

# Legend: which type each integer code in the plot stands for.
for type_code in range(len(unique_types)):
    print(f"[{type_code}] => {unique_types[type_code]}")

## Prediction

In [None]:
# Fixed seed so that the train/test split and the fitted tree - and with them
# every number printed below - are the same on each run.
RANDOM_STATE = 42

# NOTE(review): dropna() removes rows with a null in ANY column, including
# columns that are not used as features (title, description) - confirm that
# this is intended.
df_splitted_noNull = df_splitted.dropna()

# Split in train and test set
# imdb_score and tmdb_score are excluded from the features: score_avg is
# their mean, so keeping them would leak the target into the model (and the
# example title below would simply inherit the scores of the row it is copied
# from).
# NOTE(review): any other column that is only known after release should
# probably be excluded as well - verify against the dataset schema.
X = df_splitted_noNull.drop(
    columns=['id', 'score_avg', 'title', 'description', 'imdb_score', 'tmdb_score'])

# The classifier needs discrete labels: use the score in hundredths.
# Round before casting, because the float product can land just below the
# intended integer and a bare int cast truncates.
y = (df_splitted_noNull['score_avg'].astype('float') * 100).round().astype('int')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=RANDOM_STATE)

print(df_splitted_noNull[:1])

# generate new movie example with data to predict
# (copying one row guarantees the same columns, in the same order, as X)
new_movie = X[:1].copy()

# generate new dummy data
new_movie["genres_index"] = 8
new_movie["production_countries_index"] = 0
new_movie["type_index"] = 1
new_movie["age_certification_index"] = 2
new_movie["release_year"] = 2004
new_movie["runtime"] = 120
new_movie["seasons"] = 1

model = DecisionTreeClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)



### Accuracy

In [None]:
# Run the fitted model on the example title and on the held-out test set.
predictions = model.predict(new_movie)
test_predictions = model.predict(X_test)

# Share of test rows whose predicted label equals the true label exactly.
score = accuracy_score(y_true=y_test, y_pred=test_predictions)

# Report: the example's features, its predicted score (labels are the score
# multiplied by 100, hence the division), then the test accuracy.
print(new_movie)
print(predictions / 100)
print(score)


In [None]:
# Distribution of the average score for every genre code, as a box plot.
sns.boxplot(data=df_splitted_noNull, x="genres_index", y="score_avg")
plt.gcf().set_size_inches(30, 10)
plt.show()

# Legend: which genre each integer code on the x axis stands for.
for genre_code, genre_name in enumerate(unique_genres):
    print(str(genre_code) + " -> " + genre_name)


In [None]:
# Distribution of the average score for every production-country code, as a
# box plot. All countries are shown; no limit is applied to the x axis.
sns.boxplot(data=df_splitted_noNull, x="production_countries_index",
            y="score_avg", palette="Set1")

# Enlarge the figure so the many categories remain readable.
plt.gcf().set_size_inches(30, 10)
plt.show()

# Legend: which country each integer code on the x axis stands for.
for country_code, country_name in enumerate(unique_production_countries):
    print(str(country_code) + " -> " + country_name)
