In [49]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from ast import literal_eval
import datetime
import time  # for timing
import joblib # for saving objects


TESTING = True
REGENERATE = False

if TESTING:
    dataset_name = 'netflix'
else:
    dataset_name = input("Enter dataset name: ")

# Search file with levinstein distance

dataset_files = []
for root, dirs, files in os.walk("data"):
    for file in files:
        if file.endswith(".csv"):
            dataset_files.append(file)


def calculate_levenshtein(str1, str2):
    replacementCost = [[]]

    for i in range(len(str1) + 1):
        replacementCost.append([])
        for j in range(len(str2) + 1):
            if i == 0:
                replacementCost[i].append(j)
            elif j == 0:
                replacementCost[i].append(i)
            elif str1[i - 1] == str2[j - 1]:
                replacementCost[i].append(replacementCost[i - 1][j - 1])
            else:
                replacementCost[i].append(1 + min(replacementCost[i - 1][j], replacementCost[i][j - 1],
                                                  replacementCost[i - 1][j - 1]))

    return replacementCost[len(str1)][len(str2)]


# Find the closest file
scores = []

for file in dataset_files:
    print(file)
    scores.append(calculate_levenshtein(dataset_name, file.split("_data")[0]))

closest_file_name = dataset_files[scores.index(min(scores))]
print("Closest file: " + closest_file_name)

# Load the dataset
df = pd.read_csv("data/" + closest_file_name)

df = df.drop(columns=["imdb_id"])

# print(df)
df["score_avg"] = (df["imdb_score"] + df["tmdb_score"]) / 2

hulu-tv_data.csv
hbo-max_data.csv
paramount-tv_data.csv
disney-+_data.csv
crunchyroll_data.csv
rakuten-viki-tv_data.csv
dark-matter-tv_data.csv
netflix_data.csv
amazon-prime_data.csv
Closest file: netflix_data.csv


In [50]:
class Progressbar:

    def __init__(self, maxValue, startTime=time.time()):
        self.startTime = startTime
        self.maxValue = maxValue

    def update(self, index):
        valueLength = len(str(self.maxValue))
        indexString = str(index)
        # while len(str(indexString)) < valueLength:
        #     indexString = " " + indexString

        indexString.rjust(valueLength - len(indexString))

        print("\rProgress: [{0:50s}] {1:.1f}%".format('#' * int((index + 1) * 50 / self.maxValue), (index + 1) * 100 / self.maxValue)
              + "    " + indexString + "/" + str(self.maxValue) + "    " +
              "Time left: {}".format(str(datetime.timedelta(seconds=(self.maxValue - (index + 1)) * (time.time() - self.startTime) / (index + 1)))), end="", flush=True)


In [51]:
unique_genres = []

for index, row in df.iterrows():
    row["genres"] = literal_eval(row["genres"])
    for genre in row["genres"]:
        if genre not in unique_genres:
            unique_genres.append(genre)

df["genres_index"] = df["genres"].apply(lambda x: [unique_genres.index(i) for i in literal_eval(x)])

In [52]:
unique_production_countries = []

for index, row in df.iterrows():
    row["production_countries"] = literal_eval(row["production_countries"])
    for production_country in row["production_countries"]:
        if production_country not in unique_production_countries:
            unique_production_countries.append(production_country)

df["production_countries_index"] = df["production_countries"].apply(
    lambda x: [unique_production_countries.index(i) for i in literal_eval(x)])


In [53]:
unique_types = []

for index, row in df.iterrows():
    if row["type"] not in unique_types:
        unique_types.append(row["type"])

df["type_index"] = df["type"].apply(lambda x: unique_types.index(x))

In [54]:
unique_age_certifications = []

for index, row in df.iterrows():
    if row["age_certification"] not in unique_age_certifications:
        unique_age_certifications.append(row["age_certification"])

df["age_certification_index"] = df["age_certification"].apply(
    lambda x: unique_age_certifications.index(x))

In [55]:
df_splitted = pd.DataFrame(columns=df.columns)
df_splitted.drop(columns=["genres", "production_countries", "type", "age_certification"], inplace=True)

progressBar = Progressbar(len(df))

save_name = "saved/splitted_data_" + \
    dataset_name.split(".")[0].split("_")[0] + ".df"

if REGENERATE or not os.path.exists(save_name):
    for index, row in df.iterrows():

        progressBar.update(index)

        for genre_index in row["genres_index"]:
            row_copy = row.copy()
            row_copy["genres_index"] = genre_index

            for production_country_index in row_copy["production_countries_index"]:
                row_copy2 = row_copy.copy()
                row_copy2["production_countries_index"] = production_country_index        

                # df_splitted = df_splitted.append(pd.DataFrame(data=row_copy2))
                # df_splitted = pd.concat([df_splitted, pd.DataFrame(data=row_copy2)])

                df_splitted.loc[len(df_splitted)] = row_copy2

            # df_splitted = pd.concat([df_splitted, pd.DataFrame(data=row_copy)])
            
    joblib.dump(df_splitted, save_name)
else:
    df_splitted = joblib.load(save_name)

df_splitted

Unnamed: 0,id,title,description,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,score_avg,genres_index,production_countries_index,type_index,age_certification_index
0,ts300399,Five Came Back: The Reference Films,This collection includes 12 World War II-era p...,1945,48,1.0,,,0.6,,,0,0,0,0
1,tm84618,Taxi Driver,A mentally unstable Vietnam War veteran works ...,1976,113,,8.3,795222.0,27.612,8.2,8.25,1,0,1,1
2,tm84618,Taxi Driver,A mentally unstable Vietnam War veteran works ...,1976,113,,8.3,795222.0,27.612,8.2,8.25,2,0,1,1
3,tm127384,Monty Python and the Holy Grail,"King Arthur, accompanied by his squire, recrui...",1975,91,,8.2,530877.0,18.216,7.8,8.0,3,1,1,2
4,tm127384,Monty Python and the Holy Grail,"King Arthur, accompanied by his squire, recrui...",1975,91,,8.2,530877.0,18.216,7.8,8.0,4,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16845,tm1014599,Fine Wine,A beautiful love story that can happen between...,2021,100,,6.9,39.0,0.966,,,2,44,1,6
16846,tm1045018,Clash,A man from Nigeria returns to his family in Ca...,2021,88,,6.5,32.0,0.709,,,11,44,1,6
16847,tm1045018,Clash,A man from Nigeria returns to his family in Ca...,2021,88,,6.5,32.0,0.709,,,11,9,1,6
16848,tm1045018,Clash,A man from Nigeria returns to his family in Ca...,2021,88,,6.5,32.0,0.709,,,2,44,1,6


In [56]:
# display the top 3 movies with the highest Imdb_score
df.sort_values(by=["imdb_score"], ascending=False, inplace=True)

# visualize the data
df.head(3)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,score_avg,genres_index,production_countries_index,type_index,age_certification_index
656,ts160526,Khawatir,SHOW,A TV show devoted to help young people to be m...,2005,TV-14,20,['reality'],[],11.0,9.6,3046.0,,,,[17],[],0,3
2927,ts265844,#ABtalks,SHOW,#ABtalks is a YouTube interview show hosted by...,2018,TV-PG,68,[],[],1.0,9.6,7.0,,,,[],[],0,7
243,ts4,Breaking Bad,SHOW,"When Walter White, a New Mexico chemistry teac...",2008,TV-MA,48,"['drama', 'thriller', 'crime']",['US'],5.0,9.5,1727694.0,337.419,8.8,9.15,"[2, 7, 1]",[0],0,0


In [59]:
df_splitted_noNull= df_splitted.dropna() 

# Split in train and test set
X = df_splitted_noNull.drop(columns=['id', 'score_avg', 'title', 'description'])
y = df_splitted_noNull['score_avg']
y = y.astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

print(df_splitted_noNull.columns)

# generate new movie example with data  to predict
new_movie = pd.DataFrame(columns=df_splitted_noNull.columns)





#newData = 

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

score = accuracy_score(y_test, predictions)
score


Index(['id', 'title', 'description', 'release_year', 'runtime', 'seasons',
       'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score',
       'score_avg', 'genres_index', 'production_countries_index', 'type_index',
       'age_certification_index'],
      dtype='object')


0.9888059701492538