In [100]:
import json
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [101]:
model_pred_df = pd.read_csv('prediction_model_01.csv')

In [102]:
data = []
with open('imdb_movies_2000to2022.prolific.json', 'r') as file:
    for line in file:
        data.append(json.loads(line))
df = pd.DataFrame(data)

In [103]:
df.replace({'\\N':np.nan}, inplace = True)
def get_avg_rating(x):
    if isinstance(x, dict):
        return x.get('avg', np.nan)
    return np.nan
df['rating.avg'] = df['rating'].apply(get_avg_rating)

In [104]:
df['rating.avg'].fillna(df['rating.avg'].mean(), inplace = True)
df['runtime'] = pd.to_numeric(df['runtime'])
df['runtime'].fillna(df['runtime'].median(), inplace = True)
df['year'] = pd.to_numeric(df['year'])
df['year'].fillna(df['year'].median(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating.avg'].fillna(df['rating.avg'].mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['runtime'].fillna(df['runtime'].median(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec

In [105]:
genre_of_interest = ['Drama', 'Comedy', 'Horror']

genre_true_counts = {genre: 0 for genre in genre_of_interest}
genre_tp_counts = {genre: 0 for genre in genre_of_interest}
genre_fp_counts = {genre: 0 for genre in genre_of_interest}

for idx, row in model_pred_df.iterrows():
    actual_genres = row['actual genres']

    for true_g in genre_of_interest:
        if true_g in actual_genres:
            genre_true_counts[true_g] += 1
    pred_g = row['predicted']
    correct = row['correct?']

    for genre in genre_of_interest:
        if pred_g == genre:
            if correct == 1:
                genre_tp_counts[genre] += 1
            else:
                genre_fp_counts[genre] += 1

precision = {}
recall = {}
f1_score = {}
accuracy = 0
correct_predictions = 0

for genre in genre_of_interest:
    tp = genre_tp_counts[genre]
    fp = genre_fp_counts[genre]
    fn = genre_true_counts[genre] - tp

    precision[genre] = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall[genre] = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score[genre] = 2 * (precision[genre] * recall[genre]) / (precision[genre] + recall[genre]) if (precision[genre] + recall[genre]) > 0 else 0

    correct_predictions += tp
    accuracy = correct_predictions / len(model_pred_df)

print("for Model 1")
print(f"Accuracy: {accuracy: .4f}")
for genre in genre_of_interest:
    print(f"{genre} - Precision: {precision[genre]: .4f}, Recall: {recall[genre]: .4f}, F1: {f1_score[genre]: .4f}")

for Model 1
Accuracy:  0.0393
Drama - Precision:  0.5000, Recall:  0.0440, F1:  0.0809
Comedy - Precision:  0.3043, Recall:  0.0365, F1:  0.0651
Horror - Precision:  0.1446, Recall:  0.0380, F1:  0.0602


In [106]:
model_pred_df = pd.read_csv('prediction_model_02.csv')

In [107]:
genre_of_interest = ['Drama', 'Comedy', 'Horror']

genre_true_counts = {genre: 0 for genre in genre_of_interest}
genre_tp_counts = {genre: 0 for genre in genre_of_interest}
genre_fp_counts = {genre: 0 for genre in genre_of_interest}

for idx, row in model_pred_df.iterrows():
    actual_genres = row['actual genres']

    for true_g in genre_of_interest:
        if true_g in actual_genres:
            genre_true_counts[true_g] += 1
    pred_g = row['predicted']
    correct = row['correct?']

    for genre in genre_of_interest:
        if pred_g == genre:
            if correct == 1:
                genre_tp_counts[genre] += 1
            else:
                genre_fp_counts[genre] += 1

precision = {}
recall = {}
f1_score = {}
accuracy = 0
correct_predictions = 0

for genre in genre_of_interest:
    tp = genre_tp_counts[genre]
    fp = genre_fp_counts[genre]
    fn = genre_true_counts[genre] - tp

    precision[genre] = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall[genre] = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score[genre] = 2 * (precision[genre] * recall[genre]) / (precision[genre] + recall[genre]) if (precision[genre] + recall[genre]) > 0 else 0

    correct_predictions += tp
    accuracy = correct_predictions / len(model_pred_df)

print("for Model 2")
print(f"Accuracy: {accuracy: .4f}")
for genre in genre_of_interest:
    print(f"{genre} - Precision: {precision[genre]: .4f}, Recall: {recall[genre]: .4f}, F1: {f1_score[genre]: .4f}")

for Model 2
Accuracy:  0.4727
Drama - Precision:  0.6219, Recall:  0.6681, F1:  0.6442
Comedy - Precision:  0.5273, Recall:  0.2799, F1:  0.3657
Horror - Precision:  0.5988, Recall:  0.3180, F1:  0.4154


In [108]:
model_pred_df = pd.read_csv('prediction_model_03.csv')

In [109]:
genre_of_interest = ['Drama', 'Comedy', 'Horror']

genre_true_counts = {genre: 0 for genre in genre_of_interest}
genre_tp_counts = {genre: 0 for genre in genre_of_interest}
genre_fp_counts = {genre: 0 for genre in genre_of_interest}

for idx, row in model_pred_df.iterrows():
    actual_genres = row['actual genres']

    for true_g in genre_of_interest:
        if true_g in actual_genres:
            genre_true_counts[true_g] += 1
    pred_g = row['predicted']
    correct = row['correct?']

    for genre in genre_of_interest:
        if pred_g == genre:
            if correct == 1:
                genre_tp_counts[genre] += 1
            else:
                genre_fp_counts[genre] += 1

precision = {}
recall = {}
f1_score = {}
accuracy = 0
correct_predictions = 0

for genre in genre_of_interest:
    tp = genre_tp_counts[genre]
    fp = genre_fp_counts[genre]
    fn = genre_true_counts[genre] - tp

    precision[genre] = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall[genre] = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score[genre] = 2 * (precision[genre] * recall[genre]) / (precision[genre] + recall[genre]) if (precision[genre] + recall[genre]) > 0 else 0

    correct_predictions += tp
    accuracy = correct_predictions / len(model_pred_df)

print("for Model 3")
print(f"Accuracy: {accuracy: .4f}")
for genre in genre_of_interest:
    print(f"{genre} - Precision: {precision[genre]: .4f}, Recall: {recall[genre]: .4f}, F1: {f1_score[genre]: .4f}")

for Model 3
Accuracy:  0.4981
Drama - Precision:  0.4981, Recall:  1.0000, F1:  0.6650
Comedy - Precision:  0.0000, Recall:  0.0000, F1:  0.0000
Horror - Precision:  0.0000, Recall:  0.0000, F1:  0.0000


In [110]:
import json
from collections import Counter

movies = []
with open('imdb_movies_2000to2022.prolific.json', 'r') as f:
    for line in f:
        movies.append(json.loads(line))

def jaccard_similarity(actors1, actors2):
    intersection = actors1.intersection(actors2)
    union = actors1.union(actors2)
    return len(intersection)/len(union)

def predict_genre(target_movie_id, k):
    for movie in movies:
        if movie['imdb_id'] == target_movie_id:
            target_movie = movie
            break

    target_actors = {actor[0] for actor in target_movie['actors']}

    similarities = []

    for movie in movies:
        if movie['imdb_id'] != target_movie_id:
            movie_actors = {actor[0] for actor in movie['actors']}
            similarity = jaccard_similarity(target_actors, movie_actors)
            similarities.append((similarity, movie['genres']))
    similarities.sort(reverse = True) # sort in descending order
    top_k_genres = []
    for similarity, genres in similarities[:k]: # slicing is from 0 to k b/c only need nearest k values
        for genre in genres:
            top_k_genres.append(genre)

    genre_counts = Counter(top_k_genres)
    max_count = max(genre_counts.values())
    print(genre_counts)
    most_common_genres = []
    for genre, count in genre_counts.items():
        if count == max_count:
            most_common_genres.append(genre)
    print(most_common_genres)
    return most_common_genres

In [111]:
for k in range(1,10):
    print(k)
    train_pred = predict_genre('tt0317705', k)
    for movie in movies:
        if movie['imdb_id'] == 'tt0317705':
            actual_genres={genre for genre in movie['genres']}
            print(actual_genres)
    
    for predic in train_pred:
        if predic in actual_genres:
            print(True)

1
Counter({'Action': 1, 'Adventure': 1, 'Animation': 1})
['Action', 'Adventure', 'Animation']
{'Adventure', 'Animation', 'Action'}
True
True
True
2
Counter({'Action': 1, 'Adventure': 1, 'Animation': 1, 'Documentary': 1, 'History': 1})
['Action', 'Adventure', 'Animation', 'Documentary', 'History']
{'Adventure', 'Animation', 'Action'}
True
True
True
3
Counter({'Documentary': 2, 'Action': 1, 'Adventure': 1, 'Animation': 1, 'History': 1})
['Documentary']
{'Adventure', 'Animation', 'Action'}
4
Counter({'Documentary': 3, 'Action': 1, 'Adventure': 1, 'Animation': 1, 'History': 1, 'Comedy': 1, 'Music': 1})
['Documentary']
{'Adventure', 'Animation', 'Action'}
5
Counter({'Documentary': 4, 'Adventure': 2, 'History': 2, 'Action': 1, 'Animation': 1, 'Comedy': 1, 'Music': 1})
['Documentary']
{'Adventure', 'Animation', 'Action'}
6
Counter({'Documentary': 5, 'Adventure': 3, 'History': 2, 'Action': 1, 'Animation': 1, 'Comedy': 1, 'Music': 1})
['Documentary']
{'Adventure', 'Animation', 'Action'}
7
Count

In [112]:
for k in range(1,10):
    print(k)
    train_pred = predict_genre('tt0118926', k)
    for movie in movies:
        if movie['imdb_id'] == 'tt0118926':
            actual_genres={genre for genre in movie['genres']}
            print(actual_genres)
    
    for predic in train_pred:
        if predic in actual_genres:
            print(True)

1
Counter({'Drama': 1, 'Romance': 1, 'War': 1})
['Drama', 'Romance', 'War']
{'Crime', 'Drama', 'Thriller'}
True
2
Counter({'Drama': 2, 'Romance': 2, 'War': 1})
['Drama', 'Romance']
{'Crime', 'Drama', 'Thriller'}
True
3
Counter({'Drama': 3, 'Romance': 3, 'War': 1})
['Drama', 'Romance']
{'Crime', 'Drama', 'Thriller'}
True
4
Counter({'Drama': 4, 'Romance': 4, 'War': 1})
['Drama', 'Romance']
{'Crime', 'Drama', 'Thriller'}
True
5
Counter({'Drama': 5, 'Romance': 5, 'War': 1})
['Drama', 'Romance']
{'Crime', 'Drama', 'Thriller'}
True
6
Counter({'Drama': 6, 'Romance': 5, 'War': 1, 'Horror': 1, 'Mystery': 1})
['Drama']
{'Crime', 'Drama', 'Thriller'}
True
7
Counter({'Drama': 7, 'Romance': 5, 'War': 1, 'Horror': 1, 'Mystery': 1})
['Drama']
{'Crime', 'Drama', 'Thriller'}
True
8
Counter({'Drama': 8, 'Romance': 5, 'War': 1, 'Horror': 1, 'Mystery': 1, 'Crime': 1, 'Thriller': 1})
['Drama']
{'Crime', 'Drama', 'Thriller'}
True
9
Counter({'Drama': 9, 'Romance': 5, 'Crime': 2, 'Thriller': 2, 'War': 1, 'Hor

In [117]:
for k in range(1,10):
    print(k)
    train_pred = predict_genre('tt0119273', k)
    for movie in movies:
        if movie['imdb_id'] == 'tt0119273':
            actual_genres={genre for genre in movie['genres']}
            print(actual_genres)
    
    for predic in train_pred:
        if predic in actual_genres:
            print(True)

1
Counter({'Thriller': 1})
['Thriller']
{'Adventure', 'Animation', 'Action'}
2
Counter({'Thriller': 2})
['Thriller']
{'Adventure', 'Animation', 'Action'}
3
Counter({'Thriller': 3})
['Thriller']
{'Adventure', 'Animation', 'Action'}
4
Counter({'Thriller': 3, 'Sci-Fi': 1})
['Thriller']
{'Adventure', 'Animation', 'Action'}
5
Counter({'Thriller': 4, 'Sci-Fi': 1, 'Horror': 1})
['Thriller']
{'Adventure', 'Animation', 'Action'}
6
Counter({'Thriller': 4, 'Horror': 2, 'Sci-Fi': 1, 'Fantasy': 1, 'Mystery': 1})
['Thriller']
{'Adventure', 'Animation', 'Action'}
7
Counter({'Thriller': 4, 'Horror': 3, 'Fantasy': 2, 'Sci-Fi': 1, 'Mystery': 1})
['Thriller']
{'Adventure', 'Animation', 'Action'}
8
Counter({'Thriller': 5, 'Horror': 3, 'Fantasy': 2, 'Sci-Fi': 1, 'Mystery': 1, 'Drama': 1})
['Thriller']
{'Adventure', 'Animation', 'Action'}
9
Counter({'Thriller': 6, 'Horror': 3, 'Fantasy': 2, 'Drama': 2, 'Sci-Fi': 1, 'Mystery': 1})
['Thriller']
{'Adventure', 'Animation', 'Action'}


In [118]:
for k in range(1,10):
    print(k)
    train_pred = predict_genre('tt0120667', k)
    for movie in movies:
        if movie['imdb_id'] == 'tt0120667':
            actual_genres={genre for genre in movie['genres']}
            print(actual_genres)
    
    for predic in train_pred:
        if predic in actual_genres:
            print(True)

1
Counter({'Documentary': 1})
['Documentary']
{'Fantasy', 'Adventure', 'Action'}
2
Counter({'Documentary': 2})
['Documentary']
{'Fantasy', 'Adventure', 'Action'}
3
Counter({'Documentary': 2, 'Drama': 1, 'Music': 1, 'Romance': 1})
['Documentary']
{'Fantasy', 'Adventure', 'Action'}
4
Counter({'Documentary': 2, 'Drama': 1, 'Music': 1, 'Romance': 1, 'Sci-Fi': 1, 'Thriller': 1})
['Documentary']
{'Fantasy', 'Adventure', 'Action'}
5
Counter({'Documentary': 2, 'Thriller': 2, 'Drama': 1, 'Music': 1, 'Romance': 1, 'Sci-Fi': 1, 'Horror': 1})
['Documentary', 'Thriller']
{'Fantasy', 'Adventure', 'Action'}
6
Counter({'Thriller': 3, 'Documentary': 2, 'Sci-Fi': 2, 'Horror': 2, 'Drama': 1, 'Music': 1, 'Romance': 1})
['Thriller']
{'Fantasy', 'Adventure', 'Action'}
7
Counter({'Thriller': 4, 'Horror': 3, 'Documentary': 2, 'Sci-Fi': 2, 'Drama': 1, 'Music': 1, 'Romance': 1, 'Mystery': 1})
['Thriller']
{'Fantasy', 'Adventure', 'Action'}
8
Counter({'Thriller': 4, 'Horror': 4, 'Documentary': 2, 'Sci-Fi': 2, 'M

In [119]:
for k in range(1,10):
    print(k)
    train_pred = predict_genre('tt0035423', k)
    for movie in movies:
        if movie['imdb_id'] == 'tt0035423':
            actual_genres={genre for genre in movie['genres']}
            print(actual_genres)
    
    for predic in train_pred:
        if predic in actual_genres:
            print(True)

1
Counter({'Action': 1, 'Sci-Fi': 1})
['Action', 'Sci-Fi']
{'Romance', 'Comedy', 'Fantasy'}
2
Counter({'Action': 1, 'Sci-Fi': 1, 'Documentary': 1, 'Sport': 1})
['Action', 'Sci-Fi', 'Documentary', 'Sport']
{'Romance', 'Comedy', 'Fantasy'}
3
Counter({'Documentary': 2, 'Action': 1, 'Sci-Fi': 1, 'Sport': 1, 'History': 1, 'News': 1})
['Documentary']
{'Romance', 'Comedy', 'Fantasy'}
4
Counter({'Documentary': 3, 'Action': 1, 'Sci-Fi': 1, 'Sport': 1, 'History': 1, 'News': 1})
['Documentary']
{'Romance', 'Comedy', 'Fantasy'}
5
Counter({'Documentary': 4, 'Action': 1, 'Sci-Fi': 1, 'Sport': 1, 'History': 1, 'News': 1})
['Documentary']
{'Romance', 'Comedy', 'Fantasy'}
6
Counter({'Documentary': 5, 'Sport': 2, 'Action': 1, 'Sci-Fi': 1, 'History': 1, 'News': 1, 'Biography': 1})
['Documentary']
{'Romance', 'Comedy', 'Fantasy'}
7
Counter({'Documentary': 6, 'Sport': 2, 'Biography': 2, 'Action': 1, 'Sci-Fi': 1, 'History': 1, 'News': 1, 'Animation': 1})
['Documentary']
{'Romance', 'Comedy', 'Fantasy'}
8
Co

In [120]:
for k in range(1,10):
    print(k)
    train_pred = predict_genre('tt0122459', k)
    for movie in movies:
        if movie['imdb_id'] == 'tt0122459':
            actual_genres={genre for genre in movie['genres']}
            print(actual_genres)
    
    for predic in train_pred:
        if predic in actual_genres:
            print(True)

1
Counter({'Documentary': 1, 'Sport': 1})
['Documentary', 'Sport']
{'Romance', 'Comedy', 'Drama'}
2
Counter({'Documentary': 2, 'Sport': 1, 'Family': 1})
['Documentary']
{'Romance', 'Comedy', 'Drama'}
3
Counter({'Documentary': 2, 'Sport': 1, 'Family': 1, 'Horror': 1, 'Thriller': 1})
['Documentary']
{'Romance', 'Comedy', 'Drama'}
4
Counter({'Documentary': 2, 'Horror': 2, 'Sport': 1, 'Family': 1, 'Thriller': 1})
['Documentary', 'Horror']
{'Romance', 'Comedy', 'Drama'}
5
Counter({'Documentary': 2, 'Family': 2, 'Horror': 2, 'Sport': 1, 'Thriller': 1, 'Music': 1, 'Romance': 1})
['Documentary', 'Family', 'Horror']
{'Romance', 'Comedy', 'Drama'}
6
Counter({'Documentary': 2, 'Family': 2, 'Horror': 2, 'Thriller': 2, 'Sport': 1, 'Music': 1, 'Romance': 1, 'Drama': 1, 'Mystery': 1})
['Documentary', 'Family', 'Horror', 'Thriller']
{'Romance', 'Comedy', 'Drama'}
7
Counter({'Thriller': 3, 'Documentary': 2, 'Family': 2, 'Horror': 2, 'Drama': 2, 'Mystery': 2, 'Sport': 1, 'Music': 1, 'Romance': 1})
['Thr

In [113]:
mlb = MultiLabelBinarizer()
genre_labels = mlb.fit_transform(df['genres'])
genres_df = pd.DataFrame(genre_labels, columns = mlb.classes_)

In [114]:
x = df[['runtime', 'year', 'rating.avg']]
y = genres_df

In [115]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [116]:
for depth in range(1, 20):
    clf = DecisionTreeClassifier(max_depth = depth)
    clf.fit(x_train, y_train)
    test_pred = clf.predict(x_test)
    test_correct = []
    for i in range(len(test_pred)):
        test = test_pred[i]
        actual = y_test.values[i]
        if any(test[j] == 1 and actual[j] == 1 for j in range(len(test))):
            test_correct.append(True)
        else:
            test_correct.append(False)
    test_accuracy = np.mean(test_correct)
    print(f'max_depth = {depth} | test_accuracy: {test_accuracy: .2f}')

max_depth = 1 | test_accuracy:  0.41
max_depth = 2 | test_accuracy:  0.24
max_depth = 3 | test_accuracy:  0.27
max_depth = 4 | test_accuracy:  0.39
max_depth = 5 | test_accuracy:  0.38
max_depth = 6 | test_accuracy:  0.34
max_depth = 7 | test_accuracy:  0.35
max_depth = 8 | test_accuracy:  0.35
max_depth = 9 | test_accuracy:  0.37
max_depth = 10 | test_accuracy:  0.38
max_depth = 11 | test_accuracy:  0.38
max_depth = 12 | test_accuracy:  0.39
max_depth = 13 | test_accuracy:  0.39
max_depth = 14 | test_accuracy:  0.40
max_depth = 15 | test_accuracy:  0.41
max_depth = 16 | test_accuracy:  0.42
max_depth = 17 | test_accuracy:  0.44
max_depth = 18 | test_accuracy:  0.44
max_depth = 19 | test_accuracy:  0.44
