# Saving the tropes data as JSON files to be used by the web app

### 1. Filtering tropes with a minimum number of votes

In [1]:
import pandas as pd
import json

from pathlib import Path

# Folder holding the raw CSV inputs, and folder receiving the generated files.
DATA_PATH = '../../data/'
OUTPUT_PATH = '../../data/preprocessed/'

# Rows are kept only when their vote_count is strictly greater than this.
MIN_VOTES = 100

# Trope catalogue: keep the descriptive columns and a single row per trope_id
# (the first occurrence wins).
df_tropes = (
    pd.read_csv(DATA_PATH + 'imdb_tropes.csv')
    .loc[:, ['trope_id', 'trope', 'description', 'example']]
    .drop_duplicates(subset='trope_id', keep='first')
)

df_cmu_tropes = pd.read_csv(DATA_PATH + 'cmu_tropes.csv')

# Discard rows whose rating or revenue is exactly 0, then apply the vote floor.
has_rating_and_revenue = (
    df_cmu_tropes["vote_average"].ne(0) & df_cmu_tropes["revenue"].ne(0)
)
has_enough_votes = df_cmu_tropes['vote_count'] > MIN_VOTES

# Surviving rows ordered by ascending rating, with a fresh 0..n-1 index.
df_tropes_filtered = (
    df_cmu_tropes[has_rating_and_revenue & has_enough_votes]
    .sort_values(by="vote_average")
    .reset_index(drop=True)
)

n_unique_tropes = df_tropes_filtered['trope'].nunique()
n_unique_movies = df_tropes_filtered['imdb_id'].nunique()
print(f"Number of unique tropes: {n_unique_tropes}")
print(f"Number of unique movies: {n_unique_movies}")
print(f"Shape of the filtered dataset: {df_tropes_filtered.shape}")

# Make sure the output folder exists before later cells write into it.
Path(OUTPUT_PATH).mkdir(parents=True, exist_ok=True)

Number of unique tropes: 23667
Number of unique movies: 2549
Shape of the filtered dataset: (203581, 11)


### 2. Getting genres

In [2]:
# Every distinct value of the `genres` column; each one is a comma-separated
# list of genre names.
unique_str_genres = df_tropes_filtered['genres'].unique()

# Start from the pseudo-genre "All" (the whole dataset), then add each
# individual genre name with surrounding whitespace removed.
unique_genres = {"All"}
for str_genres in unique_str_genres:
    unique_genres.update(name.strip() for name in str_genres.split(","))

print(f"{len(unique_genres)} unique genres: {unique_genres}")

19 unique genres: {'War', 'Drama', 'All', 'Science Fiction', 'Documentary', 'Fantasy', 'Romance', 'Comedy', 'Mystery', 'Family', 'Thriller', 'Animation', 'Adventure', 'Horror', 'Action', 'Crime', 'History', 'Music', 'Western'}


### 3. Top K tropes most associated with low-rated movies, for each genre and overall

In [3]:
# Rows with vote_average below THRESHOLD are "low-rated"; the rest are
# "high-rated".
THRESHOLD = 6.0
# Number of tropes exported per genre.
K = 5
# A trope must occur in at least this many low-rated rows to be ranked, so
# that tropes seen only a handful of times do not dominate the ratio.
MIN_LOW_COUNT = 5

df_low_rated_tropes = df_tropes_filtered[df_tropes_filtered['vote_average'] < THRESHOLD]
df_high_rated_tropes = df_tropes_filtered[df_tropes_filtered['vote_average'] >= THRESHOLD]

for genre in unique_genres:
    if genre == "All":
        # Pseudo-genre: rank over the whole dataset.
        df_lr_genre_tropes = df_low_rated_tropes
        df_hr_genre_tropes = df_high_rated_tropes
    else:
        # regex=False: match the genre name as literal text. By default
        # str.contains interprets its argument as a regular expression, which
        # would mis-select rows for a name containing a regex metacharacter.
        # NOTE: this is still a substring match on the comma-separated
        # `genres` string, so a genre whose name is contained in another
        # genre's name would match both.
        df_lr_genre_tropes = df_low_rated_tropes[
            df_low_rated_tropes['genres'].str.contains(genre, regex=False)
        ]
        df_hr_genre_tropes = df_high_rated_tropes[
            df_high_rated_tropes['genres'].str.contains(genre, regex=False)
        ]

    # Occurrences of each trope_id among low-rated / high-rated rows.
    low_rated_tropes = df_lr_genre_tropes.trope_id.value_counts().to_dict()
    high_rated_tropes = df_hr_genre_tropes.trope_id.value_counts().to_dict()

    # Low-to-high occurrence ratio; the +1 avoids dividing by zero for tropes
    # that never occur in high-rated rows.
    trope_ratios = {
        trope_id: low_count / (high_rated_tropes.get(trope_id, 0) + 1)
        for trope_id, low_count in low_rated_tropes.items()
        if low_count >= MIN_LOW_COUNT
    }

    # Genres without at least K rankable tropes are skipped (no file written).
    if len(trope_ratios) < K:
        continue

    # K highest ratios; ties keep the value_counts order (sorted is stable).
    sorted_tropes = sorted(trope_ratios.items(), key=lambda x: x[1], reverse=True)[:K]
    df = pd.DataFrame(sorted_tropes, columns=["trope_id", "ratio"])
    # Attach trope name/description/example. This is an inner join, so a
    # trope_id absent from df_tropes is dropped and the exported file may hold
    # fewer than K records.
    df = df.merge(df_tropes, on="trope_id", how="inner")
    # Empty placeholder column written to every record.
    df["img_url"] = ""
    df.to_json(f'{OUTPUT_PATH}{genre.lower()}_tropes.json', orient='records')
    print(f"Genre {genre} has {len(sorted_tropes)} tropes with a ratio of low-rated movies to high-rated movies")

Genre Drama has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre All has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Science Fiction has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Fantasy has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Romance has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Comedy has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Mystery has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Family has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Thriller has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Adventure has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Horror has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Action has 5 tropes with a ratio of low-rated movies to high-rated movies
Genre Crime has 5 tropes wi