In [3]:
import pandas as pd
import json

# ------------------------ #
# 1. Load the Dataset
# ------------------------ #

df = pd.read_csv('../../data/cmu_tmdb.csv')
print("Initial DataFrame shape:", df.shape)

# ------------------------ #
# 2. Data Cleaning
# ------------------------ #

# 2.1 Keep only movies that have at least one vote and a positive average
# rating. Rows where either value is missing fail the comparison and are
# dropped as well.
df = df.loc[df['vote_count'].gt(0)]
print("\nAfter removing rows with vote_count = 0:", df.shape)
df = df.loc[df['vote_average'].gt(0)]
print("\nAfter removing rows with vote_average = 0:", df.shape)

# 2.2 Keep only movies with a strictly positive revenue
df = df.loc[df["revenue"].gt(0)]
print("After removing rows with revenue = 0:", df.shape)

# 2.3 Keep the first occurrence of every movie 'id'
df = df.drop_duplicates(subset=["id"], keep="first")
print("After removing duplicate movies:", df.shape)

# ------------------------ #
# 3. Select Relevant Columns
# ------------------------ #

relevant_cols = ["id", "title", "vote_average", "vote_count", "revenue", "budget"]
# Work on an independent copy so the column assignments below do not write
# into a slice of the full frame.
df = df.loc[:, relevant_cols].copy()
print("\nSelected relevant columns:", df.shape)

# ------------------------ #
# 4. Compute Additional Metrics
# ------------------------ #

# Profit = revenue - budget
df["profit"] = df["revenue"].sub(df["budget"])

# ROI divides by budget, so only rows with a strictly positive budget are kept
df = df.loc[df["budget"].gt(0)]
print("\nAfter removing movies with budget <= 0:", df.shape)

# ROI = profit / budget
df["ROI"] = df["profit"].div(df["budget"])

# ------------------------ #
# 5. Remove Outliers
# ------------------------ #

def remove_outliers_iqr(data, col):
    """Return the rows of *data* whose *col* value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``data[col]``. Both fences are
    inclusive. Rows whose value is missing never satisfy the range test and
    are therefore dropped. The input frame is not modified.
    """
    values = data[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence = 1.5 * (third_quartile - first_quartile)
    # `between` is inclusive on both ends by default.
    within_fences = values.between(first_quartile - fence, third_quartile + fence)
    return data[within_fences]


# Apply the IQR filter one column at a time. `df` is reassigned on every
# pass, so each column's quartiles are computed on the rows that survived
# the previous columns' filters.
columns_to_check = ["vote_average", "revenue", "ROI"]
for col in columns_to_check:
    before = len(df)
    df = remove_outliers_iqr(df, col)
    after = len(df)
    print(f"\nAfter removing outliers in '{col}': {before - after} rows removed, remaining {after} rows")

# ------------------------ #
# 6. Randomly Select Subset of Movies
# ------------------------ #

# Target sample size (e.g., 200 movies) and a fixed random state for
# reproducibility.
sample_size = 200
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the number of rows
# that survived cleaning. With at least `sample_size` rows available the
# draw is the same as an uncapped one.
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42)

# Report the number of rows actually drawn, which can be below the target.
print(f"\nRandomly selected {len(df_sample)} movies for analysis.")

# ------------------------ #
# 7. Save Processed Data as JSON
# ------------------------ #

# Convert sampled DataFrame to a list of per-movie dictionaries
processed_data = df_sample.to_dict(orient='records')

# Save to 'metrics.json'. The encoding is pinned so the file does not depend
# on the platform's default text encoding.
with open("../../docs/assets/data/metrics.json", "w", encoding="utf-8") as f:
    json.dump(processed_data, f, indent=2)

print("\nProcessing complete. 'metrics.json' has been created with the selected subset.")


Initial DataFrame shape: (49555, 28)

After removing rows with vote_count = 0: (42075, 28)

After removing rows with vote_average = 0: (42058, 28)
After removing rows with revenue = 0: (7482, 28)
After removing duplicate movies: (7452, 28)

Selected relevant columns: (7452, 6)

After removing movies with budget <= 0: (5661, 7)

After removing outliers in 'vote_average': 83 rows removed, remaining 5578 rows

After removing outliers in 'revenue': 609 rows removed, remaining 4969 rows

After removing outliers in 'ROI': 485 rows removed, remaining 4484 rows

Randomly selected 200 movies for analysis.

Processing complete. 'metrics.json' has been created with the selected subset.
