In [5]:
import pandas as pd
import json

# ------------------------ #
# 1. Load the Dataset
# ------------------------ #

# Read the movie dataset from CSV; adjust the relative path below if the
# file is stored somewhere else.
df = pd.read_csv(filepath_or_buffer='../../data/cmu_tmdb.csv')

# Sanity check: report the table dimensions, then preview the first five rows
print("Initial DataFrame shape:", df.shape)
print(df.head(n=5))

# ------------------------ #
# 2. Data Cleaning
# ------------------------ #

# 2.1 Keep only movies that received at least one vote
df = df.loc[df['vote_count'].gt(0)]
print("\nAfter removing rows with vote_count = 0:", df.shape)

# 2.2 Keep only movies that report a positive revenue
df = df.loc[df['revenue'].gt(0)]
print("After removing rows with revenue = 0:", df.shape)

# 2.3 Keep only the first row seen for each movie 'id'
df = df.loc[~df.duplicated(subset='id')]
print("After removing duplicate movies:", df.shape)

# ------------------------ #
# 3. Select Relevant Columns
# ------------------------ #

# Columns carried forward into the metrics computation and the JSON export
relevant_cols = [
    'id',
    'title',
    'vote_average',
    'vote_count',
    'revenue',
    'budget',
]

# Take an independent copy restricted to those columns (in the listed order)
df = df.loc[:, relevant_cols].copy()
print("\nSelected relevant columns:", df.shape)

# ------------------------ #
# 4. Compute Additional Metrics
# ------------------------ #

# Keep only movies with a strictly positive budget so the ROI division below
# never divides by zero. Note that ROI can still be negative: that happens
# whenever revenue is lower than budget.
# The filter runs before the metrics are computed so no work is spent on rows
# that are discarded, and the explicit copy makes the column assignments below
# operate on an independent frame rather than on a filtered slice.
df = df[df['budget'] > 0].copy()
print("\nAfter removing movies with budget <= 0:", df.shape)

# Profit: absolute gain (or loss, when negative) of the movie
df['profit'] = df['revenue'] - df['budget']

# ROI: profit relative to budget (e.g. 1.0 means the movie earned back twice
# its budget)
df['ROI'] = df['profit'] / df['budget']

# Optional: handle extreme ROI values if necessary
# For example, limit ROI to a reasonable range
# df = df[df['ROI'].between(-1, 5)]

# ------------------------ #
# 5. Save Processed Data as JSON
# ------------------------ #

# Convert the DataFrame to a list of per-movie dictionaries. Missing values
# (NaN/None) are mapped to None so they are written as JSON `null`; by default
# the json module would emit the bare token `NaN`, which is not valid JSON and
# is rejected by strict parsers.
processed_data = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in df.to_dict(orient='records')
]

# Serialize fully before touching the file: allow_nan=False makes any remaining
# non-finite float (e.g. infinity) raise ValueError instead of silently
# producing invalid JSON, and doing it up front means a failure cannot leave a
# truncated 'metrics.json' behind.
json_text = json.dumps(processed_data, indent=2, allow_nan=False)

# Save to 'metrics.json'
with open('metrics.json', 'w', encoding='utf-8') as f:
    f.write(json_text)

print("\nProcessing complete. 'metrics.json' has been created.")



Initial DataFrame shape: (49555, 28)
       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult                     backdrop_path  ...  \
0   825532764      148  False  /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg  ...   
1   701729206      169  False  /pbrkL804c8yAv3zBZR4QPEafpAR.jpg  ...   
2  1004558444      152  False  /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg  ...   
3  2923706026      162  False  /vL5LR6WdxWPjLPFRLe133jXWsh5.jpg  ...   
4  1518815515      143  False  /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg  ...   

                                             tagline  \