In [None]:

# Import all libraries used throughout the notebook
import pandas as pd
import requests
from dotenv import load_dotenv
import os
import time
import json

# Pull variables via python-dotenv's load_dotenv(), then read the TMDB key
# from the process environment (None when the variable is not set).
load_dotenv()
API_KEY = os.environ.get('TMDB_API_KEY')

# Report only whether a key was found -- never echo the key itself.
print(f"API Key loaded: {'Yes' if API_KEY else 'No'}")

In [None]:
# collecting movie data across multiple years

def collect_movie_data(start_year=2010, end_year=2024, pages_per_year=5,
                       request_delay=0.25, page_delay=0.5):
    """
    Collect detailed movie records across a range of years.

    For every year from ``start_year`` to ``end_year`` (inclusive) this walks
    result pages 1..``pages_per_year`` from ``discover_movies``, looks up each
    listed movie with ``get_movie_details``, and keeps only the records whose
    ``budget`` and ``revenue`` are both greater than zero.

    Args:
        start_year: First year to query (inclusive).
        end_year: Last year to query (inclusive).
        pages_per_year: Number of result pages to request per year.
        request_delay: Seconds to pause after each detail lookup. The default
            of 0.25 allows about 4 requests per second to avoid rate limiting.
        page_delay: Seconds to pause after finishing each page.

    Returns:
        list: The records returned by ``get_movie_details`` that passed the
        budget/revenue filter, in the order they were fetched. Empty when the
        year range or page count is empty.
    """
    all_movies = []

    for year in range(start_year, end_year + 1):
        print(f"\nCollecting movies from {year}...")

        for page in range(1, pages_per_year + 1):
            # Treat a falsy result (e.g. None from a failed request) as an
            # empty page so one bad page does not abort the whole run.
            # NOTE(review): assumes discover_movies yields dicts with an 'id'
            # key on success -- confirm against its definition.
            movies = discover_movies(page=page, year=year) or []

            # Fetch detailed info for each movie listed on this page
            for movie in movies:
                movie_id = movie['id']
                details = get_movie_details(movie_id)

                if details:
                    # `or 0` covers both a missing key and an explicit None
                    # value; comparing None > 0 would raise TypeError.
                    budget = details.get('budget') or 0
                    revenue = details.get('revenue') or 0

                    # Only keep movies with budget AND revenue data
                    if budget > 0 and revenue > 0:
                        all_movies.append(details)

                time.sleep(request_delay)  # throttle detail lookups

            # The count shown is the running total so far, not a per-page count.
            print(f"  Page {page}: {len(all_movies)} movies with budget/revenue data")
            time.sleep(page_delay)

    return all_movies

# Run the crawl for 2020-2023, three result pages per year.
print("Starting data collection...", "This will take a few minutes!", sep="\n")
movies_data = collect_movie_data(
    start_year=2020,
    end_year=2023,
    pages_per_year=3,
)
print(f"\n✓ Collection complete! Total movies: {len(movies_data)}")

In [None]:
# Build a DataFrame from the collected movie records
df = pd.DataFrame(data=movies_data)

# Quick overview: dimensions, column names, then a preview of the first rows
print(f"Dataset shape: {df.shape}")
print(f"\nColumns available:")
print(df.columns.to_list())
print(f"\nFirst few rows:")
df.head(5)

In [None]:
# Count the null entries in every column
print("Missing values:")
print(df.isna().sum())

# Headline averages for budget, revenue and rating
print(f"\n--- Basic Statistics ---")
print(f"Average budget: ${df['budget'].mean():,.0f}")
print(f"Average revenue: ${df['revenue'].mean():,.0f}")
print(f"Average rating: {df['vote_average'].mean():.2f}")

# Summary statistics (count, mean, std, quartiles, ...) for revenue
print(f"\nRevenue distribution:")
print(df['revenue'].describe())

In [None]:
# Save to CSV so you don't have to fetch again.
# Create the output folder first: without it the write raises
# FileNotFoundError after the slow collection step has already run.
os.makedirs('../data', exist_ok=True)

df.to_csv('../data/movies_raw.csv', index=False)
print("✓ Data saved to data/movies_raw.csv")

# Also save the raw list of collected records as a JSON backup.
# An explicit encoding keeps the file independent of the platform default.
with open('../data/movies_raw.json', 'w', encoding='utf-8') as f:
    json.dump(movies_data, f)
print("✓ Backup saved to data/movies_raw.json")