# Movie Data Exploratory Data Analysis (EDA)

This notebook explores the processed movie dataset: genre preferences, studio success, budget versus box office performance, and temporal patterns (release trends by year, daily sales, and weekend versus weekday performance).

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Suppress every warning for the rest of the session to keep cell output uncluttered
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette,
# so the palette is the last thing to set the colour cycle
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# DataFrame display: show every column, and at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 1. Data Loading and Overview

In [None]:
# Load processed datasets
# The directory is named once so that moving the data only requires editing one line.
PROCESSED_DIR = '../data/processed'

movies = pd.read_csv(f'{PROCESSED_DIR}/movies_processed.csv')
sales = pd.read_csv(f'{PROCESSED_DIR}/sales_processed.csv')
genre_stats = pd.read_csv(f'{PROCESSED_DIR}/genre_stats.csv')
studio_stats = pd.read_csv(f'{PROCESSED_DIR}/studio_stats.csv')
monthly_sales = pd.read_csv(f'{PROCESSED_DIR}/monthly_sales.csv')

# Convert date columns (CSV stores them as text)
movies['release_date'] = pd.to_datetime(movies['release_date'])
sales['date'] = pd.to_datetime(sales['date'])

# Report the shape of every frame loaded above, monthly_sales included
print("Dataset Shapes:")
loaded_frames = [
    ("Movies", movies),
    ("Sales", sales),
    ("Genre Stats", genre_stats),
    ("Studio Stats", studio_stats),
    ("Monthly Sales", monthly_sales),
]
for label, frame in loaded_frames:
    print(f"{label}: {frame.shape}")

In [None]:
# Preview the first five rows of the movies table
movies.head(n=5)

In [None]:
# Descriptive statistics for the movies table (DataFrame.describe with default settings)
movies_summary = movies.describe()
movies_summary

## 2. Genre Analysis

In [None]:
# Genre overview: one pie chart plus three horizontal bar charts on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
# Row-major unpacking: top-left, top-right, bottom-left, bottom-right
ax_share, ax_avg_gross, ax_rating, ax_total_gross = axes.ravel()

# Top-left: share of movies per genre
genre_counts = movies['genre'].value_counts()
ax_share.pie(genre_counts.values, labels=genre_counts.index, autopct='%1.1f%%')
ax_share.set_title('Movie Distribution by Genre')

# Each bar panel sorts ascending so the largest bar ends up at the top of the barh chart

# Top-right: average gross per genre
genre_stats_sorted = genre_stats.sort_values(by='avg_gross')
ax_avg_gross.barh(genre_stats_sorted['genre'], genre_stats_sorted['avg_gross'])
ax_avg_gross.set_title('Average Gross Revenue by Genre')
ax_avg_gross.set_xlabel('Average Gross ($)')

# Bottom-left: average rating per genre
genre_stats_rating = genre_stats.sort_values(by='avg_rating')
ax_rating.barh(genre_stats_rating['genre'], genre_stats_rating['avg_rating'])
ax_rating.set_title('Average IMDb Rating by Genre')
ax_rating.set_xlabel('Average Rating')

# Bottom-right: total gross per genre
genre_stats_total = genre_stats.sort_values(by='total_gross')
ax_total_gross.barh(genre_stats_total['genre'], genre_stats_total['total_gross'])
ax_total_gross.set_title('Total Gross Revenue by Genre')
ax_total_gross.set_xlabel('Total Gross ($)')

plt.tight_layout()
plt.show()

## 3. Studio Performance Analysis

In [None]:
# Rank studios by total gross and keep the ten largest
top_studios = studio_stats.nlargest(10, 'total_gross')

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_ranking, ax_bubbles = axes

# Left panel: bar chart of the top ten, with studio names as rotated tick labels
bar_positions = range(len(top_studios))
ax_ranking.bar(bar_positions, top_studios['total_gross'])
ax_ranking.set_xticks(bar_positions)
ax_ranking.set_xticklabels(top_studios['studio'], rotation=45, ha='right')
ax_ranking.set_title('Top 10 Studios by Total Gross Revenue')
ax_ranking.set_ylabel('Total Gross ($)')

# Right panel: every studio as a bubble; marker size is total gross divided by 1e6
bubble_sizes = studio_stats['total_gross'] / 1e6
ax_bubbles.scatter(
    studio_stats['movie_count'],
    studio_stats['avg_gross'],
    s=bubble_sizes,
    alpha=0.6,
)
ax_bubbles.set_xlabel('Number of Movies')
ax_bubbles.set_ylabel('Average Gross per Movie ($)')
ax_bubbles.set_title('Studio Performance: Movie Count vs Average Gross\n(Bubble size = Total Gross)')

# Name the five highest-grossing studios next to their bubbles
labelled_studios = studio_stats.nlargest(5, 'total_gross')
for studio_name, n_movies, avg_gross in zip(
    labelled_studios['studio'],
    labelled_studios['movie_count'],
    labelled_studios['avg_gross'],
):
    ax_bubbles.annotate(
        studio_name,
        (n_movies, avg_gross),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=8,
    )

plt.tight_layout()
plt.show()

## 4. Budget vs Performance Analysis

In [None]:
# Budget vs performance: scatter, ROI histogram, ROI by budget category, performance pie
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# The ROI panels only use rows whose ROI is a finite number. An infinite ROI
# would make the histogram's automatic range computation fail and would turn a
# category mean into inf. When every ROI is finite this filter is a no-op.
# NOTE(review): presumably non-finite ROI would come from zero budgets — confirm upstream.
movies_finite_roi = movies[np.isfinite(movies['roi'])]
roi_median = movies_finite_roi['roi'].median()

# Budget vs Total Gross, coloured by rating
scatter = axes[0,0].scatter(movies['budget'], movies['total_gross'],
                          c=movies['imdb_rating'], cmap='viridis', alpha=0.6)
axes[0,0].set_xlabel('Budget ($)')
axes[0,0].set_ylabel('Total Gross ($)')
axes[0,0].set_title('Budget vs Total Gross (Color = IMDb Rating)')
plt.colorbar(scatter, ax=axes[0,0])

# ROI distribution with the median marked
axes[0,1].hist(movies_finite_roi['roi'], bins=50, alpha=0.7, edgecolor='black')
axes[0,1].set_xlabel('Return on Investment (%)')
axes[0,1].set_ylabel('Number of Movies')
axes[0,1].set_title('Distribution of ROI')
axes[0,1].axvline(roi_median, color='red', linestyle='--',
                  label=f'Median: {roi_median:.1f}%')
axes[0,1].legend()

# Budget category analysis (only the mean is plotted; median and count are kept for inspection)
budget_perf = movies_finite_roi.groupby('budget_category')['roi'].agg(['mean', 'median', 'count'])
axes[1,0].bar(budget_perf.index, budget_perf['mean'])
axes[1,0].set_title('Average ROI by Budget Category')
axes[1,0].set_ylabel('Average ROI (%)')
axes[1,0].tick_params(axis='x', rotation=45)

# Performance category distribution
perf_counts = movies['performance'].value_counts()
axes[1,1].pie(perf_counts.values, labels=perf_counts.index, autopct='%1.1f%%')
axes[1,1].set_title('Movie Performance Categories')

plt.tight_layout()
plt.show()

## 5. Time Series Analysis

In [None]:
# Per-release-year summary: how many movies, and their average/total gross, budget and rating
release_trends = (
    movies.groupby('release_year')
    .agg(
        movie_count=('movie_id', 'count'),
        avg_gross=('total_gross', 'mean'),
        total_gross=('total_gross', 'sum'),
        avg_budget=('budget', 'mean'),
        avg_rating=('imdb_rating', 'mean'),
    )
    .reset_index()
)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# One line chart per metric: (column, line colour or None for the default, title, y-axis label)
trend_panels = [
    ('movie_count', None, 'Number of Movies Released per Year', 'Movie Count'),
    ('avg_gross', 'green', 'Average Gross Revenue per Year', 'Average Gross ($)'),
    ('avg_budget', 'red', 'Average Budget per Year', 'Average Budget ($)'),
    ('avg_rating', 'purple', 'Average IMDb Rating per Year', 'Average Rating'),
]

# axes.flat walks the grid row by row, matching the panel order above
for panel_ax, (metric, line_colour, panel_title, y_label) in zip(axes.flat, trend_panels):
    line_kwargs = {'marker': 'o'}
    if line_colour is not None:
        line_kwargs['color'] = line_colour
    panel_ax.plot(release_trends['release_year'], release_trends[metric], **line_kwargs)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Year')
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## 6. Daily Sales Analysis

In [None]:
# Collapse the sales table to one row per date: summed tickets and revenue,
# plus the number of distinct movies with sales that day
daily_sales_agg = (
    sales.groupby('date')
    .agg(
        tickets_sold=('tickets_sold', 'sum'),
        revenue=('revenue', 'sum'),
        movie_id=('movie_id', 'nunique'),
    )
    .reset_index()
)

fig, axes = plt.subplots(2, 1, figsize=(16, 10))
ax_tickets, ax_revenue = axes
sale_dates = daily_sales_agg['date']

# Upper panel: tickets sold per day
ax_tickets.plot(sale_dates, daily_sales_agg['tickets_sold'], alpha=0.7)
ax_tickets.set_title('Daily Ticket Sales Over Time')
ax_tickets.set_ylabel('Tickets Sold')
ax_tickets.grid(True, alpha=0.3)

# Lower panel: revenue per day
ax_revenue.plot(sale_dates, daily_sales_agg['revenue'], alpha=0.7, color='green')
ax_revenue.set_title('Daily Revenue Over Time')
ax_revenue.set_xlabel('Date')
ax_revenue.set_ylabel('Revenue ($)')
ax_revenue.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Weekend vs weekday analysis: mean tickets sold and mean revenue per sales row
weekend_analysis = sales.groupby('is_weekend').agg({
    'tickets_sold': 'mean',
    'revenue': 'mean'
}).reset_index()

# The label lookup is keyed on True/False. A flag column read back from CSV may
# arrive as 0/1 integers, which may not match boolean keys, so numeric flags are
# coerced to bool first. Boolean input is unaffected; non-numeric input is left
# as-is and mapped exactly as before.
weekend_flag = weekend_analysis['is_weekend']
if pd.api.types.is_numeric_dtype(weekend_flag):
    weekend_flag = weekend_flag.astype(bool)
weekend_analysis['day_type'] = weekend_flag.map({True: 'Weekend', False: 'Weekday'})

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Average tickets sold: Weekend vs Weekday
axes[0].bar(weekend_analysis['day_type'], weekend_analysis['tickets_sold'])
axes[0].set_title('Average Daily Tickets Sold: Weekend vs Weekday')
axes[0].set_ylabel('Average Tickets Sold')

# Average revenue: Weekend vs Weekday
axes[1].bar(weekend_analysis['day_type'], weekend_analysis['revenue'], color='green')
axes[1].set_title('Average Daily Revenue: Weekend vs Weekday')
axes[1].set_ylabel('Average Revenue ($)')

plt.tight_layout()
plt.show()

# Print exact numbers
print("Weekend vs Weekday Performance:")
print(weekend_analysis)

## 7. Key Insights and Conclusions

In [None]:
# Generate key insights: a plain-text summary built from the frames computed in earlier cells
print("=== KEY INSIGHTS FROM MOVIE DATA ANALYSIS ===")
print()

# Row labels of the record-holding movies, looked up once and reused for title and value
best_roi_idx = movies['roi'].idxmax()
worst_roi_idx = movies['roi'].idxmin()
top_rated_idx = movies['imdb_rating'].idxmax()

# Top performing metrics
top_genre_gross = genre_stats.loc[genre_stats['avg_gross'].idxmax(), 'genre']
top_genre_rating = genre_stats.loc[genre_stats['avg_rating'].idxmax(), 'genre']
top_studio = studio_stats.loc[studio_stats['total_gross'].idxmax(), 'studio']
most_active_studio = studio_stats.loc[studio_stats['movie_count'].idxmax(), 'studio']
best_roi = movies.loc[best_roi_idx, 'title']
worst_roi = movies.loc[worst_roi_idx, 'title']

print("📊 GENRE INSIGHTS:")
print(f"   • Highest grossing genre: {top_genre_gross}")
print(f"   • Highest rated genre: {top_genre_rating}")
print(f"   • Most popular genre: {movies['genre'].mode()[0]}")
print()

print("🎬 STUDIO INSIGHTS:")
print(f"   • Top grossing studio: {top_studio}")
print(f"   • Most active studio: {most_active_studio}")
print()

print("💰 FINANCIAL INSIGHTS:")
print(f"   • Best ROI movie: {best_roi} ({movies.loc[best_roi_idx, 'roi']:.1f}%)")
print(f"   • Worst ROI movie: {worst_roi} ({movies.loc[worst_roi_idx, 'roi']:.1f}%)")
print(f"   • Average ROI: {movies['roi'].mean():.1f}%")
print(f"   • Median budget: ${movies['budget'].median():,.0f}")
print(f"   • Median gross: ${movies['total_gross'].median():,.0f}")
print()

print("📅 TEMPORAL INSIGHTS:")
# Look the two averages up by label rather than by position, so a missing
# Weekend or Weekday row yields an "n/a" line instead of an IndexError.
tickets_by_day_type = dict(zip(weekend_analysis['day_type'], weekend_analysis['tickets_sold']))
weekend_avg = tickets_by_day_type.get('Weekend')
weekday_avg = tickets_by_day_type.get('Weekday')
if weekend_avg is None or weekday_avg is None or weekday_avg == 0:
    # Keep the name defined (as NaN) so later cells can still reference it
    weekend_boost = float('nan')
    print("   • Weekend sales boost: n/a (weekend or weekday average unavailable)")
else:
    weekend_boost = (weekend_avg / weekday_avg - 1) * 100
    print(f"   • Weekend sales boost: {weekend_boost:.1f}% higher than weekdays")
print(f"   • Peak release year: {release_trends.loc[release_trends['movie_count'].idxmax(), 'release_year']}")
print()

print("⭐ RATING INSIGHTS:")
print(f"   • Average IMDb rating: {movies['imdb_rating'].mean():.1f}/10")
print(f"   • Highest rated movie: {movies.loc[top_rated_idx, 'title']} ({movies['imdb_rating'].max():.1f}/10)")
print()

# Performance categories
profitable_pct = (len(movies[movies['profit'] > 0]) / len(movies)) * 100
print("📈 SUCCESS METRICS:")
print(f"   • Profitable movies: {profitable_pct:.1f}%")
print(f"   • Movies with ROI > 100%: {len(movies[movies['roi'] > 100])}")
print(f"   • Blockbuster movies (budget > $100M): {len(movies[movies['budget'] > 100_000_000])}")