# Data Acquisition and Exploration

**Goal:** Load IMDb datasets, validate data quality, and create initial visualizations.

**Outputs:**
- Master dataset cached to parquet
- Movie count by year
- Rating distribution
- Vote count distribution
- Average rating over time
- Genre frequency

In [None]:
# Extend the import search path with the parent directory so that the
# `src` package imported below can be resolved.
# NOTE(review): assumes the notebook's working directory sits one level
# below the project root — confirm.
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project helpers used by the cells below to load, merge, and validate the data.
from src.data_loader import load_imdb_datasets, merge_master_dataset, validate_master_dataset

# Configure plotting
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# matplotlib >= 3.6 — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): show figures inline in the notebook.
%matplotlib inline

## 1. Load IMDb Datasets

Load title basics (metadata) and ratings from the official IMDb datasets.

In [None]:
# Fetch the two raw tables via the project loader, then report their sizes.
print("Loading IMDb datasets...")
basics, ratings = load_imdb_datasets()

# One print call, two lines of output (the leading "\n" leaves a blank line
# after the progress message above).
print(
    f"\nBasics: {len(basics):,} movies",
    f"Ratings: {len(ratings):,} titles with ratings",
    sep="\n",
)

## 2. Merge into Master Dataset

Combine basics and ratings, then apply a minimum vote threshold to keep only movies with substantial audience engagement.

In [None]:
# Vote threshold handed to the merge helper; 1,000 is a starting point
# and can be adjusted later.
MIN_VOTES = 1_000

print(f"Merging datasets with min_votes={MIN_VOTES:,}...")
master = merge_master_dataset(
    basics,
    ratings,
    min_votes=MIN_VOTES,
)

print(f"\nMaster dataset: {master.shape[0]:,} movies")

# Last expression of the cell: the notebook renders the first ten rows.
master.head(10)

## 3. Validate Data Quality

In [None]:
# Run the project's validation helper and print each result on its own line,
# with the check name left-aligned in a 20-character column.
validation_results = validate_master_dataset(master)

summary_lines = [
    f"{check:20s}: {outcome}"
    for check, outcome in validation_results.items()
]

print("\n=== Validation Summary ===")
for line in summary_lines:
    print(line)

## 4. Exploratory Visualizations

### 4.1 Movie Count by Year

In [None]:
# Bar chart of how many movies in `master` fall in each release year,
# with dashed markers at the candidate cutoff years.
fig, ax = plt.subplots(figsize=(14, 6))

# Count movies by year (value_counts drops missing years; sort_index puts
# the counts in chronological order)
year_counts = master['year'].value_counts().sort_index()

# Plot from 1950 onwards for clarity
year_counts_recent = year_counts[year_counts.index >= 1950]

ax.bar(year_counts_recent.index, year_counts_recent.values, color='steelblue', alpha=0.7)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of Movies', fontsize=12)
ax.set_title(f'Movie Count by Year (≥{MIN_VOTES:,} votes)', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Mark candidate cutoff years
cutoffs = [2000, 2008, 2012, 2018, 2020]
for cutoff in cutoffs:
    ax.axvline(cutoff, color='red', linestyle='--', alpha=0.5, linewidth=1)

plt.tight_layout()
plt.show()

# The peak is taken over ALL years, not only the plotted (>= 1950) range.
# Format the year with .0f so a float-typed year prints as "2017" rather
# than "2017.0", consistent with how years are formatted elsewhere in this
# notebook.
print(f"\nPeak year: {year_counts.idxmax():.0f} ({year_counts.max():,} movies)")

### 4.2 Rating Distribution

In [None]:
# Two side-by-side views of the IMDb rating column: a histogram annotated
# with mean/median lines (left) and a box plot (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Compute the summary statistics once and reuse them for both the line
# position and its legend label.
imdb_ratings = master['imdb_rating']
mean_rating = imdb_ratings.mean()
median_rating = imdb_ratings.median()

# Histogram
axes[0].hist(imdb_ratings, bins=50, color='coral', alpha=0.7, edgecolor='black')
axes[0].axvline(mean_rating, color='red', linestyle='--',
                linewidth=2, label=f'Mean: {mean_rating:.2f}')
axes[0].axvline(median_rating, color='green', linestyle='--',
                linewidth=2, label=f'Median: {median_rating:.2f}')
axes[0].set_xlabel('IMDb Rating', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('Rating Distribution', fontsize=13, fontweight='bold')
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)

# Box plot. Vertical orientation is matplotlib's default, so the `vert`
# keyword (deprecated in newer matplotlib releases in favour of
# `orientation`) is intentionally not passed.
axes[1].boxplot(imdb_ratings, patch_artist=True,
                boxprops=dict(facecolor='lightblue', alpha=0.7))
axes[1].set_ylabel('IMDb Rating', fontsize=12)
axes[1].set_title('Rating Box Plot', fontsize=13, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("\nRating statistics:")
print(imdb_ratings.describe())

### 4.3 Vote Count Distribution (Log Scale)

In [None]:
# Histogram of log10(vote count) with dashed markers at round-number
# vote thresholds.
fig, ax = plt.subplots(figsize=(12, 6))

log_votes = np.log10(master['num_votes'])
ax.hist(log_votes, bins=50, color='mediumpurple', alpha=0.7, edgecolor='black')
ax.set_xlabel('Log10(Number of Votes)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Vote Count Distribution (Log Scale)', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Reference lines, each labelled with the un-logged vote count
for vote_mark in (1000, 10000, 100000):
    x_pos = np.log10(vote_mark)
    ax.axvline(x_pos, color='red', linestyle='--', alpha=0.4, linewidth=1)
    # Place the label at 90% of the current upper y-limit.
    label_y = ax.get_ylim()[1] * 0.9
    ax.text(x_pos, label_y, f'{vote_mark:,}',
            ha='center', fontsize=9, color='red')

plt.tight_layout()
plt.show()

print("\nVote count statistics:")
print(master['num_votes'].describe())

### 4.4 Rating Over Time

In [None]:
# Line chart of the mean IMDb rating per year (1950 onwards) with a shaded
# band of one standard deviation either side, plus candidate cutoff markers.
fig, ax = plt.subplots(figsize=(14, 6))

# Per-year mean, standard deviation, and row count of the rating column
yearly_ratings = master.groupby('year')['imdb_rating'].agg(['mean', 'std', 'count'])
yearly_ratings_recent = yearly_ratings.loc[yearly_ratings.index >= 1950]

years = yearly_ratings_recent.index
avg_rating = yearly_ratings_recent['mean']
spread = yearly_ratings_recent['std']

ax.plot(years, avg_rating, color='darkblue', linewidth=2, label='Mean Rating')
ax.fill_between(
    years,
    avg_rating - spread,
    avg_rating + spread,
    alpha=0.2,
    color='blue',
    label='±1 Std Dev',
)

ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('IMDb Rating', fontsize=12)
ax.set_title('Average IMDb Rating Over Time', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# Dashed vertical markers at the candidate cutoff years
cutoffs = [2000, 2008, 2012, 2018, 2020]
for cutoff_year in cutoffs:
    ax.axvline(cutoff_year, color='red', linestyle='--', alpha=0.5, linewidth=1)

plt.tight_layout()
plt.show()

## 5. Genre Analysis

In [None]:
# One row per (movie, genre) pair, then the 15 most frequent genres.
# NOTE(review): assumes `genres` holds list-like values so that explode
# actually splits them — confirm against the loader.
genre_exploded = master.explode('genres')
genre_counts = genre_exploded['genres'].value_counts().iloc[:15]

# Horizontal bar chart of those counts
fig, ax = plt.subplots(figsize=(12, 6))
genre_counts.plot.barh(ax=ax, color='teal', alpha=0.7)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Genre', fontsize=12)
ax.set_title('Top 15 Most Common Genres', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

n_genres = genre_exploded['genres'].nunique()
print(f"\nTotal unique genres: {n_genres}")

## 6. Sample High-Rated and Low-Rated Movies

In [None]:
# Show the ten highest- and ten lowest-rated movies, restricted to a few
# descriptive columns and printed without the index.
display_cols = ['title', 'year', 'imdb_rating', 'num_votes']

top_movies = master.nlargest(10, 'imdb_rating').loc[:, display_cols]
bottom_movies = master.nsmallest(10, 'imdb_rating').loc[:, display_cols]

sections = (
    ("=== Top 10 Highest-Rated Movies ===", top_movies),
    ("\n=== Top 10 Lowest-Rated Movies ===", bottom_movies),
)
for banner, table in sections:
    print(banner)
    print(table.to_string(index=False))

## 7. Summary Statistics

In [None]:
# Headline numbers for the master dataset, followed by movie counts per
# decade from the 1950s onwards.
year_col = master['year']
rating_col = master['imdb_rating']
votes_col = master['num_votes']

print("=== Master Dataset Summary ===")
print(f"Total movies: {len(master):,}")
print(f"Year range: {year_col.min():.0f} - {year_col.max():.0f}")
print(f"Rating range: {rating_col.min():.1f} - {rating_col.max():.1f}")
print(f"Median rating: {rating_col.median():.2f}")
print(f"Mean votes: {votes_col.mean():,.0f}")
print(f"Median votes: {votes_col.median():,.0f}")
print("\nMovies by decade:")

# Floor each year to the start of its decade; this adds a `decade`
# column to `master`.
master['decade'] = (year_col // 10) * 10
decade_counts = master['decade'].value_counts().sort_index()

modern_decades = decade_counts[decade_counts.index >= 1950]
for decade, count in modern_decades.items():
    print(f"  {decade:.0f}s: {count:,}")

## Next Steps

1. **Notebook 02**: Develop quality metrics (critical acclaim, legacy, technical)
2. **Notebook 03**: Test candidate cutoff years with statistical tests
3. **Notebook 04**: Analyze historical list composition
4. **Notebook 05**: Generate final visualizations and publish findings