# TMDB Data Cleaning & Exploration

This notebook handles the cleaning and preprocessing of raw TMDB movie data.

## Objectives
1. Load raw JSON data
2. Flatten nested JSON columns
3. Clean data types and handle missing values
4. Feature engineering
5. Export cleaned data

In [None]:
import pandas as pd
import numpy as np
import json
import os
import sys
from pathlib import Path
import yaml
import ast

# Add project root to path to access src
# '..' is resolved against the current working directory, so this assumes the
# notebook is started from its own folder, one level below the project root.
sys.path.append('..')
# Must come after the sys.path tweak above, otherwise `src` is not importable.
from src.utils.helpers import setup_logging

# Set display options
# None removes the column cap so wide frames are shown without truncation;
# rows are capped at 100 to keep cell output manageable.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Setup Logging
# NOTE(review): setup_logging is a project helper not visible here; it is
# presumably configured from the YAML file and returns a logger - confirm.
logger = setup_logging(config_path='../config/config.yaml', module_name='notebook_cleaning')

## 1. Load Configuration & Data

Load file paths from config and read raw JSON files.

In [None]:
logger.info("Starting data cleaning process...")

# Load config
# Resolve the raw-data directory from the project config. Any failure to read
# the expected setting falls back to the conventional location with a warning
# instead of aborting the notebook.
try:
    with open('../config/config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    raw_path = Path('..') / config['paths']['raw_data']
    logger.info(f"Raw data path: {raw_path}")
except FileNotFoundError:
    # Fallback if running from notebook dir without parent context
    raw_path = Path('../data/raw')
    logger.warning(f"Config not found, using default path: {raw_path}")
except (KeyError, TypeError):
    # An empty YAML file loads as None (TypeError on subscripting) and a
    # config without paths/raw_data raises KeyError; treat both like a
    # missing config.
    raw_path = Path('../data/raw')
    logger.warning(f"Config has no paths.raw_data entry, using default path: {raw_path}")

# Load JSON files
data_list = []
if raw_path.exists():
    # glob() yields files in an unspecified, filesystem-dependent order.
    # Sorting makes the row order reproducible, and with it the row that
    # drop_duplicates(keep='first') retains later on.
    json_files = sorted(raw_path.glob('*.json'))
    logger.info(f"Found {len(json_files)} JSON files")

    for file in json_files:
        # Keep the try body to the I/O and parsing that can actually fail.
        # OSError covers unreadable files; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            logger.error(f"Error reading {file}: {e}")
            continue

        if isinstance(data, dict):
            # One record per file.
            data_list.append(data)
        elif isinstance(data, list):
            # A file holding several records: add each one as its own row
            # rather than appending the whole list as a single bogus row.
            data_list.extend(item for item in data if isinstance(item, dict))
        else:
            logger.warning(f"Skipping {file}: unexpected top-level JSON type {type(data).__name__}")
else:
    logger.error("Raw data directory does not exist!")

df = pd.DataFrame(data_list)
logger.info(f"Initial DataFrame shape: {df.shape}")
df.head()

## 2. Data Cleaning

### 2.1 Drop Irrelevant Columns

In [None]:
# Columns that are not needed for the downstream analysis.
cols_to_drop = ['adult', 'imdb_id', 'original_title', 'video', 'homepage']

# Restrict the drop to columns that are actually present, so a payload with a
# partial schema does not raise.
present_cols = set(df.columns)
existing_cols = [name for name in cols_to_drop if name in present_cols]

# Work on an independent copy so the raw frame `df` stays untouched.
df_clean = df.drop(columns=existing_cols).copy()

logger.info(f"Dropped columns: {existing_cols}")
logger.info(f"New shape: {df_clean.shape}")

### 2.2 Flatten Nested Columns

Extract data from: `belongs_to_collection`, `genres`, `production_countries`, `production_companies`, `spoken_languages`.

In [None]:
def extract_name(data):
    """Return the 'name' entry of a dict, or NaN for any non-dict input.

    A dict without a 'name' key yields None (the result of ``dict.get``).
    """
    return data.get('name') if isinstance(data, dict) else np.nan

def extract_names_list(data, key='name', separator='|'):
    """Join one field from a list of dicts into a single delimited string.

    Args:
        data: Expected to be a list of dicts; any other type yields NaN.
        key: Dict key whose value is collected from each entry.
        separator: String placed between the collected values.

    Returns:
        The collected values joined by ``separator``, or NaN when ``data`` is
        not a list or no entry contributes a value. Non-dict entries and
        entries whose value is missing or falsy (None, '', 0) are skipped.
    """
    if not isinstance(data, list):
        return np.nan

    values = [
        item.get(key)
        for item in data
        if isinstance(item, dict) and item.get(key)
    ]
    if not values:
        return np.nan

    # str() keeps the join from raising TypeError when `key` selects a
    # non-string field such as a numeric id.
    return separator.join(str(value) for value in values)

# Apply extractions
logger.info("Flattening nested JSON columns...")

# Guard every source column, as the other cleaning cells do, so a payload that
# lacks one nested field does not abort the run with a KeyError.
if 'belongs_to_collection' in df_clean.columns:
    # The collection is a single object, so only its name is kept.
    df_clean['collection_name'] = df_clean['belongs_to_collection'].apply(extract_name)

# These columns are expected to hold lists of dicts with a 'name' entry. Each
# is flattened in place to one '|'-separated string (extract_names_list's
# default key and separator). The function is passed directly; wrapping it in
# a lambda adds nothing.
for col in ['genres', 'production_countries', 'production_companies', 'spoken_languages']:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].apply(extract_names_list)

# Inspect results
preview_cols = [col for col in ['genres', 'collection_name'] if col in df_clean.columns]
df_clean[preview_cols].head()

### 2.3 Handle Missing & Incorrect Data

1. Convert data types (numeric columns and `release_date`)
2. Replace unrealistic values (zero budget/revenue/runtime, placeholder text) with NaN

In [None]:
logger.info("Cleaning datatypes...")

# Convert numeric columns
# errors='coerce' turns unparseable entries into NaN instead of raising.
numeric_cols = ['budget', 'id', 'popularity', 'revenue', 'vote_count', 'vote_average', 'runtime']

for col in numeric_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Convert release_date
# Unparseable dates become NaT rather than raising.
if 'release_date' in df_clean.columns:
    df_clean['release_date'] = pd.to_datetime(df_clean['release_date'], errors='coerce')

# Handle zero values in budget/revenue/runtime
# A zero here is treated as "unknown", so it is replaced with NaN.
for col in ['budget', 'revenue', 'runtime']:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].replace(0, np.nan)

# Create million USD columns
# Guarded like the loops above: those treat budget/revenue as optional, so the
# derived columns must not raise KeyError when a source column is absent.
for source_col, musd_col in (('budget', 'budget_musd'), ('revenue', 'revenue_musd')):
    if source_col in df_clean.columns:
        df_clean[musd_col] = df_clean[source_col] / 1_000_000

# Handle text placeholders
# Exact, case-sensitive matches of these strings are replaced with NaN.
text_cols = ['overview', 'tagline']
placeholders = ['No Data', 'No Overview', 'n/a', 'nan']
for col in text_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].replace(placeholders, np.nan)

logger.info("Datatypes cleaned.")
df_clean.info()

### 2.4 Filtering

1. Remove duplicates
2. Drop rows with unknown id/title
3. Threshold filtering (keep rows with at least 10 non-null values)
4. Status filtering (keep only movies whose status is `Released`)

In [None]:
logger.info("Filtering data...")

# Remember the starting size so the total number of removed rows can be logged.
initial_len = len(df_clean)

# The three steps run in this exact order:
#   1. keep the first row for every id,
#   2. drop rows that lack an id or a title,
#   3. keep only rows with at least 10 non-null values.
df_clean = (
    df_clean
    .drop_duplicates(subset=['id'], keep='first')
    .dropna(subset=['id', 'title'])
    .dropna(thresh=10)
)

# Keep released movies only; afterwards the status column is constant, so it
# is dropped.
if 'status' in df_clean.columns:
    released_mask = df_clean['status'].eq('Released')
    df_clean = df_clean.loc[released_mask].drop(columns=['status'])

rows_removed = initial_len - len(df_clean)
logger.info(f"Rows removed: {rows_removed}")
logger.info(f"Final count: {len(df_clean)}")

## 3. Feature Engineering

In [None]:
logger.info("Performing feature engineering...")

# Extract Cast & Crew info (Basic extraction)
# The helper below reduces one `credits` payload to four values: the first
# five cast names, the cast size, the director's name and the crew size.
def extract_cast_info(credits_data):
    """Summarise a credits payload as cast string, cast size, director, crew size.

    Args:
        credits_data: Expected to be a dict with optional 'cast' and 'crew'
            lists of person dicts. Any other type is treated as "no credits".

    Returns:
        A four-element ``pd.Series``:
        0. names of the first five cast entries joined by '|' (entries
           without a name are skipped), or NaN when there are none,
        1. number of cast entries,
        2. name of the first crew entry whose job is 'Director', or NaN,
        3. number of crew entries.
    """
    if not isinstance(credits_data, dict):
        return pd.Series([np.nan, 0, np.nan, 0])

    # `.get(key, [])` still returns None when the key is present but null,
    # so normalise anything that is not a list to an empty list.
    cast = credits_data.get('cast')
    if not isinstance(cast, list):
        cast = []
    crew = credits_data.get('crew')
    if not isinstance(crew, list):
        crew = []

    # Top 5 cast. Entries that are not dicts or have no name are skipped so
    # the join cannot fail on None.
    top_cast = [
        str(person['name'])
        for person in cast[:5]
        if isinstance(person, dict) and person.get('name')
    ]
    cast_str = '|'.join(top_cast) if top_cast else np.nan

    # Director: first matching crew entry wins.
    director = next(
        (
            person.get('name')
            for person in crew
            if isinstance(person, dict) and person.get('job') == 'Director'
        ),
        np.nan,
    )

    return pd.Series([cast_str, len(cast), director, len(crew)])

# extract_cast_info returns a four-element Series per row; the values are
# assigned positionally to the four new columns.
if 'credits' in df_clean.columns:
    df_clean[['cast', 'cast_size', 'director', 'crew_size']] = df_clean['credits'].apply(extract_cast_info)

# Release Year
# Guarded like the release_date conversion in the datatype-cleaning cell, so a
# payload without that column does not raise KeyError here.
if 'release_date' in df_clean.columns:
    df_clean['release_year'] = df_clean['release_date'].dt.year

# Force specific columns to string type
# Only non-missing entries are converted. A blanket astype(str) would turn
# None into the literal text 'None', and mapping 'nan' back to NaN afterwards
# would also erase a genuine value that happens to be spelled "nan".
string_cols = ['tagline', 'title', 'collection_name']
for col in string_cols:
    if col in df_clean.columns:
        values = df_clean[col]
        # where() keeps entries where the condition holds (missing ones) and
        # takes the stringified value everywhere else.
        df_clean[col] = values.where(values.isna(), values.astype(str))

### 3.1 Data Anomaly Inspection

In [None]:
# Print the most frequent values of each flattened column so malformed or
# unexpected entries are easy to spot by eye.
logger.info("Inspecting column values for anomalies...")
inspection_cols = ['genres', 'collection_name', 'production_countries', 'production_companies', 'spoken_languages']
for col in inspection_cols:
    if col not in df_clean.columns:
        continue
    logger.info(f"\n--- Top 10 values for {col} ---")
    top_values = df_clean[col].value_counts().head(10)
    print(top_values)

## 4. Finalize & Save

In [None]:
# Column order of the exported dataset. Columns not listed here are dropped.
# NOTE(review): `release_year` is created in the feature-engineering cell but
# is not listed, so it is not exported - confirm that this is intended.
desired_order = [
    'id',
    'title',
    'tagline',
    'release_date',
    'genres',
    'collection_name',
    'original_language',
    'budget_musd',
    'revenue_musd',
    'production_companies',
    'production_countries',
    'vote_count',
    'vote_average',
    'popularity',
    'runtime',
    'overview',
    'spoken_languages',
    'poster_path',
    'cast',
    'cast_size',
    'director',
    'crew_size',
]

# Keep only the requested columns that exist, in the requested order, and
# renumber the rows from zero after the earlier filtering.
final_cols = [name for name in desired_order if name in df_clean.columns]
df_final = df_clean[final_cols].copy().reset_index(drop=True)

logger.info(f"Final columns: {df_final.columns.tolist()}")

# Make sure the output directory exists before writing.
processed_path = Path('../data/processed')
processed_path.mkdir(parents=True, exist_ok=True)

# CSV export.
output_file = processed_path / 'movies_cleaned.csv'
df_final.to_csv(output_file, index=False)
logger.info(f"Saved to {output_file}")

# Parquet export of the same frame for efficient loading.
parquet_file = processed_path / 'movies_cleaned.parquet'
df_final.to_parquet(parquet_file, index=False)
logger.info(f"Saved to parquet.")
logger.info("Data cleaning completed successfully.")