# FDA Food Adverse Events - Data Preprocessing

This notebook handles data cleaning, transformation, and feature engineering to prepare the dataset for anomaly detection.

**Goals**:
1. Clean and standardize the data
2. Handle missing values appropriately
3. Create time series aggregations by reaction type
4. Engineer features for anomaly detection
5. Prepare train/test splits for modeling

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from collections import Counter
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the kernel session, so any warning raised by later cells is hidden.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the cell output
%matplotlib inline

# None removes the column limit, so displayed DataFrames show every column
pd.set_option('display.max_columns', None)

In [None]:
# Import custom modules
import sys
sys.path.append('../src')
from data_loader import FDADataLoader

## 1. Load Data

Load the full dataset (or a large sample) for preprocessing.

In [None]:
# Point the project loader at the raw export file
data_path = '../data/raw/food-event-0001-of-0001.json'
loader = FDADataLoader(data_path)

# max_records caps how many records are read; lower it if memory is tight
# (per the original note, None loads all data)
print("Loading data...")
df = loader.load_to_dataframe(max_records=100_000)
record_total = len(df)
memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"Loaded {record_total:,} records")
print(f"Memory usage: {memory_mb:.2f} MB")

## 2. Data Cleaning

In [None]:
# Keep only the first row seen for each report number
print(f"Records before deduplication: {len(df):,}")
is_repeat = df.duplicated(subset=['report_number'], keep='first')
df = df[~is_repeat]
print(f"Records after deduplication: {len(df):,}")

In [None]:
# Parse `date_started`; values that cannot be parsed become NaT
df = df.assign(date_started=pd.to_datetime(df['date_started'], errors='coerce'))

# A record without a date cannot be placed on a time axis, so drop it
print(f"Records before removing null dates: {len(df):,}")
has_date = df['date_started'].notna()
df = df[has_date]
print(f"Records after removing null dates: {len(df):,}")

In [None]:
# Keep only dates inside the window [2000-01-01, now]; both ends are inclusive
current_date = pd.Timestamp.now()
min_date = pd.Timestamp('2000-01-01')  # Reasonable cutoff

started = df['date_started']
print(f"Date range before filtering: {started.min()} to {started.max()}")
df = df[started.between(min_date, current_date)]
print(f"Date range after filtering: {df['date_started'].min()} to {df['date_started'].max()}")
print(f"Records after date filtering: {len(df):,}")

In [None]:
# Blank out ages outside the 0-120 range (already-missing ages are left alone)
# NOTE(review): assumes consumer_age is numeric and expressed in years - confirm against the loader
ages = df['consumer_age']
out_of_range = ages.notna() & ~ages.between(0, 120)
df.loc[out_of_range, 'consumer_age'] = np.nan

cleaned_ages = df['consumer_age']
print(f"Age range after cleaning: {cleaned_ages.min():.1f} to {cleaned_ages.max():.1f}")
print(f"Records with valid age: {cleaned_ages.notna().sum():,}")

In [None]:
# Standardize reactions (ensure all are lists)
def standardize_reactions(reactions):
    """
    Normalize a raw reactions value into a list of clean reaction names.

    Args:
        reactions: A list/tuple of reaction strings, a single string, or a
            missing value (None/NaN)

    Returns:
        List of stripped, lower-cased reaction names. Entries that are not
        strings, or that are empty after stripping, are dropped. Any input
        that is neither a string nor a list/tuple yields [].
    """
    # Type-check before anything else: pd.isna() on a list returns an
    # element-wise array, and using that array in an `if` raises ValueError
    # as soon as the list has more than one element.
    if isinstance(reactions, str):
        candidates = [reactions]
    elif isinstance(reactions, (list, tuple)):
        candidates = reactions
    else:
        # None, NaN and any other scalar carry no reaction terms
        return []

    stripped = (item.strip().lower() for item in candidates if isinstance(item, str))
    return [name for name in stripped if name]

df['reactions_clean'] = df['reactions'].map(standardize_reactions)

# Drop records whose cleaned reaction list is empty
print(f"Records before removing empty reactions: {len(df):,}")
has_reaction = df['reactions_clean'].map(len) > 0
df = df[has_reaction]
print(f"Records after removing empty reactions: {len(df):,}")

## 3. Feature Engineering

In [None]:
# Derive calendar features from the parsed date in a single pass
started_dt = df['date_started'].dt
df = df.assign(
    year=started_dt.year,
    month=started_dt.month,
    quarter=started_dt.quarter,
    day_of_week=started_dt.dayofweek,
    day_of_year=started_dt.dayofyear,
    # ISO week number; around New Year it can belong to a different ISO year than `year`
    week_of_year=started_dt.isocalendar().week,
    # Date-only value (no time component) for grouping
    date=started_dt.date,
)

print("Temporal features created:")
preview_columns = ['date_started', 'year', 'month', 'quarter', 'day_of_week', 'week_of_year']
print(df[preview_columns].head())

In [None]:
# Number of cleaned reaction terms listed on each record
df['num_reactions'] = df['reactions_clean'].map(len)

# Create outcome severity flags
def extract_outcome_severity(outcomes):
    """
    Derive boolean severity flags from a record's outcome terms.

    Args:
        outcomes: A list/tuple of outcome strings, a single string, or a
            missing value (None/NaN)

    Returns:
        Dict with boolean keys 'serious', 'hospitalization' and 'death',
        each True when at least one outcome term matches (case-insensitive
        substring match). All three are False for missing or unsupported input.
    """
    # Type-check first: pd.isna() on a list returns an element-wise array,
    # and using that array in an `if` raises ValueError for lists with more
    # than one element.
    if isinstance(outcomes, str):
        outcomes = [outcomes]
    elif not isinstance(outcomes, (list, tuple)):
        # None, NaN and any other scalar carry no outcome terms
        return {'serious': False, 'hospitalization': False, 'death': False}

    # Non-string entries cannot match any keyword, so they are skipped
    outcomes_lower = [o.lower() for o in outcomes if isinstance(o, str)]

    def mentions_serious(term):
        # A term such as "non-serious ..." contains the substring "serious"
        # but means the opposite, so negated forms are removed before matching.
        for negated in ('non-serious', 'non serious', 'nonserious'):
            term = term.replace(negated, '')
        return 'serious' in term

    return {
        'serious': any(mentions_serious(o) for o in outcomes_lower),
        'hospitalization': any('hospital' in o for o in outcomes_lower),
        'death': any('death' in o for o in outcomes_lower)
    }

# Compute the flag dict once per record, then fan it out into boolean columns
outcome_features = df['outcomes'].apply(extract_outcome_severity)
for flag_column, flag_key in (
    ('is_serious', 'serious'),
    ('is_hospitalization', 'hospitalization'),
    ('is_death', 'death'),
):
    df[flag_column] = outcome_features.apply(lambda flags, key=flag_key: flags[key])

print("\nOutcome severity distribution:")
for flag_label, flag_column in (
    ("Serious outcomes", 'is_serious'),
    ("Hospitalizations", 'is_hospitalization'),
    ("Deaths", 'is_death'),
):
    flagged = df[flag_column]
    print(f"{flag_label}: {flagged.sum():,} ({flagged.mean()*100:.1f}%)")

## 4. Create Time Series by Reaction Type

This is the core dataset for anomaly detection: we build one time series per reaction type, restricted to the most frequently reported reactions.

In [None]:
# Tally every reaction mention across all records
all_reactions = [name for names in df['reactions_clean'] for name in names]
reaction_counts = Counter(all_reactions)

top_n = 20  # Focus on top 20 reactions
ranked_reactions = reaction_counts.most_common(top_n)
top_reactions = [name for name, _ in ranked_reactions]

print(f"Focusing on top {top_n} reactions:")
for rank, (name, mentions) in enumerate(ranked_reactions, start=1):
    print(f"{rank:2d}. {name:40s} {mentions:8,}")

In [None]:
# Create time series for each reaction at different frequencies
def create_reaction_timeseries(df, reactions, freq='D'):
    """
    Count, per time bucket, how many records mention each reaction.

    Args:
        df: Source dataframe with a datetime `date_started` column and a
            `reactions_clean` column holding a list of reaction names per row
        reactions: List of reaction names
        freq: Frequency ('D'=daily, 'W'=weekly, 'M'=monthly)

    Returns:
        DataFrame indexed by the bucket labels pandas assigns for `freq`,
        with one integer column per reaction. Buckets without matching
        records hold 0. A record counts once per reaction even if the
        reaction is listed several times on it.
    """
    # Rows without a date cannot be assigned to a bucket
    dated = df[df['date_started'].notna()]
    reaction_lists = dated['reactions_clean'].tolist()

    # One 0/1 indicator column per reaction, indexed by the record's timestamp
    indicators = pd.DataFrame(
        {
            reaction: [int(reaction in listed) for listed in reaction_lists]
            for reaction in reactions
        },
        index=pd.DatetimeIndex(dated['date_started']),
    )

    # Resample every indicator in one pass so the index and the counts share
    # the same bucket edges and labels. Building the index separately with
    # pd.date_range(min, max, freq) loses the last partial week/month (its
    # label lies after `max`) and misaligns when timestamps carry a time of day.
    result = indicators.sort_index().resample(freq).sum()

    # Keep the index unnamed, as a plain date_range index would be
    result.index.name = None
    return result

# Build the per-day series for the top reactions
print("Creating daily time series...")
ts_daily = create_reaction_timeseries(df, top_reactions, freq='D')
daily_index = ts_daily.index
print(f"Shape: {ts_daily.shape}")
print(f"Date range: {daily_index.min()} to {daily_index.max()}")
print("\nSample:")
print(ts_daily.head())

In [None]:
# Build the per-week series (often better for outbreak detection)
print("Creating weekly time series...")
ts_weekly = create_reaction_timeseries(df, top_reactions, freq='W')
weekly_index = ts_weekly.index
print(f"Shape: {ts_weekly.shape}")
print(f"Date range: {weekly_index.min()} to {weekly_index.max()}")
print("\nSample:")
print(ts_weekly.head())

In [None]:
# One stacked panel per reaction for the five most frequent reactions
fig, axes = plt.subplots(5, 1, figsize=(15, 15))
week_labels = ts_weekly.index

for ax, reaction in zip(axes, top_reactions[:5]):
    weekly_counts = ts_weekly[reaction]
    ax.plot(week_labels, weekly_counts, linewidth=2)
    ax.set_title(f'Weekly Time Series: {reaction.title()}', fontweight='bold')
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3)
    # Shade the area under the line
    ax.fill_between(week_labels, weekly_counts, alpha=0.3)

# Only the bottom panel carries the shared x-axis label
axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

## 5. Statistical Features for Time Series

Calculate rolling statistics that will be useful for anomaly detection.

In [None]:
def add_rolling_features(ts_df, window=7):
    """
    Append trailing rolling mean, rolling std and z-score columns for every series.

    For each input column `c`, the columns `c_rolling_mean`, `c_rolling_std`
    and `c_zscore` are added. The input frame is not modified.

    Args:
        ts_df: Time series dataframe
        window: Rolling window size (number of rows, current row included)

    Returns:
        Copy of `ts_df` with the additional feature columns

    NOTE(review): the window includes the current observation, so for a full
    window of n rows |z| cannot exceed (n - 1) / sqrt(n) (about 2.27 for n=7).
    A 3-sigma threshold is therefore unreachable with the default window;
    confirm whether a baseline that excludes the current row is wanted.
    """
    featured = ts_df.copy()

    for name in ts_df.columns:
        series = ts_df[name]
        mean_key = f'{name}_rolling_mean'
        std_key = f'{name}_rolling_std'

        # min_periods=1 yields a mean from the first row on; the std of a
        # single observation is NaN, so the first std and z-score are NaN
        trailing = series.rolling(window=window, min_periods=1)
        featured[mean_key] = trailing.mean()
        featured[std_key] = trailing.std()

        # The small constant keeps the division finite when the window is constant
        featured[f'{name}_zscore'] = (series - featured[mean_key]) / (featured[std_key] + 1e-10)

    return featured

# Derive 7-row rolling features for the daily series
print("Adding rolling features to daily time series...")
ts_daily_featured = add_rolling_features(ts_daily, window=7)
base_column_count = ts_daily.shape[1]
featured_column_count = ts_daily_featured.shape[1]
print(f"Original columns: {base_column_count}")
print(f"With features: {featured_column_count}")

In [None]:
# Counts, rolling spread and z-score for the most frequent reaction
reaction_example = top_reactions[0]
example_title = reaction_example.title()
featured_dates = ts_daily_featured.index

fig, axes = plt.subplots(3, 1, figsize=(15, 10))
ax_count, ax_std, ax_z = axes

# Panel 1: raw daily counts with the rolling mean drawn on top
ax_count.plot(ts_daily.index, ts_daily[reaction_example], label='Actual Count', alpha=0.6)
ax_count.plot(featured_dates, ts_daily_featured[f'{reaction_example}_rolling_mean'],
              label='7-day Rolling Mean', linewidth=2, color='red')
ax_count.set_title(f'{example_title} - Count with Rolling Mean', fontweight='bold')
ax_count.set_ylabel('Count')
ax_count.legend()
ax_count.grid(True, alpha=0.3)

# Panel 2: rolling standard deviation
ax_std.plot(featured_dates, ts_daily_featured[f'{reaction_example}_rolling_std'],
            color='orange', linewidth=2)
ax_std.set_title(f'{example_title} - Rolling Standard Deviation', fontweight='bold')
ax_std.set_ylabel('Std Dev')
ax_std.grid(True, alpha=0.3)

# Panel 3: z-score with +/-3 reference lines and a zero baseline
ax_z.plot(featured_dates, ts_daily_featured[f'{reaction_example}_zscore'],
          color='purple', linewidth=1.5)
ax_z.axhline(y=3, color='red', linestyle='--', label='3σ threshold')
ax_z.axhline(y=-3, color='red', linestyle='--')
ax_z.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
ax_z.set_title(f'{example_title} - Z-Score', fontweight='bold')
ax_z.set_ylabel('Z-Score')
ax_z.set_xlabel('Date')
ax_z.legend()
ax_z.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Train/Test Split

For time series, we use temporal splits to avoid data leakage.

In [None]:
# Chronological hold-out: rows before the 80% position train, the rest test
split_date = ts_daily.index[int(len(ts_daily) * 0.8)]
daily_dates = ts_daily.index
ts_daily_train = ts_daily[daily_dates < split_date]
ts_daily_test = ts_daily[daily_dates >= split_date]

# Same rule applied to the weekly series, with its own cutoff
split_date_weekly = ts_weekly.index[int(len(ts_weekly) * 0.8)]
weekly_dates = ts_weekly.index
ts_weekly_train = ts_weekly[weekly_dates < split_date_weekly]
ts_weekly_test = ts_weekly[weekly_dates >= split_date_weekly]

for heading, train_part, test_part in (
    ("Daily time series split:", ts_daily_train, ts_daily_test),
    ("\nWeekly time series split:", ts_weekly_train, ts_weekly_test),
):
    print(heading)
    print(f"  Train: {train_part.index.min()} to {train_part.index.max()} ({len(train_part)} records)")
    print(f"  Test:  {test_part.index.min()} to {test_part.index.max()} ({len(test_part)} records)")

## 7. Save Preprocessed Data

In [None]:
import os
import json

# Make sure the output directory exists before writing anything
os.makedirs('../data/processed', exist_ok=True)

# Cleaned record-level data (row index not written)
df.to_parquet('../data/processed/cleaned_data.parquet', compression='snappy', index=False)
print("Saved: cleaned_data.parquet")

# Each entry: the (frame, path) pairs to write, then the message printed once they are written
save_plan = [
    ([(ts_daily, '../data/processed/timeseries_daily.parquet')],
     "Saved: timeseries_daily.parquet"),
    ([(ts_weekly, '../data/processed/timeseries_weekly.parquet')],
     "Saved: timeseries_weekly.parquet"),
    ([(ts_daily_train, '../data/processed/timeseries_daily_train.parquet'),
      (ts_daily_test, '../data/processed/timeseries_daily_test.parquet')],
     "Saved: daily train/test splits"),
    ([(ts_weekly_train, '../data/processed/timeseries_weekly_train.parquet'),
      (ts_weekly_test, '../data/processed/timeseries_weekly_test.parquet')],
     "Saved: weekly train/test splits"),
]
for targets, message in save_plan:
    for frame, path in targets:
        frame.to_parquet(path)
    print(message)

# Save list of top reactions for reference
with open('../data/processed/top_reactions.json', 'w') as f:
    json.dump(top_reactions, f, indent=2)
print("Saved: top_reactions.json")

print("\n✓ All preprocessed data saved successfully!")

## 8. Summary Statistics

In [None]:
# Recap of the cleaned data, the derived series and the planned follow-up work
banner = "=" * 80
print(banner)
print("PREPROCESSING SUMMARY")
print(banner)

print("\nCleaned Dataset:")
print(f"  Total records: {len(df):,}")
print(f"  Date range: {df['date_started'].min()} to {df['date_started'].max()}")
print(f"  Unique reactions: {len(reaction_counts):,}")
print(f"  Focus reactions: {len(top_reactions)}")

print("\nTime Series Data:")
print(f"  Daily time series: {ts_daily.shape}")
print(f"  Weekly time series: {ts_weekly.shape}")
print(f"  Daily train/test: {ts_daily_train.shape} / {ts_daily_test.shape}")
print(f"  Weekly train/test: {ts_weekly_train.shape} / {ts_weekly_test.shape}")

next_steps = [
    "Implement baseline anomaly detection (Z-score, moving average)",
    "Train Isolation Forest models",
    "Implement ARIMA/Prophet models",
    "Compare model performance",
    "Validate against known FDA recalls",
]
print("\nNext Steps:")
for position, step in enumerate(next_steps, start=1):
    print(f"  {position}. {step}")
print("\n" + banner)