# FDA Food Adverse Events - Data Exploration

This notebook explores the FDA's openFDA food adverse event reports dataset to understand:
- Dataset structure and quality
- Temporal patterns in adverse event reports
- Most common reactions and outcomes
- Consumer demographics
- Product categories involved

**Goal**: Identify patterns that will inform our anomaly detection approach for outbreak identification.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Configure display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

In [None]:
# Import our custom data loader
# '../src' is a relative path, so it is resolved against the kernel's current
# working directory when `data_loader` is imported.
# NOTE(review): presumably the notebook is always launched from its own folder — confirm.
import sys
sys.path.append('../src')
from data_loader import FDADataLoader

## 1. Load the Data

We'll start by loading a subset of the data to explore its structure. The full dataset contains 2.6M+ records, so we'll initially work with a manageable sample.

In [None]:
# Point the loader at the raw food-event JSON export (path is relative to the notebook)
data_path = '../data/raw/food-event-0001-of-0001.json'
loader = FDADataLoader(data_path)

# Cap the load at 50,000 records; raise or lower max_records as needed
print("Loading data sample...")
df = loader.load_to_dataframe(max_records=50_000)
print("Loaded {:,} records".format(len(df)))

In [None]:
# Overall dimensions first, then the per-column dtype / non-null summary
print(f"Dataset shape: {df.shape}")
print("\nColumn names and types:")
df.info()

In [None]:
# Preview the first ten rows of the loaded sample
df.head(n=10)

## 2. Data Quality Assessment

In [None]:
# Per-column null tally and its share of all rows, worst columns first
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isna().sum(),
    'Missing_Percentage': df.isna().sum().div(len(df)).mul(100).round(2),
}).sort_values(by='Missing_Percentage', ascending=False)

print("Missing Data Summary:")
# Only columns that actually have gaps are worth listing
print(missing_data.loc[missing_data['Missing_Count'].gt(0)])

In [None]:
# Horizontal bar chart of the (up to) 15 columns with the highest missing share
missing_cols = missing_data.query('Missing_Percentage > 0').head(15)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(missing_cols['Column'], missing_cols['Missing_Percentage'])
ax.set_xlabel('Missing Percentage (%)')
ax.set_title('Missing Data by Column (Top 15)')
fig.tight_layout()
plt.show()

In [None]:
# A report number seen more than once counts as a duplicate (first occurrence excluded)
duplicates = df.duplicated(subset='report_number').sum()
print("Duplicate report numbers: {:,}".format(duplicates))
print("Unique reports: {:,}".format(df['report_number'].nunique()))

## 3. Temporal Analysis

Understanding the time distribution of reports is crucial for outbreak detection.

In [None]:
# Parse the start date; values that cannot be parsed become NaT instead of raising
df['date_started'] = pd.to_datetime(df['date_started'], errors='coerce')

# Derive calendar features: (new column name, attribute on the .dt accessor)
for _column, _dt_attr in (
    ('year', 'year'),
    ('month', 'month'),
    ('day_of_week', 'dayofweek'),
    ('quarter', 'quarter'),
):
    df[_column] = getattr(df['date_started'].dt, _dt_attr)

print("Date range:")
print("Earliest: {}".format(df['date_started'].min()))
print("Latest: {}".format(df['date_started'].max()))
print("Missing dates: {:,}".format(df['date_started'].isna().sum()))

In [None]:
# Two stacked panels: report volume per calendar day and per calendar month
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
daily_ax, monthly_ax = axes

# Daily counts
daily_counts = df.groupby(df['date_started'].dt.date).size()
daily_ax.plot(daily_counts.index, daily_counts.values, alpha=0.7)
daily_ax.set_title('Daily Adverse Event Reports', fontsize=14, fontweight='bold')

# Monthly counts: group by monthly period, then convert the period index
# to timestamps so it plots on a date axis
monthly_counts = df.groupby(df['date_started'].dt.to_period('M')).size().to_timestamp()
monthly_ax.plot(monthly_counts.index, monthly_counts.values, marker='o', linewidth=2)
monthly_ax.set_title('Monthly Adverse Event Reports', fontsize=14, fontweight='bold')

# Both panels share the same axis labels and grid styling
for panel in (daily_ax, monthly_ax):
    panel.set_xlabel('Date')
    panel.set_ylabel('Number of Reports')
    panel.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Reports per calendar year, ordered chronologically
yearly_counts = df['year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(yearly_counts.index, yearly_counts.values, color='steelblue')
ax.set_title('Adverse Event Reports by Year', fontsize=14, fontweight='bold')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Reports')
plt.xticks(rotation=45)
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

print("\nYearly Report Counts:")
print(yearly_counts)

## 4. Reaction Analysis

Identifying the most common reactions will help us focus our anomaly detection efforts.

In [None]:
# Count reactions: each report holds either a list of reaction names or one string
from collections import Counter

# Flatten to one entry per reaction occurrence; values of any other type are skipped
all_reactions = [
    name
    for entry in df['reactions'].dropna()
    if isinstance(entry, (list, str))
    for name in (entry if isinstance(entry, list) else [entry])
]

reaction_counts = Counter(all_reactions)
top_reactions = pd.DataFrame(
    reaction_counts.most_common(30),
    columns=['Reaction', 'Count'],
)

print("Total unique reactions: {:,}".format(len(reaction_counts)))
print("Total reaction occurrences: {:,}".format(len(all_reactions)))
print("\nTop 30 Most Common Reactions:")
print(top_reactions)

In [None]:
# Visualize top reactions: bar chart of the top 20, pie chart of the top 10
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Left panel: horizontal bars for the 20 most frequent reactions
top_20 = top_reactions.head(20)
axes[0].barh(range(len(top_20)), top_20['Count'], color='coral')
axes[0].set_yticks(range(len(top_20)))
axes[0].set_yticklabels(top_20['Reaction'])
axes[0].invert_yaxis()  # most frequent reaction at the top
axes[0].set_xlabel('Number of Reports')
axes[0].set_title('Top 20 Most Common Adverse Reactions', fontsize=12, fontweight='bold')
axes[0].grid(True, alpha=0.3, axis='x')

# Right panel: pie of the top 10.
# 'Percentage' is each reaction's share of ALL reaction occurrences, whereas the
# wedge sizes (and autopct values) are normalised within the top 10 only. Show
# both, clearly labelled, so the wedge percentages are not mistaken for
# dataset-wide shares.
top_10 = top_reactions.head(10).copy()
top_10['Percentage'] = (top_10['Count'] / len(all_reactions) * 100).round(2)
pie_labels = [
    f"{reaction} ({share:.1f}% of all)"
    for reaction, share in zip(top_10['Reaction'], top_10['Percentage'])
]
colors = plt.cm.Set3(range(len(top_10)))
axes[1].pie(top_10['Count'], labels=pie_labels, autopct='%1.1f%%',
           startangle=90, colors=colors)
axes[1].set_title('Top 10 Reactions - Share Within Top 10', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Outcome Analysis

In [None]:
# Analyze outcomes: flatten list-or-string entries into one item per occurrence
all_outcomes = [
    label
    for entry in df['outcomes'].dropna()
    if isinstance(entry, (list, str))
    for label in (entry if isinstance(entry, list) else [entry])
]

outcome_counts = Counter(all_outcomes)
outcome_df = pd.DataFrame(
    outcome_counts.most_common(),
    columns=['Outcome', 'Count'],
)
# Share of each outcome among all recorded outcome occurrences
outcome_df['Percentage'] = outcome_df['Count'].div(len(all_outcomes)).mul(100).round(2)

print("Outcome Distribution:")
print(outcome_df)

In [None]:
# Bar chart of outcome frequencies, most common first (outcome_df is already sorted)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(outcome_df['Outcome'], outcome_df['Count'], color='teal')
ax.set_title('Distribution of Adverse Event Outcomes', fontsize=14, fontweight='bold')
ax.set_xlabel('Outcome Type')
ax.set_ylabel('Number of Occurrences')
plt.xticks(rotation=45, ha='right')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

## 6. Consumer Demographics

In [None]:
# Age analysis
# Coerce to numeric first: if the loader passes ages through as strings, the
# statistics below would fail or be meaningless. Non-numeric values become NaN
# and are dropped together with genuinely missing ages.
# NOTE(review): the labels below assume every age is expressed in years —
# confirm that the loader normalises other age units (months, days, ...).
age_data = pd.to_numeric(df['consumer_age'], errors='coerce').dropna()

print("Age statistics:")
print(f"Mean: {age_data.mean():.1f} years")
print(f"Median: {age_data.median():.1f} years")
print(f"Min: {age_data.min():.1f} years")
print(f"Max: {age_data.max():.1f} years")
print(f"Std: {age_data.std():.1f} years")

In [None]:
# Age distribution: histogram (left) and box plot (right), side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Histogram with 50 equal-width bins
hist_ax.hist(age_data, bins=50, color='skyblue', edgecolor='black')
hist_ax.set_title('Age Distribution of Consumers', fontweight='bold')
hist_ax.set_xlabel('Age (years)')
hist_ax.set_ylabel('Frequency')

# Box plot of the same values
box_ax.boxplot(age_data, vert=True)
box_ax.set_title('Age Distribution - Box Plot', fontweight='bold')
box_ax.set_ylabel('Age (years)')

# Light horizontal grid lines on both panels
for panel in (hist_ax, box_ax):
    panel.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
plt.show()

In [None]:
# Gender distribution: one pie wedge per distinct value (missing values are excluded)
gender_counts = df['consumer_gender'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    gender_counts.values,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['lightcoral', 'lightskyblue', 'lightgray'],
)
ax.set_title('Gender Distribution of Consumers', fontsize=14, fontweight='bold')
plt.show()

print("\nGender Counts:")
print(gender_counts)

## 7. Product Analysis

In [None]:
# Top products/brands involved
products = df['products'].dropna()

# Flatten list-or-string entries into one item per product mention;
# values of any other type are skipped
product_list = [
    item
    for entry in products
    if isinstance(entry, (list, str))
    for item in (entry if isinstance(entry, list) else [entry])
]

product_counts = Counter(product_list)
top_products = pd.DataFrame(
    product_counts.most_common(20),
    columns=['Product', 'Count'],
)

print("Unique products: {:,}".format(len(product_counts)))
print("\nTop 20 Products:")
print(top_products)

In [None]:
# Horizontal bar chart of the most frequently reported products
fig, ax = plt.subplots(figsize=(12, 8))
bar_positions = range(len(top_products))
ax.barh(bar_positions, top_products['Count'], color='mediumseagreen')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_products['Product'])
ax.invert_yaxis()  # highest count at the top
ax.set_xlabel('Number of Reports')
ax.set_title('Top 20 Products in Adverse Event Reports', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

## 8. Time Series by Reaction Type

This is crucial for outbreak detection - we want to see how specific reactions trend over time.

In [None]:
# Create a function to plot time series for specific reactions
def plot_reaction_timeseries(df, reactions, freq='W'):
    """
    Plot one report-count time series per reaction, stacked vertically.

    Args:
        df: DataFrame with a datetime 'date_started' column and a 'reactions'
            column whose entries are either a list of reaction names or a
            single reaction string.
        reactions: List of reaction names to plot. A single name passed as a
            plain string is treated as a one-element list.
        freq: Frequency for aggregation ('D'=daily, 'W'=weekly, 'M'=monthly)

    Returns:
        None. The figure is rendered with plt.show(). When `reactions` is
        empty, a short message is printed and nothing is drawn.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(reactions, str):
        reactions = [reactions]
    reactions = list(reactions)

    # plt.subplots() rejects zero rows, so bail out early with a clear message
    if not reactions:
        print("No reactions to plot.")
        return

    # squeeze=False always returns a 2-D array of axes, so the single-reaction
    # case needs no special handling
    fig, axes = plt.subplots(
        len(reactions), 1, figsize=(15, 4 * len(reactions)), squeeze=False
    )
    axes = axes.ravel()

    for ax, reaction in zip(axes, reactions):
        # Filter for this reaction: membership test for list entries, equality
        # for scalar entries. Forcing bool dtype keeps the mask a valid boolean
        # indexer even when `df` has no rows.
        reaction_mask = df['reactions'].apply(
            lambda x: reaction in x if isinstance(x, list) else reaction == x
        ).astype(bool)

        # Aggregate by time period
        ts = (
            df.loc[reaction_mask]
            .groupby(pd.Grouper(key='date_started', freq=freq))
            .size()
        )

        # Plot the line with a light fill underneath
        ax.plot(ts.index, ts.values, linewidth=2)
        ax.fill_between(ts.index, ts.values, alpha=0.3)
        ax.set_title(f'Time Series: {reaction}', fontweight='bold', fontsize=12)
        ax.set_ylabel('Count')
        ax.grid(True, alpha=0.3)

    # Only the bottom panel carries the x-axis label
    axes[-1].set_xlabel('Date')
    plt.tight_layout()
    plt.show()

# Weekly time series for the five most frequently reported reactions
top_5_reactions = list(top_reactions['Reaction'].iloc[:5])
plot_reaction_timeseries(df, reactions=top_5_reactions, freq='W')

## 9. Key Insights Summary

In [None]:
# Recap of the exploration. Figures that can be derived from the data are
# computed here; the remaining bullets are qualitative observations.
print("=" * 80)
print("KEY INSIGHTS FROM EXPLORATION")
print("=" * 80)
print("\n1. DATASET SIZE")
print(f"   - Total records analyzed: {len(df):,}")
print(f"   - Date range: {df['date_started'].min()} to {df['date_started'].max()}")
print(f"   - Unique reports: {df['report_number'].nunique():,}")

print("\n2. REACTIONS")
print(f"   - Unique reaction types: {len(reaction_counts):,}")
# Guard against an empty reaction table (same approach as the outcomes section
# below); indexing row 0 of an empty frame would raise IndexError.
if len(top_reactions) > 0:
    most_common_reaction = top_reactions.iloc[0]
    # Share of all reaction occurrences covered by the five most common reactions
    top_5_share = top_reactions.head(5)['Count'].sum() / len(all_reactions) * 100
    print(f"   - Most common: {most_common_reaction['Reaction']} ({most_common_reaction['Count']:,} reports)")
    print(f"   - Top 5 reactions account for {top_5_share:.1f}% of reaction occurrences")
else:
    print("   - No reactions found in the loaded sample")

print("\n3. TEMPORAL PATTERNS")
print("   - Reports appear to have seasonal/temporal variation")
print("   - Some spikes visible in time series plots")
print("   - Further investigation needed for anomaly detection")

print("\n4. OUTCOMES")
if len(outcome_df) > 0:
    print(f"   - Most common outcome: {outcome_df.iloc[0]['Outcome']} ({outcome_df.iloc[0]['Percentage']:.1f}%)")
    print("   - Serious outcomes present in dataset")

print("\n5. NEXT STEPS")
print("   - Implement data preprocessing pipeline")
print("   - Focus on top 10-20 reactions for anomaly detection")
print("   - Develop baseline statistical models")
print("   - Implement machine learning anomaly detectors")
print("\n" + "=" * 80)

## 10. Save Processed Data (Optional)

Save the explored dataset for use in subsequent notebooks.

In [None]:
# Save to processed data folder
from pathlib import Path

output_path = '../data/processed/explored_sample.parquet'

# Create the target folder if needed; to_parquet does not create missing directories
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

df.to_parquet(output_path, compression='snappy', index=False)
print(f"Saved explored data to: {output_path}")

# Report the real on-disk size of the compressed file; the in-memory footprint
# of the DataFrame is a different number and is shown separately.
print(f"File size: {Path(output_path).stat().st_size / 1024**2:.2f} MB")
print(f"In-memory size: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")