# Phase 1: Data Exploration - INFORM Severity Index

This notebook performs Phase 1 exploration of the INFORM Severity Index crisis-level data.

## Steps:
1. Load and Inspect Data
2. Understand Time Structure
3. Create Date Column

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for better-looking plots.
# The matplotlib style sheet is applied first and the seaborn palette second;
# both calls change global plotting defaults, so keep them in this order.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# matplotlib >= 3.6 — confirm against the environment this notebook runs in.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

## Step 1.1: Load and Inspect Data

In [None]:
# Load the combined INFORM Severity export (path is relative to the
# notebook's working directory)
df = pd.read_csv("inform_severity_combined.csv")

# Basic inspection
print(f"Total rows: {len(df)}")

# 'year' is not guaranteed to be numeric: the date-building step later in this
# notebook treats values such as 'unknown_year' as invalid. Taking min/max of
# the raw column would then compare strings lexicographically (or raise on
# mixed int/str values), so coerce to numbers first and ignore the rest.
year_numeric = pd.to_numeric(df['year'], errors='coerce')
if year_numeric.notna().any():
    print(f"Date range: {int(year_numeric.min())}-{int(year_numeric.max())}")
else:
    print("Date range: no numeric year values found")

# Make the coercion visible instead of silently dropping rows from the range
n_bad_years = int(year_numeric.isna().sum())
if n_bad_years:
    print(f"Rows with missing or non-numeric year: {n_bad_years}")

print(f"\nColumns:")
print(df.columns.tolist())

# Number of distinct crises and countries present in the file
print(f"\nUnique crises: {df['CRISIS ID'].nunique()}")
print(f"Unique countries: {df['ISO3'].nunique()}")

# First few rows
print("\nFirst few rows:")
print(df.head())

In [None]:
# Quantify gaps in the INFORM Severity Index column
severity = df['INFORM Severity Index']
n_missing = severity.isna().sum()

print("Missing values in INFORM Severity Index:")
print(n_missing)
print(f"Percentage missing: {n_missing / len(df) * 100:.2f}%")

# Descriptive statistics (count, mean, std, quartiles, ...) of the index
print("\nSummary statistics for INFORM Severity Index:")
print(severity.describe())

## Step 1.2: Understand Time Structure

In [None]:
# Count how many rows (snapshots) each crisis contributes
crisis_counts = (
    df.groupby('CRISIS ID')
    .size()
    .reset_index(name='n_observations')
)
n_obs = crisis_counts['n_observations']

# Headline figures: crises seen exactly once, at least twice, at least 5 times
exactly_one = (n_obs == 1).sum()
two_or_more = (n_obs >= 2).sum()
five_or_more = (n_obs >= 5).sum()

print(f"Crises with 1 observation: {exactly_one}")
print(f"Crises with 2+ observations: {two_or_more}")
print(f"Crises with 5+ observations: {five_or_more}")
print(f"Max observations for one crisis: {n_obs.max()}")

# Descriptive statistics of the per-crisis counts
print("\nSummary of observations per crisis:")
print(n_obs.describe())

# Text banner explaining how to read the histogram below
banner_rule = "=" * 60
print("\n" + banner_rule)
print("TEMPORAL COVERAGE HISTOGRAM")
print(banner_rule)
print("This histogram shows: HOW MANY MONTHLY OBSERVATIONS each crisis has")
print("X-axis: Number of monthly snapshots per crisis")
print("Y-axis: Number of crises with that many observations")
print(banner_rule)

# Histogram of per-crisis observation counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(n_obs, bins=30, edgecolor='black', alpha=0.7)
ax.set_xlabel('Number of monthly observations per crisis', fontsize=12)
ax.set_ylabel('Number of crises', fontsize=12)
ax.set_title(
    'Temporal Coverage Distribution\n(How many time points does each crisis have?)',
    fontsize=14,
)
ax.grid(True, alpha=0.3)
fig.savefig("person2_temporal_coverage.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Show some examples of crises with multiple observations
print("Examples of crises with multiple time points:")
multi_obs_crises = crisis_counts[crisis_counts['n_observations'] >= 5].head(10)
print(multi_obs_crises)

# Show a sample crisis with multiple observations
if len(multi_obs_crises) > 0:
    sample_crisis_id = multi_obs_crises['CRISIS ID'].iloc[0]
    print(f"\nSample crisis {sample_crisis_id} over time:")

    # 'month' holds month *names*, so sorting on it directly orders the rows
    # alphabetically (april, august, december, ...) rather than by calendar.
    # Sort on a temporary calendar-month rank instead. The two non-standard
    # labels are the ones this notebook already recognises when it builds the
    # date column; any other unrecognised label ranks as NaN and is placed
    # last within its year.
    _month_rank = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12,
        'inform_severity_mid_december': 12, 'late_november': 11,
    }
    sample_crisis = (
        df.loc[
            df['CRISIS ID'] == sample_crisis_id,
            ['CRISIS ID', 'COUNTRY', 'month', 'year', 'INFORM Severity Index'],
        ]
        .assign(
            _month_rank=lambda d: d['month'].astype(str).str.lower().str.strip().map(_month_rank)
        )
        .sort_values(['year', '_month_rank'])
        .drop(columns='_month_rank')  # helper column is for ordering only
    )
    print(sample_crisis)

## Step 1.3: Create Date Column

In [None]:
# Lookup table: lowercase English month name -> calendar month number (1-12)
month_map = dict(zip(
    [
        'january', 'february', 'march', 'april',
        'may', 'june', 'july', 'august',
        'september', 'october', 'november', 'december',
    ],
    range(1, 13),
))

# Normalise the raw labels (string-cast, lowercased, whitespace-trimmed) and
# fold the two known non-standard labels onto their calendar month:
#   'inform_severity_mid_december' -> 'december'
#   'late_november'                -> 'november'
df['month_clean'] = (
    df['month']
    .astype(str)
    .str.lower()
    .str.strip()
    .replace({
        'inform_severity_mid_december': 'december',
        'late_november': 'november',
    })
)

# Translate cleaned names to numbers; anything not in month_map becomes NaN
df['month_num'] = df['month_clean'].map(month_map)

# Report the raw labels that could not be translated
unmapped = df.loc[df['month_num'].isna(), 'month'].unique()
if len(unmapped) > 0:
    n_unmapped_rows = df['month_num'].isna().sum()
    print(f"Warning: Found unmapped month values: {unmapped}")
    print("These rows will have NaN for month_num")
    print(f"Count of rows with unmapped months: {n_unmapped_rows}")

# Numeric year; non-numeric entries (e.g. 'unknown_year') are coerced to NaN
df['year_clean'] = pd.to_numeric(df['year'], errors='coerce')

# Build first-of-month timestamps only where both parts are usable;
# every other row keeps the NaT it is initialised with
df['date'] = pd.NaT
valid_mask = df['year_clean'].notna() & df['month_num'].notna()
date_parts = (
    df.loc[valid_mask, ['year_clean', 'month_num']]
    .rename(columns={'year_clean': 'year', 'month_num': 'month'})
    .assign(day=1)
)
df.loc[valid_mask, 'date'] = pd.to_datetime(date_parts, errors='coerce')

# Order rows by crisis, then chronologically within each crisis
df = df.sort_values(by=['CRISIS ID', 'date'])

print("Sample data with date column:")
preview_columns = ['CRISIS ID', 'COUNTRY', 'month', 'year', 'date', 'INFORM Severity Index']
print(df[preview_columns].head(20))

In [None]:
# Span and cardinality of the constructed date column (NaT is ignored)
date_col = df['date']
print(f"Date range: {date_col.min()} to {date_col.max()}")
print(f"Total unique dates: {date_col.nunique()}")

# Rows for which no date could be built
invalid_dates = df.loc[date_col.isna()]
if not invalid_dates.empty:
    n_invalid = len(invalid_dates)
    print(f"\nWarning: {n_invalid} rows have invalid dates")
    print("Sample of rows with invalid dates:")
    print(invalid_dates[['CRISIS ID', 'COUNTRY', 'month', 'year']].head())