In [16]:
# notebooks/04_forecasting.ipynb
import pandas as pd
import numpy as np
from datetime import datetime
import joblib
from sklearn.preprocessing import LabelEncoder
import os

# Create necessary directories if they don't exist
os.makedirs('../data/processed', exist_ok=True)

# Load data and models
historical_df = pd.read_csv('../data/processed/cleaned_disaster_data.csv')
model = joblib.load('../models/disaster_predictor.pkl')
location_encoder = joblib.load('../models/location_encoder.pkl')
disaster_encoder = joblib.load('../models/disaster_encoder.pkl')
season_encoder = joblib.load('../models/season_encoder.pkl')

# Try to load disaster mapping, if it doesn't exist, create it from model classes
try:
    disaster_mapping = joblib.load('../models/disaster_mapping.pkl')
    print("Loaded existing disaster mapping")
except FileNotFoundError:
    print("Disaster mapping not found, creating from model classes")
    disaster_mapping = {i: disaster_type for i, disaster_type in enumerate(model.classes_)}
    joblib.dump(disaster_mapping, '../models/disaster_mapping.pkl')
    print("Created and saved new disaster mapping")

# Check what seasons were in the training data
print("Seasons in training data:", season_encoder.classes_)
print("Model classes:", model.classes_)
print("Disaster mapping:", disaster_mapping)

# Create a manual season mapping dictionary
season_mapping = {
    'Spring': 0,
    'Summer': 1,
    'Fall': 2,
    'Winter': 3
}

# Forecasting Functions (included directly in the notebook)
def generate_future_forecast(historical_data, model, location_encoder, disaster_encoder, season_encoder, 
                            start_year=2024, end_year=2028):
    """
    Generate disaster forecasts for the next 5 years (2024-2028) for each month and location
    """
    # Get unique locations
    locations = historical_data['location'].unique()
    
    # Create future dates
    future_dates = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            future_dates.append({
                'year': year,
                'month': month,
                'day': 15,  # Mid-month representation
                'date': pd.to_datetime(f'{year}-{month}-15')
            })
    
    # Create DataFrame for all location-date combinations
    forecast_data = []
    for date_info in future_dates:
        for location in locations:
            record = {
                'location': location,
                'year': date_info['year'],
                'month': date_info['month'],
                'day': date_info['day'],
                'date': date_info['date']
            }
            forecast_data.append(record)
    
    forecast_df = pd.DataFrame(forecast_data)
    
    # Add day of week
    forecast_df['day_of_week'] = forecast_df['date'].dt.dayofweek
    
    # Apply feature engineering
    forecast_df['location_encoded'] = location_encoder.transform(forecast_df['location'])
    
    # Create season
    forecast_df['season'] = forecast_df['month'].apply(lambda x: 
        'Spring' if x in [3,4,5] else
        'Summer' if x in [6,7,8] else
        'Fall' if x in [9,10,11] else 'Winter')
    
    # Encode season using manual mapping instead of the fitted encoder
    forecast_df['season_encoded'] = forecast_df['season'].map(season_mapping)
    
    # Calculate location risk from historical data
    location_risk = historical_data.groupby('location').size() / len(historical_data)
    forecast_df['location_risk'] = forecast_df['location'].map(location_risk)
    
    # Calculate disaster frequency from historical data
    disaster_freq = historical_data.groupby('disaster_type').size() / len(historical_data)
    # Use average disaster frequency for all predictions
    avg_disaster_freq = disaster_freq.mean()
    forecast_df['disaster_freq'] = avg_disaster_freq
    
    # Prepare features for prediction
    features = ['year', 'month', 'day', 'day_of_week', 'location_encoded', 
                'season_encoded', 'location_risk', 'disaster_freq']
    
    # Make predictions
    X_forecast = forecast_df[features]
    forecast_df['disaster_encoded'] = model.predict(X_forecast)
    
    # Get prediction probabilities
    proba = model.predict_proba(X_forecast)
    
    # Use model's classes_ attribute to get disaster classes
    disaster_classes = model.classes_
    
    # Add probability columns for each disaster type
    for i, disaster in enumerate(disaster_classes):
        if i < proba.shape[1]:  # Make sure we don't exceed the array bounds
            forecast_df[f'prob_{disaster}'] = proba[:, i]
        else:
            print(f"Warning: Skipping {disaster} due to index out of bounds")
    
    # Convert encoded disaster type back to string using the disaster mapping
    # This ensures we get actual disaster type names instead of numbers
    forecast_df['disaster_type'] = forecast_df['disaster_encoded'].map(disaster_mapping)
    
    # Select relevant columns
    prob_cols = [f'prob_{disaster}' for disaster in disaster_classes if f'prob_{disaster}' in forecast_df.columns]
    result_df = forecast_df[['location', 'year', 'month', 'disaster_type'] + prob_cols]
    
    return result_df

def aggregate_forecast_by_location(forecast_df):
    """
    Aggregate forecast data by location to show most likely disasters
    """
    # Group by location and get most common disaster type
    location_summary = forecast_df.groupby('location')['disaster_type'].apply(
        lambda x: x.mode()[0] if len(x) > 0 else 'Unknown'
    ).reset_index(name='most_likely_disaster')
    
    # Calculate disaster type percentages by location
    disaster_pct = forecast_df.groupby(['location', 'disaster_type']).size() / \
                  forecast_df.groupby('location').size() * 100
    disaster_pct = disaster_pct.reset_index(name='percentage')
    
    # Pivot for easier visualization
    disaster_pivot = disaster_pct.pivot(index='location', columns='disaster_type', values='percentage').fillna(0)
    
    return location_summary, disaster_pivot

def save_forecast_results(forecast_df, location_summary, disaster_pivot, output_dir):
    """
    Save forecast results to CSV files
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Save results
    forecast_df.to_csv(os.path.join(output_dir, 'forecast_results.csv'), index=False)
    location_summary.to_csv(os.path.join(output_dir, 'location_forecast_summary.csv'), index=False)
    disaster_pivot.to_csv(os.path.join(output_dir, 'disaster_percentages.csv'), index=True)
    
    print("Forecast results saved successfully!")

# Generate forecast
print("Generating 5-year forecast (2024-2028)...")
forecast_df = generate_future_forecast(
    historical_df, model, location_encoder, disaster_encoder, season_encoder
)

# Aggregate results
location_summary, disaster_pivot = aggregate_forecast_by_location(forecast_df)

# Save results
save_forecast_results(forecast_df, location_summary, disaster_pivot, '../data/processed')

print("Forecast complete. Results saved.")
print(f"Generated {len(forecast_df)} forecast records")
print(f"Covering {len(forecast_df['location'].unique())} locations")
print(f"Forecast period: {forecast_df['year'].min()} to {forecast_df['year'].max()}")

# Display sample of results
print("\nSample forecast data:")
print(forecast_df.head())

print("\nLocation summary sample:")
print(location_summary.head())

print("\nDisaster percentages sample:")
print(disaster_pivot.head())

# Verify disaster types are strings, not numbers
print("\nVerifying disaster types are strings:")
print("Unique disaster types in forecast:", forecast_df['disaster_type'].unique())
print("Data type of disaster_type column:", forecast_df['disaster_type'].dtype)

# Check if any disaster types are still numbers
numeric_disasters = forecast_df[forecast_df['disaster_type'].apply(lambda x: isinstance(x, (int, np.int64, np.int32)))]
if len(numeric_disasters) > 0:
    print("\nWARNING: Found numeric disaster types:")
    print(numeric_disasters['disaster_type'].value_counts())
else:
    print("\nSUCCESS: All disaster types are strings!")

Loaded existing disaster mapping
Seasons in training data: ['Fall' 'Spring' 'Summer' 'Winter']
Model classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20]
Disaster mapping: {0: np.int64(0), 1: np.int64(1), 2: np.int64(2), 3: np.int64(3), 4: np.int64(4), 5: np.int64(5), 6: np.int64(6), 7: np.int64(7), 8: np.int64(8), 9: np.int64(9), 10: np.int64(10), 11: np.int64(11), 12: np.int64(12), 13: np.int64(13), 14: np.int64(14), 15: np.int64(15), 16: np.int64(17), 17: np.int64(18), 18: np.int64(19), 19: np.int64(20)}
Generating 5-year forecast (2024-2028)...
Forecast results saved successfully!
Forecast complete. Results saved.
Generated 28500 forecast records
Covering 475 locations
Forecast period: 2024 to 2028

Sample forecast data:
        location  year  month  disaster_type  prob_0  prob_1  prob_2  prob_3  \
0       Location  2024      1            6.0    0.13    0.04    0.05    0.02   
1         ESIAMA  2024      1            NaN    0.16    0.03    0.03    0.00   
2  AGON

In [18]:
# notebooks/04_forecasting.ipynb
import pandas as pd
import numpy as np
from datetime import datetime
import joblib
from sklearn.preprocessing import LabelEncoder
import os

# Create necessary directories if they don't exist
os.makedirs('../data/processed', exist_ok=True)

# Load data and models
historical_df = pd.read_csv('../data/processed/cleaned_disaster_data.csv')
model = joblib.load('../models/disaster_predictor.pkl')
location_encoder = joblib.load('../models/location_encoder.pkl')
disaster_encoder = joblib.load('../models/disaster_encoder.pkl')
season_encoder = joblib.load('../models/season_encoder.pkl')

# Check what seasons were in the training data
print("Seasons in training data:", season_encoder.classes_)
print("Model classes:", model.classes_)
print("Model classes length:", len(model.classes_))
print("Disaster encoder classes:", disaster_encoder.classes_)
print("Disaster encoder classes length:", len(disaster_encoder.classes_))

# Create a manual season mapping dictionary
season_mapping = {
    'Spring': 0,
    'Summer': 1,
    'Fall': 2,
    'Winter': 3
}

# Create a proper disaster mapping from the disaster encoder classes
# We'll map each integer index in the disaster encoder to the corresponding disaster type
disaster_mapping = {i: disaster_type for i, disaster_type in enumerate(disaster_encoder.classes_)}
print("Proper disaster mapping:", disaster_mapping)

# Save the correct disaster mapping
joblib.dump(disaster_mapping, '../models/disaster_mapping.pkl')

# Forecasting Functions (included directly in the notebook)
def generate_future_forecast(historical_data, model, location_encoder, disaster_encoder, season_encoder, 
                            start_year=2024, end_year=2028):
    """
    Generate disaster forecasts for the next 5 years (2024-2028) for each month and location
    """
    # Get unique locations
    locations = historical_data['location'].unique()
    
    # Create future dates
    future_dates = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            future_dates.append({
                'year': year,
                'month': month,
                'day': 15,  # Mid-month representation
                'date': pd.to_datetime(f'{year}-{month}-15')
            })
    
    # Create DataFrame for all location-date combinations
    forecast_data = []
    for date_info in future_dates:
        for location in locations:
            record = {
                'location': location,
                'year': date_info['year'],
                'month': date_info['month'],
                'day': date_info['day'],
                'date': date_info['date']
            }
            forecast_data.append(record)
    
    forecast_df = pd.DataFrame(forecast_data)
    
    # Add day of week
    forecast_df['day_of_week'] = forecast_df['date'].dt.dayofweek
    
    # Apply feature engineering
    forecast_df['location_encoded'] = location_encoder.transform(forecast_df['location'])
    
    # Create season
    forecast_df['season'] = forecast_df['month'].apply(lambda x: 
        'Spring' if x in [3,4,5] else
        'Summer' if x in [6,7,8] else
        'Fall' if x in [9,10,11] else 'Winter')
    
    # Encode season using manual mapping instead of the fitted encoder
    forecast_df['season_encoded'] = forecast_df['season'].map(season_mapping)
    
    # Calculate location risk from historical data
    location_risk = historical_data.groupby('location').size() / len(historical_data)
    forecast_df['location_risk'] = forecast_df['location'].map(location_risk)
    
    # Calculate disaster frequency from historical data
    disaster_freq = historical_data.groupby('disaster_type').size() / len(historical_data)
    # Use average disaster frequency for all predictions
    avg_disaster_freq = disaster_freq.mean()
    forecast_df['disaster_freq'] = avg_disaster_freq
    
    # Prepare features for prediction
    features = ['year', 'month', 'day', 'day_of_week', 'location_encoded', 
                'season_encoded', 'location_risk', 'disaster_freq']
    
    # Make predictions
    X_forecast = forecast_df[features]
    forecast_df['disaster_encoded'] = model.predict(X_forecast)
    
    # Get prediction probabilities
    proba = model.predict_proba(X_forecast)
    
    # Use model's classes_ attribute to get disaster classes
    disaster_classes = model.classes_
    
    # Add probability columns for each disaster type
    for i, disaster in enumerate(disaster_classes):
        if i < proba.shape[1]:  # Make sure we don't exceed the array bounds
            forecast_df[f'prob_{disaster}'] = proba[:, i]
        else:
            print(f"Warning: Skipping {disaster} due to index out of bounds")
    
    # Convert encoded disaster type back to string using the disaster mapping
    # This ensures we get actual disaster type names instead of numbers
    forecast_df['disaster_type'] = forecast_df['disaster_encoded'].map(disaster_mapping)
    
    # Select relevant columns
    prob_cols = [f'prob_{disaster}' for disaster in disaster_classes if f'prob_{disaster}' in forecast_df.columns]
    result_df = forecast_df[['location', 'year', 'month', 'disaster_type'] + prob_cols]
    
    return result_df

def aggregate_forecast_by_location(forecast_df):
    """
    Aggregate forecast data by location to show most likely disasters
    """
    # Group by location and get most common disaster type
    location_summary = forecast_df.groupby('location')['disaster_type'].apply(
        lambda x: x.mode()[0] if len(x) > 0 else 'Unknown'
    ).reset_index(name='most_likely_disaster')
    
    # Calculate disaster type percentages by location
    disaster_pct = forecast_df.groupby(['location', 'disaster_type']).size() / \
                  forecast_df.groupby('location').size() * 100
    disaster_pct = disaster_pct.reset_index(name='percentage')
    
    # Pivot for easier visualization
    disaster_pivot = disaster_pct.pivot(index='location', columns='disaster_type', values='percentage').fillna(0)
    
    return location_summary, disaster_pivot

def save_forecast_results(forecast_df, location_summary, disaster_pivot, output_dir):
    """
    Save forecast results to CSV files
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Save results
    forecast_df.to_csv(os.path.join(output_dir, 'forecast_results.csv'), index=False)
    location_summary.to_csv(os.path.join(output_dir, 'location_forecast_summary.csv'), index=False)
    disaster_pivot.to_csv(os.path.join(output_dir, 'disaster_percentages.csv'), index=True)
    
    print("Forecast results saved successfully!")

# Generate forecast
print("Generating 5-year forecast (2024-2028)...")
forecast_df = generate_future_forecast(
    historical_df, model, location_encoder, disaster_encoder, season_encoder
)

# Aggregate results
location_summary, disaster_pivot = aggregate_forecast_by_location(forecast_df)

# Save results
save_forecast_results(forecast_df, location_summary, disaster_pivot, '../data/processed')

print("Forecast complete. Results saved.")
print(f"Generated {len(forecast_df)} forecast records")
print(f"Covering {len(forecast_df['location'].unique())} locations")
print(f"Forecast period: {forecast_df['year'].min()} to {forecast_df['year'].max()}")

# Display sample of results
print("\nSample forecast data:")
print(forecast_df.head())

print("\nLocation summary sample:")
print(location_summary.head())

print("\nDisaster percentages sample:")
print(disaster_pivot.head())

# Verify disaster types are strings, not numbers
print("\nVerifying disaster types are strings:")
print("Unique disaster types in forecast:", sorted(forecast_df['disaster_type'].unique()))
print("Data type of disaster_type column:", forecast_df['disaster_type'].dtype)

# Check if any disaster types are still numbers
numeric_disasters = forecast_df[forecast_df['disaster_type'].apply(lambda x: isinstance(x, (int, np.int64, np.int32, float)))]
if len(numeric_disasters) > 0:
    print("\nWARNING: Found numeric disaster types:")
    print(numeric_disasters['disaster_type'].value_counts())
else:
    print("\nSUCCESS: All disaster types are strings!")

Seasons in training data: ['Fall' 'Spring' 'Summer' 'Winter']
Model classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20]
Model classes length: 20
Disaster encoder classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20]
Disaster encoder classes length: 20
Proper disaster mapping: {0: np.int64(0), 1: np.int64(1), 2: np.int64(2), 3: np.int64(3), 4: np.int64(4), 5: np.int64(5), 6: np.int64(6), 7: np.int64(7), 8: np.int64(8), 9: np.int64(9), 10: np.int64(10), 11: np.int64(11), 12: np.int64(12), 13: np.int64(13), 14: np.int64(14), 15: np.int64(15), 16: np.int64(17), 17: np.int64(18), 18: np.int64(19), 19: np.int64(20)}
Generating 5-year forecast (2024-2028)...
Forecast results saved successfully!
Forecast complete. Results saved.
Generated 28500 forecast records
Covering 475 locations
Forecast period: 2024 to 2028

Sample forecast data:
        location  year  month  disaster_type  prob_0  prob_1  prob_2  prob_3  \
0       Location  2024      1            6.0