In [3]:

import pandas as pd
import numpy as np
from datetime import datetime
import joblib
from sklearn.preprocessing import LabelEncoder
import os


# Make sure the output directory exists before any results are written to it.
os.makedirs('../data/processed', exist_ok=True)


# Load the cleaned historical records together with the trained model and the
# label encoders that were persisted next to it.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load artifacts
# that come from a trusted source.
historical_df = pd.read_csv('../data/processed/cleaned_disaster_data.csv')
model = joblib.load('../models/disaster_predictor.pkl')
location_encoder = joblib.load('../models/location_encoder.pkl')
disaster_encoder = joblib.load('../models/disaster_encoder.pkl')
season_encoder = joblib.load('../models/season_encoder.pkl')


disaster_names = historical_df['disaster_type'].unique()
print("Disaster names in historical data:", disaster_names)


# Kept for inspection; the mapping itself is derived from the encoder below.
sorted_disaster_names = sorted(disaster_names)

# Integer code -> disaster name, taken from the encoder that was loaded with
# the model. A LabelEncoder assigns code i to classes_[i], so this is the
# authoritative decoding of the model's integer classes; rebuilding it from
# the historical file only agrees when that file has exactly the same labels.
disaster_mapping = {code: str(name) for code, name in enumerate(disaster_encoder.classes_)}
print("Created disaster mapping:", disaster_mapping)


joblib.dump(disaster_mapping, '../models/disaster_mapping.pkl')
print("Saved disaster mapping")


print("Seasons in training data:", season_encoder.classes_)
print("Model classes:", model.classes_)


# Season name -> integer code, taken from the season encoder rather than
# hard-coded. The encoder's classes are in sorted order (Fall, Spring, Summer,
# Winter), so a hand-written Spring=0/Summer=1/Fall=2 table feeds the model
# season codes that differ from the ones the encoder produces.
season_mapping = {str(name): code for code, name in enumerate(season_encoder.classes_)}


def generate_future_forecast(historical_data, model, location_encoder, disaster_encoder, season_encoder, 
                            start_year=2024, end_year=2028):
    """
    Generate monthly disaster forecasts for every known location.

    One row is built for the 15th of each month from ``start_year`` through
    ``end_year`` (inclusive) for each distinct ``location`` found in
    ``historical_data``. The rows are turned into the model's feature columns,
    scored, and returned with the predicted disaster name and one probability
    column per model class.

    Args:
        historical_data: DataFrame with at least ``location`` and
            ``disaster_type`` columns; used for the list of locations and for
            the ``location_risk`` / ``disaster_freq`` features.
        model: fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        location_encoder: fitted encoder whose ``transform`` maps location
            names to the integer codes the model expects.
        disaster_encoder: fitted encoder whose ``classes_[i]`` is the disaster
            name for integer class ``i``.
        season_encoder: fitted encoder whose ``transform`` maps season names
            ('Spring', 'Summer', 'Fall', 'Winter') to integer codes.
        start_year: first forecast year (inclusive).
        end_year: last forecast year (inclusive).

    Returns:
        A new DataFrame with columns ``location``, ``year``, ``month``,
        ``disaster_type`` and one ``prob_<disaster name>`` column per entry of
        ``model.classes_``. ``disaster_type`` is NaN for a predicted code that
        ``disaster_encoder`` has no name for.

    Raises:
        ValueError: if the year range or the set of locations is empty, so
            there is nothing to forecast.
    """
    locations = historical_data['location'].unique()

    # One record per (year, month, location); the day is pinned to mid-month.
    records = [
        {
            'location': location,
            'year': year,
            'month': month,
            'day': 15,
            'date': pd.Timestamp(year=year, month=month, day=15),
        }
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
        for location in locations
    ]
    if not records:
        raise ValueError(
            "Nothing to forecast: check that start_year <= end_year and that "
            "historical_data contains at least one location"
        )

    forecast_df = pd.DataFrame(records)

    forecast_df['day_of_week'] = forecast_df['date'].dt.dayofweek
    forecast_df['location_encoded'] = location_encoder.transform(forecast_df['location'])

    # Meteorological seasons: Dec-Feb is Winter, Mar-May Spring, and so on.
    month_to_season = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }
    forecast_df['season'] = forecast_df['month'].map(month_to_season)

    # Encode seasons with the encoder that was passed in, so the codes are the
    # ones that encoder defines rather than a separately maintained table.
    forecast_df['season_encoded'] = season_encoder.transform(forecast_df['season'])

    # Share of historical records per location.
    location_risk = historical_data.groupby('location').size() / len(historical_data)
    forecast_df['location_risk'] = forecast_df['location'].map(location_risk)

    # The disaster type is unknown for future rows, so the feature is filled
    # with the mean per-type frequency.
    disaster_freq = historical_data.groupby('disaster_type').size() / len(historical_data)
    forecast_df['disaster_freq'] = disaster_freq.mean()

    features = ['year', 'month', 'day', 'day_of_week', 'location_encoded', 
                'season_encoded', 'location_risk', 'disaster_freq']
    X_forecast = forecast_df[features]

    forecast_df['disaster_encoded'] = model.predict(X_forecast)
    proba = model.predict_proba(X_forecast)

    # Integer class code -> disaster name, straight from the label encoder.
    code_to_name = dict(enumerate(disaster_encoder.classes_))

    # Column i of predict_proba belongs to model.classes_[i].
    prob_cols = []
    for column_index, disaster_code in enumerate(model.classes_):
        disaster_name = code_to_name.get(disaster_code, f"Disaster_{disaster_code}")
        column_name = f'prob_{disaster_name}'
        forecast_df[column_name] = proba[:, column_index]
        prob_cols.append(column_name)

    forecast_df['disaster_type'] = forecast_df['disaster_encoded'].map(code_to_name)

    # Return an independent copy so callers can modify the result freely.
    return forecast_df[['location', 'year', 'month', 'disaster_type'] + prob_cols].copy()

def aggregate_forecast_by_location(forecast_df, disaster_mapping):
    """
    Aggregate forecast data by location to show most likely disasters.

    Args:
        forecast_df: DataFrame with ``location`` and ``disaster_type``
            columns. NOTE: its ``disaster_type`` column is rewritten in place
            (codes become names, missing values become "Unknown").
        disaster_mapping: dict mapping integer disaster codes to names.

    Returns:
        A ``(location_summary, disaster_pivot)`` tuple:
        ``location_summary`` has columns ``location`` and
        ``most_likely_disaster`` (the most frequent disaster type per
        location; ties resolve to the first value returned by ``mode()``).
        ``disaster_pivot`` is indexed by location with one column per
        disaster type holding the percentage of that location's forecast
        rows, 0 where a type never occurs.
    """

    def convert_to_disaster_name(value):
        # Missing predictions get an explicit label instead of NaN.
        if pd.isna(value):
            return "Unknown"

        try:
            if isinstance(value, str):
                # Only strings that look like a number ("3", "3.0") are codes.
                if not value.replace('.', '', 1).isdigit():
                    return value
                code = int(float(value))
            elif isinstance(value, (int, float, np.integer, np.floating)):
                # np.integer / np.floating cover every NumPy width, not only
                # int32 and int64.
                code = int(value)
            else:
                return value
        except (ValueError, TypeError, OverflowError):
            # Not convertible to an integer code (e.g. infinity, or a digit
            # character float() rejects) -- leave it untouched.
            return value

        # Unknown codes fall back to the original value.
        return disaster_mapping.get(code, value)

    # Normalise in place so the caller's frame carries names as well.
    forecast_df['disaster_type'] = forecast_df['disaster_type'].apply(convert_to_disaster_name)

    location_summary = forecast_df.groupby('location')['disaster_type'].apply(
        lambda x: x.mode()[0] if len(x) > 0 else 'Unknown'
    ).reset_index(name='most_likely_disaster')

    # Second pass guards against a numeric code surviving as the mode.
    location_summary['most_likely_disaster'] = location_summary['most_likely_disaster'].apply(convert_to_disaster_name)

    # Percentage of each location's rows taken by each disaster type; the
    # division aligns the two series on the shared 'location' index level.
    counts_per_type = forecast_df.groupby(['location', 'disaster_type']).size()
    counts_per_location = forecast_df.groupby('location').size()
    disaster_pct = (counts_per_type / counts_per_location * 100).reset_index(name='percentage')

    disaster_pivot = disaster_pct.pivot(index='location', columns='disaster_type', values='percentage').fillna(0)

    return location_summary, disaster_pivot

def save_forecast_results(forecast_df, location_summary, disaster_pivot, output_dir):
    """
    Write the three forecast tables as CSV files into ``output_dir``.

    The directory is created if it does not exist. Files written:
    ``forecast_results.csv``, ``location_forecast_summary.csv`` and
    ``disaster_percentages.csv``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (file name, table, write the index?) -- the pivot is written with its
    # index, the other two tables without.
    outputs = (
        ('forecast_results.csv', forecast_df, False),
        ('location_forecast_summary.csv', location_summary, False),
        ('disaster_percentages.csv', disaster_pivot, True),
    )
    for file_name, table, keep_index in outputs:
        table.to_csv(os.path.join(output_dir, file_name), index=keep_index)

    print("Forecast results saved successfully!")


# Run the pipeline: forecast, aggregate per location, then persist the tables.
print("Generating 5-year forecast (2024-2028)...")
forecast_df = generate_future_forecast(
    historical_data=historical_df,
    model=model,
    location_encoder=location_encoder,
    disaster_encoder=disaster_encoder,
    season_encoder=season_encoder,
)

location_summary, disaster_pivot = aggregate_forecast_by_location(forecast_df, disaster_mapping)

save_forecast_results(forecast_df, location_summary, disaster_pivot, '../data/processed')

# Short summary of what was produced.
print("Forecast complete. Results saved.")
print(f"Generated {len(forecast_df)} forecast records")
print(f"Covering {len(forecast_df['location'].unique())} locations")
print(f"Forecast period: {forecast_df['year'].min()} to {forecast_df['year'].max()}")

# Show the first rows of each output table.
for heading, table in (
    ("\nSample forecast data:", forecast_df),
    ("\nLocation summary sample:", location_summary),
    ("\nDisaster percentages sample:", disaster_pivot),
):
    print(heading)
    print(table.head())

# Sanity check: every disaster label should be a name, not a numeric code.
print("\nVerifying disaster types are strings:")
print("Unique disaster types in forecast:", forecast_df['disaster_type'].unique())
print("Data type of disaster_type column:", forecast_df['disaster_type'].dtype)
print("Unique most likely disasters:", location_summary['most_likely_disaster'].unique())
print("Data type of most_likely_disaster column:", location_summary['most_likely_disaster'].dtype)


def _is_numeric_label(value):
    # True when a label is still a raw number rather than a disaster name.
    return isinstance(value, (int, np.int64, np.int32, float))


numeric_disasters = forecast_df[forecast_df['disaster_type'].apply(_is_numeric_label)]
if len(numeric_disasters) == 0:
    print("\nSUCCESS: All disaster types are strings!")
else:
    print("\nWARNING: Found numeric disaster types:")
    print(numeric_disasters['disaster_type'].value_counts())


numeric_most_likely = location_summary[location_summary['most_likely_disaster'].apply(_is_numeric_label)]
if len(numeric_most_likely) == 0:
    print("\nSUCCESS: All most_likely_disaster are strings!")
else:
    print("\nWARNING: Found numeric most_likely_disaster:")
    print(numeric_most_likely['most_likely_disaster'].value_counts())

Disaster names in historical data: ['DISASTER_TYPE' 'FIRE' 'WIND_STORM' 'FLOOD' 'RAIN_STORM' 'TIDAL_WAVE'
 'MAN_MADE' 'PEST_INFESTATION' 'LANDSLIDE' 'LIGHTNING' 'PEST INFESTATION'
 'COLLAPSE' 'EPIDEMIC' 'ACCIDENT' 'EXPLOSION' 'DROWNING' 'COMERCIAL FIRE'
 'MAN MADE(BUILDING COLLAPSE)' 'INDUSTRIAL FIRE' 'DROWN'
 'MAN MADE (BUILDING COLLAPSE)']
Created disaster mapping: {0: 'ACCIDENT', 1: 'COLLAPSE', 2: 'COMERCIAL FIRE', 3: 'DISASTER_TYPE', 4: 'DROWN', 5: 'DROWNING', 6: 'EPIDEMIC', 7: 'EXPLOSION', 8: 'FIRE', 9: 'FLOOD', 10: 'INDUSTRIAL FIRE', 11: 'LANDSLIDE', 12: 'LIGHTNING', 13: 'MAN MADE (BUILDING COLLAPSE)', 14: 'MAN MADE(BUILDING COLLAPSE)', 15: 'MAN_MADE', 16: 'PEST INFESTATION', 17: 'PEST_INFESTATION', 18: 'RAIN_STORM', 19: 'TIDAL_WAVE', 20: 'WIND_STORM'}
Saved disaster mapping
Seasons in training data: ['Fall' 'Spring' 'Summer' 'Winter']
Model classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20]
Generating 5-year forecast (2024-2028)...
Forecast results saved succ