In [6]:
# notebooks/03_modeling.ipynb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os

# Create necessary directories if they don't exist
os.makedirs('../models', exist_ok=True)

# Load data
df = pd.read_csv('../data/processed/cleaned_disaster_data.csv')

# Feature Engineering Functions (included directly)
def create_features(df):
    # Location encoding
    location_encoder = LabelEncoder()
    df['location_encoded'] = location_encoder.fit_transform(df['location'])
    
    # Disaster type encoding
    disaster_encoder = LabelEncoder()
    df['disaster_encoded'] = disaster_encoder.fit_transform(df['disaster_type'])
    
    # Season feature
    df['season'] = df['month'].apply(lambda x: 
        'Spring' if x in [3,4,5] else
        'Summer' if x in [6,7,8] else
        'Fall' if x in [9,10,11] else 'Winter')
    
    # Season encoding
    season_encoder = LabelEncoder()
    df['season_encoded'] = season_encoder.fit_transform(df['season'])
    
    # Location disaster frequency (historical risk)
    location_risk = df.groupby('location').size() / len(df)
    df['location_risk'] = df['location'].map(location_risk)
    
    # Disaster type frequency
    disaster_freq = df.groupby('disaster_type').size() / len(df)
    df['disaster_freq'] = df['disaster_type'].map(disaster_freq)
    
    return df, location_encoder, disaster_encoder, season_encoder

# Model Training Functions (included directly)
def train_predictive_model(X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    
    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    return model, X_test, y_test

def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    return accuracy, report, cm

# Create features
df, location_encoder, disaster_encoder, season_encoder = create_features(df)

# Prepare features and target
features = ['year', 'month', 'day', 'day_of_week', 'location_encoded', 
            'season_encoded', 'location_risk', 'disaster_freq']
X = df[features]
y = df['disaster_encoded']

# Train model
model, X_test, y_test = train_predictive_model(X, y)

# Evaluate model
accuracy, report, cm = evaluate_model(model, X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

# Save model
joblib.dump(model, '../models/disaster_predictor.pkl')
joblib.dump(location_encoder, '../models/location_encoder.pkl')
joblib.dump(disaster_encoder, '../models/disaster_encoder.pkl')
joblib.dump(season_encoder, '../models/season_encoder.pkl')

print("Model and encoders saved successfully!")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Model Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       125
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         0

    accuracy                           0.98       127
   macro avg       0.20      0.20      0.20       127
weighted avg       0.98      0.98      0.98       127

Model and encoders saved successfully!


In [11]:
# notebooks/03_modeling.ipynb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os

# Create necessary directories if they don't exist
os.makedirs('../models', exist_ok=True)

# Load data
df = pd.read_csv('../data/processed/cleaned_disaster_data.csv')

# Feature Engineering Functions (included directly)
def create_features(df):
    # Location encoding
    location_encoder = LabelEncoder()
    df['location_encoded'] = location_encoder.fit_transform(df['location'])
    
    # Disaster type encoding
    disaster_encoder = LabelEncoder()
    df['disaster_encoded'] = disaster_encoder.fit_transform(df['disaster_type'])
    
    # Season feature
    df['season'] = df['month'].apply(lambda x: 
        'Spring' if x in [3,4,5] else
        'Summer' if x in [6,7,8] else
        'Fall' if x in [9,10,11] else 'Winter')
    
    # Ensure all seasons are represented in the data
    all_seasons = ['Spring', 'Summer', 'Fall', 'Winter']
    existing_seasons = df['season'].unique()
    
    missing_seasons = [s for s in all_seasons if s not in existing_seasons]
    if missing_seasons:
        print(f"Adding dummy rows for missing seasons: {missing_seasons}")
        for season in missing_seasons:
            # Create a dummy row for each missing season
            dummy_row = df.iloc[0].copy()
            dummy_row['season'] = season
            # Set month to a value corresponding to the season
            if season == 'Spring':
                dummy_row['month'] = 4
            elif season == 'Summer':
                dummy_row['month'] = 7
            elif season == 'Fall':
                dummy_row['month'] = 10
            else:  # Winter
                dummy_row['month'] = 1
            
            # Add the dummy row
            df = pd.concat([df, dummy_row.to_frame().T], ignore_index=True)
    
    # Season encoding
    season_encoder = LabelEncoder()
    df['season_encoded'] = season_encoder.fit_transform(df['season'])
    
    # Location disaster frequency (historical risk)
    location_risk = df.groupby('location').size() / len(df)
    df['location_risk'] = df['location'].map(location_risk)
    
    # Disaster type frequency
    disaster_freq = df.groupby('disaster_type').size() / len(df)
    df['disaster_freq'] = df['disaster_type'].map(disaster_freq)
    
    return df, location_encoder, disaster_encoder, season_encoder

# Model Training Functions (included directly)
def train_predictive_model(X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    
    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    return model, X_test, y_test

def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    return accuracy, report, cm

# Create features
df, location_encoder, disaster_encoder, season_encoder = create_features(df)

# Check what seasons are in the data
print("Seasons in data:", season_encoder.classes_)
print("Disaster types in data:", disaster_encoder.classes_)

# Prepare features and target
features = ['year', 'month', 'day', 'day_of_week', 'location_encoded', 
            'season_encoded', 'location_risk', 'disaster_freq']
X = df[features]
y = df['disaster_encoded']

# Train model
model, X_test, y_test = train_predictive_model(X, y)

# Check model classes
print("Model classes:", model.classes_)

# Verify that model classes match disaster encoder classes
if not np.array_equal(model.classes_, disaster_encoder.classes_):
    print("WARNING: Model classes do not match disaster encoder classes!")
    print("Model classes:", model.classes_)
    print("Disaster encoder classes:", disaster_encoder.classes_)
    # Use model classes as the source of truth
    disaster_encoder.classes_ = model.classes_
    print("Updated disaster encoder classes to match model classes")

# Evaluate model
accuracy, report, cm = evaluate_model(model, X_test, y_test)
print(f"Model Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

# Save model and encoders
joblib.dump(model, '../models/disaster_predictor.pkl')
joblib.dump(location_encoder, '../models/location_encoder.pkl')
joblib.dump(disaster_encoder, '../models/disaster_encoder.pkl')
joblib.dump(season_encoder, '../models/season_encoder.pkl')

# Also save the model's classes for later use
joblib.dump(model.classes_, '../models/model_classes.pkl')

# Create a mapping dictionary for disaster types
disaster_mapping = {i: disaster_type for i, disaster_type in enumerate(model.classes_)}
joblib.dump(disaster_mapping, '../models/disaster_mapping.pkl')

print("Model and encoders saved successfully!")
print("Disaster mapping:", disaster_mapping)

Seasons in data: ['Fall' 'Spring' 'Summer' 'Winter']
Disaster types in data: ['ACCIDENT' 'COLLAPSE' 'COMERCIAL FIRE' 'DISASTER_TYPE' 'DROWN' 'DROWNING'
 'EPIDEMIC' 'EXPLOSION' 'FIRE' 'FLOOD' 'INDUSTRIAL FIRE' 'LANDSLIDE'
 'LIGHTNING' 'MAN MADE (BUILDING COLLAPSE)' 'MAN MADE(BUILDING COLLAPSE)'
 'MAN_MADE' 'PEST INFESTATION' 'PEST_INFESTATION' 'RAIN_STORM'
 'TIDAL_WAVE' 'WIND_STORM']


Model classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20]
Model classes: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20]
Disaster encoder classes: ['ACCIDENT' 'COLLAPSE' 'COMERCIAL FIRE' 'DISASTER_TYPE' 'DROWN' 'DROWNING'
 'EPIDEMIC' 'EXPLOSION' 'FIRE' 'FLOOD' 'INDUSTRIAL FIRE' 'LANDSLIDE'
 'LIGHTNING' 'MAN MADE (BUILDING COLLAPSE)' 'MAN MADE(BUILDING COLLAPSE)'
 'MAN_MADE' 'PEST INFESTATION' 'PEST_INFESTATION' 'RAIN_STORM'
 'TIDAL_WAVE' 'WIND_STORM']
Updated disaster encoder classes to match model classes
Model Accuracy: 0.93
Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           8       1.00      1.00      1.00        35
           9       1.00      1.00      1.00        36
          11       0.00      0.00      0.00         1
          15       1.00      0.33      0.50         3
          16       0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Model and encoders saved successfully!
Disaster mapping: {0: np.int64(0), 1: np.int64(1), 2: np.int64(2), 3: np.int64(3), 4: np.int64(4), 5: np.int64(5), 6: np.int64(6), 7: np.int64(7), 8: np.int64(8), 9: np.int64(9), 10: np.int64(10), 11: np.int64(11), 12: np.int64(12), 13: np.int64(13), 14: np.int64(14), 15: np.int64(15), 16: np.int64(17), 17: np.int64(18), 18: np.int64(19), 19: np.int64(20)}
