In [1]:
# Cell 1: Setup and Data Loading
import pandas as pd
import numpy as np
import requests
import json
from datetime import datetime, timedelta
import joblib
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


# Function to calculate Vapor Pressure Deficit (VPD)
def calculate_vpd(temperature_celsius, relative_humidity):
    """
    Calculates Vapor Pressure Deficit (VPD) in kPa.
    Source: https://www.fruit.wisc.edu/VPD-and-Relative-Humidity-Calculations/
    """
    if temperature_celsius is None or relative_humidity is None:
        return None

    # Saturation Vapor Pressure (Es) in kPa
    # Magnus-Tetens formula (valid for T > 0 deg C)
    Es = 0.6108 * np.exp((17.27 * temperature_celsius) / (temperature_celsius + 237.3))

    # Actual Vapor Pressure (Ea) in kPa
    Ea = (relative_humidity / 100) * Es

    # Vapor Pressure Deficit (VPD) in kPa
    VPD = Es - Ea
    return VPD


In [2]:
# Cell 3: Data Cleaning and Preparation
import pandas as pd
import numpy as np # Ensure numpy is imported for array([])
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

try:
    df = pd.read_csv('fire_dataset_enriched.csv')

    # Discard every row containing a missing value and report how many were removed.
    initial_rows = len(df)
    df = df.dropna()
    rows_after_dropna = len(df)
    print(f"Missing values handled. {initial_rows - rows_after_dropna} rows dropped.")

    # 'acq_date' is not used as a model feature; tolerate its absence instead of
    # letting drop() raise a KeyError.
    if 'acq_date' not in df.columns:
        print("Warning: 'acq_date' column not found, skipping drop.")
    else:
        df = df.drop(columns=['acq_date'])

    # Split the frame into features and target. Both are converted to NumPy
    # arrays; the feature frame is kept so its column names can be reused when
    # the test set is written out below.
    feature_frame = df.drop(columns=['fire_occurred'])
    X = feature_frame.values
    y = df['fire_occurred'].values

    # 80/20 split, stratified on the target, with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training portion only, then apply it to both portions.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Persist the fitted scaler so later manual predictions can reuse it.
    joblib.dump(scaler, 'scaler.pkl')

    # Write the *unscaled* test rows, with their labels, back out as a CSV.
    test_df_to_save = pd.DataFrame(X_test, columns=feature_frame.columns)
    test_df_to_save["fire_occurred"] = y_test
    test_df_to_save.to_csv("fire_test_dataset.csv", index=False)
    print("Test dataset saved as fire_test_dataset.csv")

    print("Data cleaning and splitting complete.")
    print("X_train_scaled shape:", X_train_scaled.shape)
    print("X_test_scaled shape:", X_test_scaled.shape)

except FileNotFoundError:
    print("Error: 'fire_dataset_enriched.csv' not found. Please run previous cells to generate it.")
    # Leave empty arrays behind so later cells can detect that no data is available.
    X_train_scaled, X_test_scaled, y_train, y_test = (np.array([]) for _ in range(4))
except Exception as e:
    # Best-effort: report any other failure and fall back to the same empty arrays.
    print(f"An error occurred during data cleaning and preparation: {e}")
    X_train_scaled, X_test_scaled, y_train, y_test = (np.array([]) for _ in range(4))

Missing values handled. 8 rows dropped.
Test dataset saved as fire_test_dataset.csv
Data cleaning and splitting complete.
X_train_scaled shape: (793, 8)
X_test_scaled shape: (199, 8)


In [3]:
from custom_rf import RandomForest

In [4]:
# Cell 5: Model Training and Evaluation
import joblib 

if X_train_scaled.size == 0:
    # An empty training array means the preparation step produced no data.
    print("Skipping model training and evaluation: Training data not available.")
else:
    # Hyperparameters for the custom forest; tune n_trees, max_depth and
    # min_samples here.
    forest_params = dict(n_trees=100, max_depth=10, min_samples=5)
    forest = RandomForest(**forest_params)

    print("Training Random Forest model...")
    forest.fit(X_train_scaled, y_train)
    print("Training complete.")

    # Score the held-out test split.
    predictions = forest.predict(X_test_scaled)

    # Overall accuracy.
    accuracy = accuracy_score(y_test, predictions)
    print(f"\nAccuracy of Random Forest (from scratch): {accuracy:.4f}")

    # Confusion matrix of true labels against predictions.
    conf_matrix = confusion_matrix(y_test, predictions)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Per-class precision / recall / F1.
    class_report = classification_report(y_test, predictions)
    print("\nClassification Report:")
    print(class_report)

    # Persist the fitted forest for later reuse.
    joblib.dump(forest, 'random_forest_final_model.pkl')
    print("Random Forest model saved as 'random_forest_final_model.pkl'")

Training Random Forest model...
Training complete.

Accuracy of Random Forest (from scratch): 0.8894

Confusion Matrix:
[[84 16]
 [ 6 93]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       100
           1       0.85      0.94      0.89        99

    accuracy                           0.89       199
   macro avg       0.89      0.89      0.89       199
weighted avg       0.89      0.89      0.89       199

Random Forest model saved as 'random_forest_final_model.pkl'


In [5]:
import joblib
# Sanity check: reload the persisted forest and show which class and module it
# deserializes to. The printed module is the one that must be importable
# wherever this pickle is loaded.
model = joblib.load('random_forest_final_model.pkl')
loaded_type = type(model)
print(loaded_type)
print(model.__class__.__module__)

<class 'custom_rf.RandomForest'>
custom_rf
