In [1]:
### Importing necessary libraries

In [5]:
# Cell 1: Setup and Data Loading
import pandas as pd
import numpy as np
import requests
import json
from datetime import datetime, timedelta
import joblib
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


# Function to calculate Vapor Pressure Deficit (VPD)
def calculate_vpd(temperature_celsius, relative_humidity):
    """
    Return the Vapor Pressure Deficit (VPD) in kPa.

    Source: https://www.fruit.wisc.edu/VPD-and-Relative-Humidity-Calculations/

    Args:
        temperature_celsius: Air temperature in degrees Celsius, or None.
        relative_humidity: Relative humidity in percent (divided by 100
            internally), or None.

    Returns:
        Saturation vapor pressure minus actual vapor pressure, or None when
        either argument is None.
    """
    # A missing reading on either side means no VPD can be derived.
    if temperature_celsius is None:
        return None
    if relative_humidity is None:
        return None

    # Saturation vapor pressure (kPa), Magnus-Tetens form (valid for T > 0 deg C).
    exponent = (17.27 * temperature_celsius) / (temperature_celsius + 237.3)
    saturation_pressure = 0.6108 * np.exp(exponent)

    # Actual vapor pressure (kPa): the humidity fraction of the saturation value.
    actual_pressure = (relative_humidity / 100) * saturation_pressure

    return saturation_pressure - actual_pressure


In [6]:
# Cell 3: Data Cleaning and Preparation
import pandas as pd
import numpy as np # Ensure numpy is imported for array([])
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

try:
    # Load the enriched dataset written by the earlier cells.
    df = pd.read_csv('fire_dataset_enriched.csv')

    # Remove every row containing a missing value and report how many went away.
    initial_rows = len(df)
    df = df.dropna()
    rows_after_dropna = len(df)
    print(f"Missing values handled. {initial_rows - rows_after_dropna} rows dropped.")

    # 'acq_date' is not a model input; only drop it when present so a missing
    # column cannot raise KeyError.
    if 'acq_date' not in df.columns:
        print("Warning: 'acq_date' column not found, skipping drop.")
    else:
        df = df.drop(columns=['acq_date'])

    # Separate predictors from the target. The feature frame is kept so its
    # column names can label the saved test set below; X and y are plain
    # NumPy arrays because the hand-written forest indexes them positionally.
    feature_frame = df.drop(columns=['fire_occurred'])
    X = feature_frame.values
    y = df['fire_occurred'].values

    # 80/20 split with a fixed seed, stratified on the target.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training part only, then apply it to both parts.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Persist the fitted scaler so manual predictions can reuse it later.
    joblib.dump(scaler, 'scaler.pkl')

    # Save the *unscaled* test rows together with their labels.
    test_df_to_save = pd.DataFrame(X_test, columns=feature_frame.columns)
    test_df_to_save["fire_occurred"] = y_test
    test_df_to_save.to_csv("fire_test_dataset.csv", index=False)
    print("Test dataset saved as fire_test_dataset.csv")

    print("Data cleaning and splitting complete.")
    print("X_train_scaled shape:", X_train_scaled.shape)
    print("X_test_scaled shape:", X_test_scaled.shape)

except FileNotFoundError:
    print("Error: 'fire_dataset_enriched.csv' not found. Please run previous cells to generate it.")
    # Empty arrays let later cells detect that no data is available.
    X_train_scaled, X_test_scaled, y_train, y_test = (np.array([]) for _ in range(4))
except Exception as e:
    print(f"An error occurred during data cleaning and preparation: {e}")
    # Same fallback as above: downstream cells check `.size` before training.
    X_train_scaled, X_test_scaled, y_train, y_test = (np.array([]) for _ in range(4))

Missing values handled. 8 rows dropped.
Test dataset saved as fire_test_dataset.csv
Data cleaning and splitting complete.
X_train_scaled shape: (793, 8)
X_test_scaled shape: (199, 8)


In [18]:
# Cell 4: Random Forest from Scratch Implementation
# Gini Impurity Function
def gini(y):
    """
    Gini impurity of a collection of class labels.

    Computed as 1 minus the sum of squared class proportions. An empty
    collection has nothing to sum, so the result is 1.
    """
    total = len(y)
    squared_shares = 0
    for class_count in Counter(y).values():
        squared_shares += (class_count / total) ** 2
    return 1 - squared_shares

# Dataset Split
def split_dataset(X, y, feature_index, threshold):
    """
    Partition samples on one feature at a threshold.

    Rows whose value in column ``feature_index`` is ``<= threshold`` go left,
    rows whose value is ``> threshold`` go right. A row whose value compares
    False both ways (e.g. NaN) ends up in neither partition.

    Args:
        X: 2-D array-like of samples (rows) by features (columns).
        y: 1-D array-like of labels aligned with the rows of ``X``.
        feature_index: Column of ``X`` to compare.
        threshold: Split value.

    Returns:
        ``((X_left, y_left), (X_right, y_right))`` as NumPy arrays.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Nothing to compare: hand back empty partitions without touching columns
    # (also covers a 1-D empty array, which has no column axis).
    if len(X) == 0:
        return (X[:0], y[:0]), (X[:0], y[:0])

    # Vectorized comparisons replace two per-row Python loops; this function
    # runs once per candidate threshold, so the loops dominated training time.
    column = X[:, feature_index]
    left_mask = column <= threshold
    right_mask = column > threshold
    return (X[left_mask], y[left_mask]), (X[right_mask], y[right_mask])

# Best Split Calculation
def best_split(X, y, features):
    """
    Find the (feature, threshold) pair with the largest Gini gain.

    Every unique value of each candidate feature is tried as a threshold
    (``<=`` goes left, ``>`` goes right). Candidates that leave one side
    empty are skipped.

    Args:
        X: 2-D NumPy array of samples by features.
        y: 1-D NumPy array of labels aligned with the rows of ``X``.
        features: Iterable of column indices to evaluate.

    Returns:
        ``(best_feature, best_threshold)``; both are None when no candidate
        yields a strictly positive gain.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    best_gain = 0
    best_feature, best_threshold = None, None
    n_samples = len(y)
    parent_impurity = gini(y)

    for feature_idx in features:
        column = X[:, feature_idx]
        for t in np.unique(column):
            # Only the labels are needed to score a candidate, so mask y
            # directly instead of materializing both X partitions each time.
            y_left = y[column <= t]
            y_right = y[column > t]

            if len(y_left) == 0 or len(y_right) == 0:
                continue

            weighted_child_impurity = (
                (len(y_left) / n_samples) * gini(y_left)
                + (len(y_right) / n_samples) * gini(y_right)
            )
            gain = parent_impurity - weighted_child_impurity

            # Strict '>' keeps the first-seen candidate on ties.
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_idx
                best_threshold = t
    return best_feature, best_threshold

# Tree Node Class
class TreeNode:
    """
    One node of a decision tree.

    Internal nodes carry a split rule (``feature``, ``threshold``) and two
    children; leaf nodes carry a class label in ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split rule used by internal nodes.
        self.feature, self.threshold = feature, threshold
        # Child sub-trees.
        self.left, self.right = left, right
        # Class label; remains None on internal nodes. Keyword-only.
        self.value = value

    def is_leaf(self):
        """Return True when this node stores a class label (``value`` is not None)."""
        if self.value is None:
            return False
        return True

# Build Decision Tree
def build_tree(X, y, depth=0, max_depth=10, min_samples=5, num_features=None):
    """
    Recursively grow a decision tree on ``(X, y)``.

    A node becomes a leaf (labelled with the most common class) when all
    labels agree, fewer than ``min_samples`` samples remain, ``depth`` has
    reached ``max_depth``, or no candidate split improves Gini impurity.

    Args:
        X: 2-D NumPy array of samples by features.
        y: 1-D NumPy array of labels aligned with the rows of ``X``.
        depth: Depth of the node being built (0 at the root).
        max_depth: Depth at which splitting stops.
        min_samples: Nodes with fewer samples than this become leaves.
        num_features: Number of features drawn (without replacement) as split
            candidates at each node; None or 0 means all features. Values
            larger than the number of columns are clamped.

    Returns:
        The root ``TreeNode`` of the (sub)tree.

    Raises:
        ValueError: If ``y`` is empty, since no majority class exists.
    """
    if len(y) == 0:
        # Previously this surfaced as "IndexError: list index out of range"
        # from most_common(1)[0]; fail with an explicit message instead.
        raise ValueError("build_tree() requires at least one sample; got an empty target array.")

    # Count labels once: gives both the purity check and the majority label.
    label_counts = Counter(y)
    majority_label = label_counts.most_common(1)[0][0]

    # Stop splitting if conditions are met
    if len(label_counts) == 1 or len(y) < min_samples or depth >= max_depth:
        return TreeNode(value=majority_label)

    n_features = X.shape[1]
    # Feature bagging: a random subset of columns competes at this node.
    # Clamp so a request for more columns than exist cannot crash
    # np.random.choice(..., replace=False).
    n_candidates = min(num_features or n_features, n_features)
    features_to_consider = np.random.choice(n_features, n_candidates, replace=False)

    best_feat, best_thresh = best_split(X, y, features_to_consider)

    # If no split improves Gini impurity, make it a leaf node
    if best_feat is None:
        return TreeNode(value=majority_label)

    (X_left, y_left), (X_right, y_right) = split_dataset(X, y, best_feat, best_thresh)

    # Recursively build left and right sub-trees
    left_branch = build_tree(X_left, y_left, depth + 1, max_depth, min_samples, num_features)
    right_branch = build_tree(X_right, y_right, depth + 1, max_depth, min_samples, num_features)

    return TreeNode(feature=best_feat, threshold=best_thresh, left=left_branch, right=right_branch)

# Tree Prediction
def predict_tree(x, tree):
    """
    Classify a single sample by walking ``tree`` from the root to a leaf.

    At each internal node the sample goes left when
    ``x[node.feature] <= node.threshold`` and right otherwise. If a node's
    feature is None or lies beyond the sample's length, the walk stops and
    returns that node's value, or 0 when the value is None.
    """
    node = tree
    while not node.is_leaf():
        # The sample has fewer features than this split needs (or the split
        # is undefined): fall back instead of indexing out of range.
        if node.feature is None or node.feature >= len(x):
            return node.value if node.value is not None else 0

        goes_left = x[node.feature] <= node.threshold
        node = node.left if goes_left else node.right

    return node.value

# Random Forest Class
class RandomForest:
    """
    Random forest classifier built from the hand-written decision trees.

    Each tree is trained on a bootstrap sample of the rows and considers
    ``int(sqrt(n_features))`` randomly chosen features at every split.
    Prediction is a majority vote over the trees.

    Args:
        n_trees: Number of trees to grow.
        max_depth: Maximum depth passed to ``build_tree``.
        min_samples: Minimum node size passed to ``build_tree``.
        random_state: Optional seed. When not None, ``fit`` seeds NumPy's
            global RNG (``np.random.seed``) before growing trees, because the
            tree builder draws from that global generator. None (default)
            leaves the RNG untouched.
    """

    def __init__(self, n_trees=100, max_depth=10, min_samples=5, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """
        Grow ``n_trees`` trees on bootstrap samples of ``(X, y)``.

        Args:
            X: 2-D array-like of samples by features.
            y: 1-D array-like of labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its row count does
                not match ``len(y)``.
        """
        # Accept lists / DataFrames as well as arrays; positional fancy
        # indexing below needs real ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional (samples x features); got ndim={X.ndim}.")
        if len(X) == 0:
            raise ValueError("Cannot fit RandomForest on an empty training set.")
        if len(X) != len(y):
            raise ValueError(f"X and y have inconsistent lengths: {len(X)} != {len(y)}.")

        # getattr keeps instances unpickled from before `random_state` existed working.
        seed = getattr(self, "random_state", None)
        if seed is not None:
            np.random.seed(seed)

        self.trees = []
        n_samples = len(X)
        n_features_sqrt = int(np.sqrt(X.shape[1]))  # Features to consider at each split
        for _ in range(self.n_trees):
            # Bootstrap sampling: draw n_samples row indices with replacement.
            indices = np.random.choice(n_samples, n_samples, replace=True)
            tree = build_tree(X[indices], y[indices], max_depth=self.max_depth,
                              min_samples=self.min_samples, num_features=n_features_sqrt)
            self.trees.append(tree)

    def predict(self, X):
        """
        Predict a class label for every row of ``X`` by majority vote.

        Args:
            X: 2-D array-like of samples by features.

        Returns:
            A list with one predicted label per row.

        Raises:
            ValueError: If the forest has no trees (``fit`` was not called).
        """
        if not self.trees:
            # Without this, voting over zero trees fails with an opaque
            # "IndexError: list index out of range".
            raise ValueError("This RandomForest has no trees; call fit() before predict().")

        # Iterating a DataFrame yields column labels, not rows; convert first.
        X = np.asarray(X)

        # Get predictions from all trees
        tree_preds = np.array([[predict_tree(x, tree) for tree in self.trees] for x in X])
        # Majority vote for final prediction
        return [Counter(row).most_common(1)[0][0] for row in tree_preds]

# Visible marker in the cell output that the definitions above have executed.
print("Random Forest implementation loaded.")

Random Forest implementation loaded.


In [7]:
# NOTE(review): this import rebinds the name `RandomForest`. When the cells run
# top to bottom it shadows the class defined in the "Cell 4" code above, so the
# training cell below uses the `custom_rf` implementation instead.
# TODO confirm which implementation is intended and keep only one.
from custom_rf import RandomForest

In [8]:
# Cell 5: Model Training and Evaluation

if X_train_scaled.size == 0:
    # The data-preparation cell leaves empty arrays behind when it fails.
    print("Skipping model training and evaluation: Training data not available.")
else:
    # Fit the forest; n_trees, max_depth and min_samples are the tunable knobs.
    forest = RandomForest(n_trees=100, max_depth=10, min_samples=5)
    print("Training Random Forest model...")
    forest.fit(X_train_scaled, y_train)
    print("Training complete.")

    # Score the held-out test split.
    predictions = forest.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, predictions)
    print(f"\nAccuracy of Random Forest (from scratch): {accuracy:.4f}")

    conf_matrix = confusion_matrix(y_test, predictions)
    print("\nConfusion Matrix:", conf_matrix, sep="\n")

    class_report = classification_report(y_test, predictions)
    print("\nClassification Report:", class_report, sep="\n")

    # Persist the trained forest for the manual-prediction cell.
    joblib.dump(forest, 'random_forest_final_model.pkl')
    print("Random Forest model saved as 'random_forest_final_model.pkl'")

Training Random Forest model...


IndexError: list index out of range

In [16]:
# Cell 6: Manual Prediction Function with Automatic VPD Calculation

# Ensure calculate_vpd function is available (it's defined in Cell 1 of the full code)
# If running this cell independently, you would need to define calculate_vpd here or run Cell 1 first.

import pandas as pd
import joblib
import numpy as np # Needed for array operations in predict_manual_input
# Assuming calculate_vpd is defined from Cell 1

# Reload the scaler and model if starting from this cell
try:
    # NOTE: joblib.load unpickles arbitrary objects; only load files that this
    # notebook produced itself.
    scaler = joblib.load('scaler.pkl')
    model = joblib.load('random_forest_final_model.pkl')
    print("Scaler and Random Forest model loaded for manual prediction.")
except FileNotFoundError:
    print("Error: 'scaler.pkl' or 'random_forest_final_model.pkl' not found. Please run previous cells to train and save them.")
    scaler = None
    model = None
except AttributeError as exc:
    # A model pickled as `__main__.RandomForest` can only be unpickled once
    # that class exists in the running kernel; otherwise pickle raises
    # AttributeError rather than FileNotFoundError.
    print(f"Error: could not unpickle the saved model ({exc}). Run the cell that defines RandomForest first.")
    scaler = None
    model = None

# Compare against None explicitly: object truthiness can depend on __len__ /
# __bool__, which is unrelated to whether loading succeeded.
if scaler is not None and model is not None:
    # Feature names in the order the model expects (after 'acq_date' and 'fire_occurred' removal)
    # This order should match the features in df.drop(columns=['fire_occurred', 'acq_date'])
    feature_names_for_manual_input = [
        'latitude', 'longitude', 'temperature', 'humidity',
        'wind_speed', 'precipitation', 'elevation', 'vpd'
    ]

    def predict_manual_input_interactive():
        """
        Interactively takes manual inputs from the user and makes a fire prediction.

        Every feature except 'vpd' is prompted for; the prompt repeats until a
        finite number is entered (and, for humidity, a value within 0-100,
        since it is used as a percentage by calculate_vpd). VPD is then
        calculated from temperature and humidity.

        Returns:
            "FIRE" if the model predicts class 1, otherwise "NO FIRE".
        """
        print("\nEnter parameters for prediction:")

        # Derived from the model's feature list so the two cannot drift apart;
        # VPD is calculated, not asked.
        features_to_ask = [name for name in feature_names_for_manual_input if name != 'vpd']

        input_values = {}
        for feature in features_to_ask:
            while True:
                try:
                    val = float(input(f"Enter {feature}: "))
                except ValueError:
                    print("Invalid input. Please enter a numerical value.")
                    continue
                # float() accepts 'nan' and 'inf'; neither is a usable reading.
                if not np.isfinite(val):
                    print("Invalid input. Please enter a numerical value.")
                    continue
                # Humidity is divided by 100 in calculate_vpd; values outside
                # 0-100 would produce a meaningless (e.g. negative) VPD.
                if feature == 'humidity' and not 0 <= val <= 100:
                    print("Invalid input. Humidity must be between 0 and 100 (percent).")
                    continue
                input_values[feature] = val
                break

        # Automatically calculate VPD (calculate_vpd is defined in Cell 1)
        input_values['vpd'] = calculate_vpd(input_values['temperature'], input_values['humidity'])

        # Single-row 2-D array with columns in the order used for training.
        ordered_input = [input_values[feat] for feat in feature_names_for_manual_input]
        input_scaled = scaler.transform(np.array([ordered_input], dtype=float))

        # Make prediction
        prediction = model.predict(input_scaled)[0] # [0] because predict returns a list

        return "FIRE" if prediction == 1 else "NO FIRE"

    print("\n--- Interactive Manual Input Prediction ---")
    predicted_fire = predict_manual_input_interactive()
    print(f"\n🔥 Prediction for given input: {predicted_fire}")

else:
    print("Manual prediction function not available due to missing scaler or model. Please run previous cells to train and save them.")

Scaler and Random Forest model loaded for manual prediction.

--- Interactive Manual Input Prediction ---

Enter parameters for prediction:


Enter latitude:  1
Enter longitude:  1
Enter temperature:  4
Enter humidity:  56
Enter wind_speed:  45
Enter precipitation:  56
Enter elevation:  45



🔥 Prediction for given input: NO FIRE


Enter latitude:  145
Enter longitude:  67
Enter temperature:  80
Enter humidity:  46
Enter wind_speed:  67
Enter precipitation:  80
Enter elevation:  1467



🔥 Prediction for given input: FIRE


In [None]:
 # Example usage
    # Example input data:
    # Be careful with the data types and order here.
    # Replace with actual values for testing
    # example_input = {
    #     'latitude': 27.5,
    #     'longitude': 85.0,
    #     'temperature': 30.0,
    #     'humidity': 50.0,
    #     'wind_speed': 3.0,
    #     'precipitation': 0.1,
    #     'elevation': 500.0
    # }
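    # predicted_fire = predict_manual_input(example_input)
    # NOTE: predict_manual_input() is not defined in the cells shown here; the only
    # prediction helper is predict_manual_input_interactive(), which takes no arguments.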

In [20]:
import joblib
# Diagnostic: reload the saved model and show its class and the module that
# class is attributed to.
model = joblib.load('random_forest_final_model.pkl')
model_cls = type(model)
print(model_cls)
print(model.__class__.__module__)

<class '__main__.RandomForest'>
__main__
