In [36]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import math
import pickle

# Load the dataset
file_name = '/content/wildfire_prediction_multi_output_dataset_v2.xlsx'  # Ensure this file is in the same directory as your script
df = pd.read_excel(file_name)

# Display the first few rows of the dataset
print(df.head())

# Check if the data contains null values
print(df.isna().sum())

# Check datatypes
print(df.dtypes)

#Dealing with missing data

#Separate numeric and non-numeric columns
numeric_df = df.select_dtypes(include=[np.number])
non_numeric_df = df.select_dtypes(exclude=[np.number])
#Replace missing numeric values with mean for each column
numeric_df = numeric_df.fillna(numeric_df.mean())
#Concatenate fixed numeric cols. with non-numeric cols.
df = pd.concat([non_numeric_df, numeric_df], axis=1)

# Check for duplicates in the entire dataset
duplicates = df.duplicated()
# If there are any duplicates, the 'duplicates' variable will contain True for those rows
if duplicates.any():
    # Get the rows with duplicates
    duplicate_rows = df[duplicates]
    print("Duplicate rows:")
    print(duplicate_rows)
else:
    print("No duplicates found in the dataset.")

# Data Preprocessing
# Encode categorical variables
label_encoder = LabelEncoder()

# Apply label encoding to the target column (Fire Occurrence)
df['Fire Occurrence'] = label_encoder.fit_transform(df['Fire Occurrence'])

# Save the label encoder for future use
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

#One-hot encode categorical features
veg_encoded = pd.get_dummies(df['Vegetation Type'])
region_encoded = pd.get_dummies(df['Region'])
df = pd.concat([df, veg_encoded, region_encoded], axis=1)
df.drop(['Vegetation Type', 'Region'], axis=1, inplace=True)

   Temperature (°C)  Humidity (%)  Wind Speed (km/h)  Rainfall (mm)  \
0         27.733250     89.108222          13.165364      44.032705   
1         28.789452     21.543283          37.853201      42.478192   
2         30.256666     72.335215          19.592377      12.104919   
3         17.856476     84.138818           5.448178            NaN   
4         34.260178     62.698878          13.356131      27.465001   

   Fuel Moisture (%) Vegetation Type  Slope (%) Region  Fire Size (hectares)  \
0          16.072075       Grassland  59.615524  South            442.648940   
1          25.176666          Forest  26.995937  South             17.689832   
2          22.251979       Shrubland  55.623107  South            256.142917   
3          20.820927          Forest   3.944492   East            165.111292   
4           6.842731       Shrubland  54.280029   West            160.298123   

   Fire Duration (hours)  Suppression Cost ($) Fire Occurrence  
0              11.616707   

In [42]:
# Splitting the dataset into features and targets
X_occ = df.drop(['Fire Occurrence', 'Fire Size (hectares)', 'Fire Duration (hours)', 'Suppression Cost ($)'], axis=1)
y_occ = df['Fire Occurrence']

X_num = df.drop(['Fire Occurrence', 'Fire Size (hectares)', 'Fire Duration (hours)', 'Suppression Cost ($)'], axis=1)
y_num = df[['Fire Size (hectares)', 'Fire Duration (hours)', 'Suppression Cost ($)']]


# Split data into train and test sets (80% train, 20% test)
X_occ_train, X_occ_test, y_occ_train, y_occ_test = train_test_split(X_occ, y_occ, test_size=0.2, random_state=42)
X_num_train, X_num_test, y_num_train, y_num_test = train_test_split(X_num, y_num, test_size=0.2, random_state=42)

# Standardizing the data (for certain models like SVM)
scaler = StandardScaler()
X_occ_train_scaled = scaler.fit_transform(X_occ_train)
X_occ_test_scaled = scaler.transform(X_occ_test)
X_num_train_scaled = scaler.fit_transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

# Machine Learning Models for occurrence
occ_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Training and evaluating the occurrence models
best_occ_model = None
best_occ_rmse = float('inf')

for model_name, model in occ_models.items():
    print(f"Training {model_name}...")

    # Use scaled data for SVM, for others, use non-scaled data
    if model_name == "SVM":
        model.fit(X_occ_train_scaled, y_occ_train)
        y_occ_test_pred = model.predict(X_occ_test_scaled)
    else:
        model.fit(X_occ_train, y_occ_train)
        y_occ_test_pred = model.predict(X_occ_test)

    # Calculate RMSE for the occurrence test dataset
    rmse_occ_test = math.sqrt(mean_squared_error(y_occ_test, y_occ_test_pred))

    # Save the best model based on RMSE for the test dataset
    if rmse_occ_test < best_rmse:
        best_rmse = rmse_occ_test
        best_model = model
        best_model_name = model_name

    print(f"{model_name} RMSE on Test Data: {rmse_test}")

# Save the best model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"Best model: {best_model_name} with RMSE: {best_rmse}")

Training Random Forest...
Random Forest RMSE on Test Data: 0.5513619500836089
Training Decision Tree...
Decision Tree RMSE on Test Data: 0.5513619500836089
Training Logistic Regression...
Logistic Regression RMSE on Test Data: 0.5513619500836089
Training Gradient Boosting...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Gradient Boosting RMSE on Test Data: 0.5513619500836089
Best model: Gradient Boosting with RMSE: 0.46501854237039825


App.py

In [None]:
import subprocess
import sys

# Force install scikit-learn if not found
try:
    import sklearn
except ModuleNotFoundError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])
    import sklearn  # Import again after installation

import gradio as gr
import pandas as pd
import pickle

# Load the pre-trained model
with open('best_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Load the label encoder
with open('label_encoder.pkl', 'rb') as label_encoder_file:
    label_encoder = pickle.load(label_encoder_file)

def predict_wildfire(temp, humidity, wind_speed, rainfall, fuel_moisture, vegetation_type, slope, region):
    # Creating input DataFrame for the model
    input_data = pd.DataFrame({
        'Temperature (°C)': [temp],
        'Humidity (%)': [humidity],
        'Wind Speed (km/h)': [wind_speed],
        'Rainfall (mm)': [rainfall],
        'Fuel Moisture (%)': [fuel_moisture],
        'Vegetation Type': [vegetation_type],
        'Slope (%)': [slope],
        'Region': [region]
    })

   # One-hot encode the input data (ensure it matches the training data)
    input_encoded = pd.get_dummies(input_data)

     # Align columns with the training data (required columns)
    required_columns = model.feature_names_in_  # Get the feature columns from the model
    for col in required_columns:
        if col not in input_encoded.columns:
            input_encoded[col] = 0
    input_encoded = input_encoded[required_columns]

    # Make the prediction
    prediction = model.predict(input_encoded)[0]

    # Reverse the label encoding (map the prediction back to the wildfire occurrence)
    fire_occurrence = label_encoder.inverse_transform([prediction])[0]

    return fire_occurrence

    # Gradio Interface using components
interface = gr.Interface(
    fn=predict_wildfire,
    inputs=[
        gr.Number(label="Temperature (°C)"),
        gr.Number(label="Humidity (%)"),
        gr.Number(label="Wind Speed (km/h)"),
        gr.Number(label="Rainfall (mm)"),
        gr.Number(label="Fuel Moisture (%)"),
        gr.Dropdown(label="Vegetation Type", choices=["Grassland", "Forest", "Shrubland"]),
        gr.Number(label="Slope (%)"),
        gr.Dropdown(label="Region", choices=["North", "South", "East", "West"]),
        gr.Number(label="Fire Size (hectares)"),
        gr.Number(label="Fire Duration (hours)"),
        gr.Number(label="Suppression Cost ($)")
    ],
    outputs=[
        gr.Textbox(label="Fire Occurrence"),
        gr.number(label="Fire Size (hectares)"),
        gr.number(label="Fire Duration (hours)"),
        gr.number(label="Suppression Cost ($)")
    ],
    title="Wildfire Prediction",
    description="Enter the required details to predict wildfire occurrence, size, duration, and suppression cost."
)
if __name__ == "__main__":
    interface.launch()