# Data Preprocessing
This notebook handles data loading, cleaning, feature engineering, and data preparation for model training.

## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch

## Load Data

In [None]:
# Read the dataset from the CSV file in the working directory.
df = pd.read_csv("final_dataset.csv")

# Report the frame's dimensions, then a plain-text preview of its leading rows.
print(
    f"Dataset shape: {df.shape}",
    "\nFirst few rows:",
    df.head(),
    sep="\n",
)

## Remove Extreme Values

In [None]:
# Keep only rows whose fire weather index lies in the half-open range [0, 60);
# rows outside that range (or with a missing index) fail both comparisons' AND
# and are dropped.
df = df.loc[
    lambda frame: frame['fire_weather_index'].ge(0)
    & frame['fire_weather_index'].lt(60)
]
print(f"Dataset shape after removing outliers: {df.shape}")

## Create Risk Labels

In [None]:
# Derive a risk label from the fire-occurrence flag and the fire weather index (FWI).
def risk_label(row):
    """Return the risk class for a single row: 0 (low), 1 (medium) or 2 (high).

    Args:
        row: Mapping-like record exposing 'occured' (note the column's
            spelling) and 'fire_weather_index'.

    Returns:
        2 when the row's 'occured' value equals 1 or its index exceeds 30,
        1 when the index is at least 10, and 0 otherwise.
    """
    # A recorded fire is high risk regardless of the index value.
    if row['occured'] == 1:
        return 2

    fwi = row['fire_weather_index']
    if fwi > 30:
        return 2
    if fwi >= 10:
        return 1
    return 0

# Attach the per-row risk class. assign() builds a new frame rather than writing
# a column into `df` in place, which avoids pandas' SettingWithCopyWarning when
# `df` is itself a filtered selection of an earlier frame.
df = df.assign(risk_level=df.apply(risk_label, axis=1))

# Count rows per risk level. Reindexing over all three classes makes a class
# with no rows show up as 0 instead of raising KeyError in the lookups below,
# and it also fixes the order to 0, 1, 2.
risk_counts = df['risk_level'].value_counts().reindex([0, 1, 2], fill_value=0)
print("\nRisk Level Distribution:")
print(f"Low Risk (0): {risk_counts[0]} rows")
print(f"Medium Risk (1): {risk_counts[1]} rows")
print(f"High Risk (2): {risk_counts[2]} rows\n")

## Split Features and Labels

In [None]:
# Separate the target from the model inputs: the label is the derived
# 'risk_level' column; the features are every remaining column except the raw
# 'occured' flag.
# NOTE(review): 'fire_weather_index' stays in X even though risk_level is
# computed from it -- confirm this is intended (possible label leakage).
y = df['risk_level']
X = df.drop(['risk_level', 'occured'], axis='columns')

print(
    f"Features shape: {X.shape}",
    f"Labels shape: {y.shape}",
    sep="\n",
)

## Train/Validation/Test Split

In [None]:
# Two-stage stratified split with a fixed seed: hold out 30% of the rows first,
# then halve that hold-out into validation and test partitions.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Row counts of each partition.
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

## Normalize Data

In [None]:
# Standardize the features. The scaler's statistics are learned from the
# training partition only and then applied unchanged to all three partitions,
# so validation and test data do not influence the fit.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(partition) for partition in (X_train, X_val, X_test)
)

print("Data normalized successfully")

## Convert to PyTorch Tensors

In [None]:
# Build PyTorch tensors for every partition: feature arrays become float32
# tensors, label Series become int64 (long) tensors.
X_train_t, X_val_t, X_test_t = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train, X_val, X_test)
)
y_train_t, y_val_t, y_test_t = (
    torch.tensor(labels.to_numpy(), dtype=torch.long)
    for labels in (y_train, y_val, y_test)
)

print(f"X_train_t shape: {X_train_t.shape}")
print(f"y_train_t shape: {y_train_t.shape}")
print("\nData converted to PyTorch tensors successfully")