# Data Preprocessing
This notebook handles data loading, cleaning, feature engineering, and data preparation for model training.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch

## Helper Function

In [3]:
# helper function for risk labels
def risk_label(row, high_fwi=30, moderate_fwi=10):
    """Map one record to an integer risk level (0, 1 or 2).

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys ``'occured'`` and
        ``'fire_weather_index'`` (for example the row handed over by
        ``DataFrame.apply(..., axis=1)``, or a plain dict).
    high_fwi : number, default 30
        A ``fire_weather_index`` strictly greater than this yields level 2.
    moderate_fwi : number, default 10
        A ``fire_weather_index`` greater than or equal to this (and not
        already level 2) yields level 1.

    Returns
    -------
    int
        2 if ``row['occured'] == 1`` or the index exceeds ``high_fwi``;
        1 if the index is at least ``moderate_fwi``; 0 otherwise.
        A NaN index fails both comparisons and therefore yields 0 unless
        ``row['occured'] == 1``.
    """
    # The key is spelled 'occured' (sic); it must match the data's column name.
    if row['occured'] == 1:
        return 2
    # Read the index only after the 'occured' check so the lookup order
    # (and short-circuit behaviour) matches the original single expression.
    fwi = row['fire_weather_index']
    if fwi > high_fwi:
        return 2
    if fwi >= moderate_fwi:
        return 1
    return 0

## Load + Clean + Label Data

In [4]:
def setup(data_src="final_dataset.csv"):
    # load data
    df = pd.read_csv(data_src)
    # remove NaN
    df.dropna(inplace=True)
    # for now, set negatives to 0. later reevalute for both negatives and high extremes
    df['fire_weather_index'] = df['fire_weather_index'].clip(lower=0)
    # risk labels based on occurred and FWI
    df['risk_level'] = df.apply(risk_label, axis=1)
    return df

## Split Data

In [5]:
def split_data(df, holdout_size=0.3, test_share=0.5, random_state=42):
    """Split a labelled frame into stratified train / validation / test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'risk_level'`` (used as the target) and
        ``'occured'``; both are removed from the feature matrix.
    holdout_size : float, default 0.3
        Passed as ``test_size`` to the first ``train_test_split`` call: the
        portion of ``df`` set aside for validation and test together.
    test_share : float, default 0.5
        Passed as ``test_size`` to the second call: the portion of the
        holdout that becomes the test set (the rest is validation).
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``. With the
        defaults this is roughly a 70 % / 15 % / 15 % split. Both splits are
        stratified on the label.
    """
    # Separate features from the target.
    # NOTE(review): when df comes from setup(), X still contains
    # 'fire_weather_index', which risk_label uses to derive 'risk_level' --
    # confirm that keeping it as a feature is intended.
    X = df.drop(columns=['risk_level', 'occured'])
    y = df['risk_level']
    # First cut: training set vs. a temporary holdout.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state, stratify=y
    )
    # Second cut: divide the holdout into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_share, random_state=random_state, stratify=y_temp
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

## Normalize + Convert to Tensors

In [6]:
def norm_data(X_train, X_val, X_test, y_train, y_val, y_test):
    """Standardize the feature sets and convert everything to torch tensors.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    all three feature sets, so validation and test data do not influence the
    scaling statistics. Features become ``torch.float32`` tensors; labels
    (read through their ``.values`` attribute) become ``torch.long`` tensors.

    Returns
    -------
    tuple
        ``(X_train_t, y_train_t, X_val_t, y_val_t, X_test_t, y_test_t)``.
    """
    # Fit on the training features only, then scale each split with it.
    standardizer = StandardScaler().fit(X_train)
    scaled_parts = [standardizer.transform(part) for part in (X_train, X_val, X_test)]

    # Emit (features, labels) pairs in train, val, test order.
    tensors = []
    for features, targets in zip(scaled_parts, (y_train, y_val, y_test)):
        tensors.append(torch.tensor(features, dtype=torch.float32))
        tensors.append(torch.tensor(targets.values, dtype=torch.long))
    return tuple(tensors)

In [7]:
def prepare_data(data_src="final_dataset.csv"):
    """Run the full pipeline: ``setup`` -> ``split_data`` -> ``norm_data``.

    ``data_src`` is forwarded to ``setup``. The six values returned by
    ``split_data`` are passed positionally, in the same order, to
    ``norm_data``.

    Returns
    -------
    tuple
        ``(X_train_t, y_train_t, X_val_t, y_val_t, X_test_t, y_test_t)`` as
        produced by ``norm_data``.
    """
    partitions = split_data(setup(data_src))
    return tuple(norm_data(*partitions))