In [None]:
import xarray as xr
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from shapely.ops import unary_union
import os
import re
import numpy as np

In [None]:
import sys
# Show the path of the Python interpreter running this kernel.
print(sys.executable)

In [None]:
def convert_nc_to_csv(file_path, polygon_path="SD_gjson.json", output_root="Cleaned_ENS"):
    """
    Convert a .nc file to a CSV file with daily-averaged data filtered by a polygon.

    The CSV output is saved as:
        <output_root>/YYYY-MM/YYYY-MM-DD.csv

    Steps:
      1. Extract an 8-digit date (YYYYMMDD) from the input file name (falling back to
         the full path) and format it as YYYY-MM-DD.
      2. Create the output folder and a subfolder for the month (e.g., "2020-08").
      3. Read the NetCDF file and compute daily means.
      4. Convert the dataset to a DataFrame and create a "lat_lon" index.
      5. Read the polygon file and keep rows whose point is within the polygon.
      6. Derive a min-max based fire risk score and a wind speed column.
      7. Write the predictor columns plus "fire_risk_score" to a CSV file.

    Args:
        file_path (str): Path to the .nc file.
        polygon_path (str): Vector file readable by geopandas whose features are
            merged into the filter polygon. Defaults to "SD_gjson.json".
        output_root (str): Top-level output folder. Defaults to "Cleaned_ENS".

    Raises:
        ValueError: If no 8-digit run is found, or the digits are not a real
            calendar date.
    """
    from datetime import datetime  # only needed here, to validate the extracted date

    # Look in the file name first so digits in a parent directory cannot shadow the
    # date; fall back to the whole path to keep accepting dates stored in a folder name.
    match = re.search(r'(\d{8})', os.path.basename(file_path)) or re.search(r'(\d{8})', file_path)
    if not match:
        raise ValueError("No valid date found in the file_path.")
    date_str = match.group(1)             # e.g., "20200815"
    try:
        datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        raise ValueError(f"'{date_str}' in the file_path is not a valid YYYYMMDD date.") from None
    formatted_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"  # "2020-08-15"

    # Create <output_root>/<YYYY-MM>; exist_ok makes repeated calls safe.
    output_folder = os.path.join(output_root, formatted_date[:7])
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, f"{formatted_date}.csv")

    # Open the dataset, compute daily means and materialise them as a table before
    # the file handle is released by the context manager.
    with xr.open_dataset(file_path) as ds:
        ds_daily = ds.resample(time="1D").mean()
        beta = ds_daily.to_dataframe().reset_index()

    # Replace the latitude/longitude columns with a (latitude, longitude) tuple index.
    beta["lat_lon"] = list(zip(beta["latitude"], beta["longitude"]))
    beta = beta.set_index("lat_lon")
    beta = beta.drop(columns=["latitude", "longitude"])

    # Merge all polygon features into one geometry and keep the points inside it.
    sd_gdf = gpd.read_file(polygon_path)
    sd_polygon = sd_gdf.unary_union
    # Index entries are (lat, lon); shapely points take (x, y) = (lon, lat).
    beta["point"] = beta.index.map(lambda x: Point(x[1], x[0]))
    df = beta[beta["point"].apply(lambda pt: sd_polygon.contains(pt))].copy()
    df = df.drop(columns=["point"])

    # List of fire-related columns
    fire_cols = [
        'energy_release_component', 'ignition_component', 'fire_intensity_level',
        'forward_rate_of_spread', 'spread_component', 'burning_index', 'flame_length'
    ]

    # Min-max scale each fire column using the min/max of THIS file's filtered rows.
    # A constant column yields NaN (0/0); the row-wise mean below skips NaN values.
    # NOTE(review): scores are therefore relative to each file, not an absolute scale.
    for col in fire_cols:
        col_min, col_max = df[col].min(), df[col].max()
        df[col + '_norm'] = (df[col] - col_min) / (col_max - col_min)

    df['fire_risk_composite'] = df[[col + '_norm' for col in fire_cols]].mean(axis=1)
    df['fire_risk_score'] = df['fire_risk_composite'] * 100

    # Wind speed is the magnitude of the two 10 m wind components.
    df['wind_speed'] = np.sqrt(df['eastward_10m_wind']**2 + df['northward_10m_wind']**2)

    predictor_cols = [
        'mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr',
        'air_temperature_2m', 'air_relative_humidity_2m',
        'wind_speed', 'accumulated_precipitation_amount',
        'surface_downwelling_shortwave_flux'
    ]

    df = df[predictor_cols + ['fire_risk_score']]

    # Write the filtered DataFrame to CSV (the "lat_lon" index becomes the first column).
    df.to_csv(output_file)
    print(f"Filtered CSV written to: {output_file}")

#convert_nc_to_csv('ens_gfs_001/2020-08/dfmnfdrs_202008152000Z.nc')

In [None]:
# Walk through the same loading steps as convert_nc_to_csv on one sample file.
file_path = 'ens_gfs_001/2020-08/dfmnfdrs_202008152000Z.nc'

# Open the NetCDF file and average every variable within each day.
ds = xr.open_dataset(file_path)
ds_daily = ds.resample(time="1D").mean()

# Flatten to a table, then swap the latitude/longitude columns for a single
# (latitude, longitude) tuple index named "lat_lon".
beta = ds_daily.to_dataframe().reset_index()
beta = (
    beta
    .assign(lat_lon=list(zip(beta["latitude"], beta["longitude"])))
    .set_index("lat_lon")
    .drop(columns=["latitude", "longitude"])
)

beta

In [None]:
# Merge every feature of the GeoJSON file into one geometry to test points against.
sd_gdf = gpd.read_file("SD_gjson.json")
sd_polygon = sd_gdf.unary_union

# Index entries are (lat, lon); shapely points take (x, y) = (lon, lat).
beta["point"] = beta.index.map(lambda lat_lon: Point(lat_lon[1], lat_lon[0]))

# Keep only the rows whose point lies inside the polygon, then drop the helper column.
inside_mask = beta["point"].apply(lambda candidate: sd_polygon.contains(candidate))
df_filtered = beta[inside_mask].copy().drop(columns=["point"])
df_filtered

In [None]:
# Candidate columns to discard; not referenced by any other cell shown in this notebook.
to_drop = [
    'south_north',
    'west_east',
    'YYYY',
    'MM',
    'DD',
    'HH',
    'mean_wtd_moisture_100hr',
    'mean_wtd_moisture_1000hr',
]

In [None]:
# List the column names available after the polygon filter.
df_filtered.columns

In [None]:
import numpy as np

# Work on a copy: a plain `df = df_filtered` is only an alias, so the engineered
# columns below would also appear on df_filtered, which other cells still use.
df = df_filtered.copy()

# List of fire-related columns
fire_cols = [
    'energy_release_component', 'ignition_component', 'fire_intensity_level',
    'forward_rate_of_spread', 'spread_component', 'burning_index', 'flame_length'
]

# Min-max scale each fire column using the min/max of the filtered rows.
# A constant column yields NaN (0/0); the row-wise mean below skips NaN values.
for col in fire_cols:
    col_min, col_max = df[col].min(), df[col].max()
    df[col + '_norm'] = (df[col] - col_min) / (col_max - col_min)

# Composite = mean of the normalised columns, rescaled by 100 for the score.
df['fire_risk_composite'] = df[[col + '_norm' for col in fire_cols]].mean(axis=1)
df['fire_risk_score'] = df['fire_risk_composite'] * 100

# Wind speed is the magnitude of the two 10 m wind components.
df['wind_speed'] = np.sqrt(df['eastward_10m_wind']**2 + df['northward_10m_wind']**2)

df

In [None]:
# Model inputs: fuel moisture, weather and radiation columns (wind_speed is derived above).
predictor_cols = [
    'mean_wtd_moisture_1hr',
    'mean_wtd_moisture_10hr',
    'air_temperature_2m',
    'air_relative_humidity_2m',
    'wind_speed',
    'accumulated_precipitation_amount',
    'surface_downwelling_shortwave_flux',
]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Drop rows with missing values (or you can choose to impute them)
df_model = df.dropna(subset=predictor_cols + ['fire_risk_score'])

# Split data into features and target
X = df_model[predictor_cols]
y = df_model['fire_risk_score']

# Split BEFORE scaling so the scaler never sees the held-out rows; fitting it on
# all of X first would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, scale the predictors (important for some models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for any cell that still expects the fully scaled matrix (train statistics).
X_scaled = scaler.transform(X)

# Fit a regression model (here, a random forest regressor)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate performance (e.g., using RMSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

In [None]:
# Census TIGER 2020 archives: the PLACE file and the nationwide COUNTY file.
places_url = "https://www2.census.gov/geo/tiger/TIGER2020/PLACE/tl_2020_06_place.zip"
counties_url = "https://www2.census.gov/geo/tiger/TIGER2020/COUNTY/tl_2020_us_county.zip"

# Download both layers; places are reprojected to EPSG:4326 right away.
gdf_places = gpd.read_file(places_url).to_crs("EPSG:4326")
gdf_counties = gpd.read_file(counties_url)

# Select the county row(s) named 'San Diego' and reproject them as well.
is_san_diego = gdf_counties['NAME'] == 'San Diego'
gdf_sd_county = gdf_counties[is_san_diego].to_crs("EPSG:4326")

# Perform a spatial join: this will attach county info to places that intersect SD county
gdf_sd_places = gpd.sjoin(gdf_places, gdf_sd_county, how="inner", predicate="intersects")

gdf_sd_places

In [None]:
# Census TIGER 2020 county-subdivision (COUSUB) archive.
subdiv_url = "https://www2.census.gov/geo/tiger/TIGER2020/COUSUB/tl_2020_06_cousub.zip"
gdf_subdiv = gpd.read_file(subdiv_url)

# Keep only the subdivisions whose COUNTYFP code is '073'.
in_county_073 = gdf_subdiv['COUNTYFP'] == '073'
gdf_sd = gdf_subdiv.loc[in_county_073]

gdf_sd

In [None]:
import folium
import pandas as pd

# --- Plot your points ---
# The index of df_filtered is assumed to hold (lat, lon) tuples.
lats = df_filtered.index.map(lambda lat_lon: lat_lon[0])
lons = df_filtered.index.map(lambda lat_lon: lat_lon[1])
center_lat = pd.Series(lats).mean()
center_lon = pd.Series(lons).mean()

# Create the map centered on the mean point position.
m = folium.Map(location=[center_lat, center_lon], zoom_start=7)

# Every grid point becomes a small red circle added directly to the map.
marker_style = dict(radius=3, color='red', fill=True, fill_color='red', fill_opacity=0.7)
for lat, lon in zip(lats, lons):
    folium.CircleMarker(location=[lat, lon], **marker_style).add_to(m)


# --- Add Polygon Layers ---
def _polygon_style(fill_color, edge_color):
    """Return a folium style_function that gives every feature the same look."""
    def style_function(feature):
        return {
            "fillColor": fill_color,
            "color": edge_color,
            "weight": 2,
            "fillOpacity": 0.3,
        }
    return style_function


# Layers are added in this order: subdivisions first, incorporated places on top.
# "NAME_left" is the tooltip field used for the joined places frame.
polygon_layers = [
    (gdf_sd, "Subdivisions", "lightgreen", "green", "NAME", "Subdivision:"),
    (gdf_sd_places, "Incorporated Places", "lightblue", "blue", "NAME_left", "Place:"),
]
for layer_gdf, layer_name, fill_color, edge_color, tooltip_field, tooltip_alias in polygon_layers:
    folium.GeoJson(
        layer_gdf.to_json(),
        name=layer_name,
        style_function=_polygon_style(fill_color, edge_color),
        tooltip=folium.GeoJsonTooltip(fields=[tooltip_field], aliases=[tooltip_alias]),
    ).add_to(m)

# Add a layer control to toggle layers on/off
folium.LayerControl().add_to(m)

m

# Working with Cleaned_ENS

In [None]:
import glob

# Gather every CSV written for one month into a single frame.
csv_folder_path = "Cleaned_ENS/2022-08"
csv_files = glob.glob(os.path.join(csv_folder_path, "*.csv"))

df_list = [pd.read_csv(file) for file in csv_files]  # Adjust 'sep' if needed
combined_df = pd.concat(df_list, ignore_index=True)

# "lat_lon" is stored as text like "(lat, lon)": strip the parentheses, split on
# the comma, and convert both halves to numbers (unparseable values become NaN).
lat_lon_parts = combined_df['lat_lon'].str.strip('()').str.split(',', expand=True)
combined_df[['latitude', 'longitude']] = lat_lon_parts.apply(pd.to_numeric, errors='coerce')

combined_df = combined_df.drop(columns='lat_lon')

combined_df

In [None]:
import glob
import pandas as pd
import time
import os

def combine(folder_path):
    """
    Load every CSV found one level below ``folder_path`` into a single DataFrame.

    Each immediate sub-directory of ``folder_path`` is searched for ``*.csv`` files.
    Every file must contain a ``lat_lon`` column holding text such as
    ``"(32.5, -117.0)"``; that column is replaced by numeric ``latitude`` and
    ``longitude`` columns (unparseable values become NaN).

    Sub-directories and files are visited in sorted name order, so the row order of
    the result - and therefore any seeded train/validation split made from it -
    does not depend on the file system's listing order.

    Args:
        folder_path (str): Root folder containing one sub-folder per month.

    Returns:
        pandas.DataFrame: All rows from all files, re-indexed from 0.

    Raises:
        ValueError: If no CSV file is found in any sub-folder.
    """
    df_list = []

    # The context manager closes the directory iterator as soon as it is consumed.
    with os.scandir(folder_path) as entries:
        subfolders = sorted(entry.path for entry in entries if entry.is_dir())

    for subfolder in subfolders:
        start_time = time.time()
        for file in sorted(glob.glob(os.path.join(subfolder, "*.csv"))):
            temp_df = pd.read_csv(file)  # Adjust 'sep' if needed

            # Split "(lat, lon)" text into two numeric columns; whitespace around
            # each half is removed before conversion.
            coords = (
                temp_df['lat_lon']
                .str.strip('()')
                .str.split(',', expand=True)
                .apply(lambda col: pd.to_numeric(col.str.strip(), errors='coerce'))
            )
            temp_df[['latitude', 'longitude']] = coords

            df_list.append(temp_df.drop(columns='lat_lon'))
        print(f"Execution time: {time.time() - start_time:.4f} seconds")

    if not df_list:
        raise ValueError(f"No CSV files found in the sub-folders of {folder_path!r}.")

    return pd.concat(df_list, ignore_index=True)

In [None]:
# Build the full modelling table from every month folder under Cleaned_ENS.
total_data = combine('Cleaned_ENS')
total_data

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import time
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch_optimizer import Lamb, Ranger

class HumidityAttentionMLP(nn.Module):
    """Regressor in which humidity attends over the remaining input features.

    Humidity is embedded as the attention query; every other feature is embedded
    with one shared linear layer and used as both key and value. The attended
    vector passes through LayerNorm, a two-layer GELU MLP with dropout, a second
    LayerNorm and a final linear layer that emits one value per sample.

    Args:
        num_other_features: Accepted for interface compatibility; not used by any layer.
        embed_dim: Size of the humidity/feature embeddings and of the attention.
        num_heads: Number of attention heads.
        mlp_hidden: Width of the MLP that follows the attention.
        dropout: Dropout probability used inside the MLP.
    """

    def __init__(self, num_other_features=8, embed_dim=128, num_heads=8, mlp_hidden=512, dropout=0.2):
        super().__init__()
        # Layers are created in a fixed order; attribute names are the state_dict keys.
        self.humidity_embedding = nn.Linear(1, embed_dim)
        self.other_embedding = nn.Linear(1, embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        mlp_layers = [
            nn.Linear(embed_dim, mlp_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, mlp_hidden),  # Additional layer
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.fc1 = nn.Sequential(*mlp_layers)
        self.norm2 = nn.LayerNorm(mlp_hidden)
        self.fc2 = nn.Linear(mlp_hidden, 1)

    def forward(self, humidity, other_features):
        """Return ``(prediction, attention_weights)`` for a batch.

        For ``humidity`` of shape (batch, 1) and ``other_features`` of shape
        (batch, n) the prediction is (batch, 1) and the weights are (batch, 1, n).
        """
        # A trailing axis turns each scalar into a 1-element vector for nn.Linear.
        query = self.humidity_embedding(humidity.unsqueeze(-1))
        key_value = self.other_embedding(other_features.unsqueeze(-1))
        attended, attn_weights = self.attention(query, key_value, key_value)
        # Drop the length-1 query axis before the MLP.
        hidden = self.fc1(self.norm1(attended.squeeze(1)))
        return self.fc2(self.norm2(hidden)), attn_weights

# Define which columns will serve as the other features (key/value)
other_features_cols = [
    'mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr',
    'air_temperature_2m', 'wind_speed',
    'accumulated_precipitation_amount', 'surface_downwelling_shortwave_flux'
]

# Columns for latitude and longitude
geo_features_cols = ['latitude', 'longitude']

from sklearn.model_selection import train_test_split

# Split dataset into train (80%) and validation (20%)
train_data, val_data = train_test_split(total_data, test_size=0.2, random_state=42)

# Normalize input features
from sklearn.preprocessing import StandardScaler

# One scaler per quantity. Re-fitting a single scaler three times would overwrite
# the feature and humidity statistics, leaving nothing to normalise new data with.
feature_scaler = StandardScaler()
train_other = torch.tensor(feature_scaler.fit_transform(train_data[other_features_cols].values), dtype=torch.float32)
val_other = torch.tensor(feature_scaler.transform(val_data[other_features_cols].values), dtype=torch.float32)

# Keep latitude and longitude in their original form
train_geo = torch.tensor(train_data[geo_features_cols].values, dtype=torch.float32)
val_geo = torch.tensor(val_data[geo_features_cols].values, dtype=torch.float32)

# Combine scaled features with unscaled latitude and longitude (last two columns)
train_other = torch.cat([train_other, train_geo], dim=1)
val_other = torch.cat([val_other, val_geo], dim=1)

humidity_scaler = StandardScaler()
train_humidity = torch.tensor(humidity_scaler.fit_transform(train_data[['air_relative_humidity_2m']].values), dtype=torch.float32)
val_humidity = torch.tensor(humidity_scaler.transform(val_data[['air_relative_humidity_2m']].values), dtype=torch.float32)

# Normalize target labels
label_scaler = StandardScaler()
train_labels = torch.tensor(label_scaler.fit_transform(train_data[['fire_risk_score']].values), dtype=torch.float32)
val_labels = torch.tensor(label_scaler.transform(val_data[['fire_risk_score']].values), dtype=torch.float32)

# Later cells call scaler.inverse_transform(...) on model outputs, so `scaler`
# must keep referring to the label scaler.
scaler = label_scaler

train_other[0]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import time
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
from torch_optimizer import Lamb, Ranger

class HumidityAttentionMLP(nn.Module):
    """Regressor in which humidity attends over the remaining input features.

    Humidity is embedded as the attention query; every other feature is embedded
    with one shared linear layer and used as both key and value. The attended
    vector passes through LayerNorm, a two-layer GELU MLP with dropout, a second
    LayerNorm and a final linear layer that emits one value per sample.

    Args:
        num_other_features: Accepted for interface compatibility; not used by any layer.
        embed_dim: Size of the humidity/feature embeddings and of the attention.
        num_heads: Number of attention heads.
        mlp_hidden: Width of the MLP that follows the attention.
        dropout: Dropout probability used inside the MLP.
    """

    def __init__(self, num_other_features=8, embed_dim=128, num_heads=8, mlp_hidden=512, dropout=0.2):
        super().__init__()
        # Layers are created in a fixed order; attribute names are the state_dict keys.
        self.humidity_embedding = nn.Linear(1, embed_dim)
        self.other_embedding = nn.Linear(1, embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        mlp_layers = [
            nn.Linear(embed_dim, mlp_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, mlp_hidden),  # Additional layer
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.fc1 = nn.Sequential(*mlp_layers)
        self.norm2 = nn.LayerNorm(mlp_hidden)
        self.fc2 = nn.Linear(mlp_hidden, 1)

    def forward(self, humidity, other_features):
        """Return ``(prediction, attention_weights)`` for a batch.

        For ``humidity`` of shape (batch, 1) and ``other_features`` of shape
        (batch, n) the prediction is (batch, 1) and the weights are (batch, 1, n).
        """
        # A trailing axis turns each scalar into a 1-element vector for nn.Linear.
        query = self.humidity_embedding(humidity.unsqueeze(-1))
        key_value = self.other_embedding(other_features.unsqueeze(-1))
        attended, attn_weights = self.attention(query, key_value, key_value)
        # Drop the length-1 query axis before the MLP.
        hidden = self.fc1(self.norm1(attended.squeeze(1)))
        return self.fc2(self.norm2(hidden)), attn_weights

# Define which columns will serve as the other features (key/value)
other_features_cols = [
    'mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr',
    'air_temperature_2m', 'wind_speed',
    'accumulated_precipitation_amount', 'surface_downwelling_shortwave_flux'
]

# Columns for latitude and longitude
geo_features_cols = ['latitude', 'longitude']

from sklearn.model_selection import train_test_split

# Split dataset into train (80%) and validation (20%)
train_data, val_data = train_test_split(total_data, test_size=0.2, random_state=42)

# Normalize input features
from sklearn.preprocessing import StandardScaler

# One scaler per quantity. Re-fitting a single scaler three times would overwrite
# the feature and humidity statistics, leaving nothing to normalise new data with.
feature_scaler = StandardScaler()
train_other = torch.tensor(feature_scaler.fit_transform(train_data[other_features_cols].values), dtype=torch.float32)
val_other = torch.tensor(feature_scaler.transform(val_data[other_features_cols].values), dtype=torch.float32)

# Keep latitude and longitude in their original form
train_geo = torch.tensor(train_data[geo_features_cols].values, dtype=torch.float32)
val_geo = torch.tensor(val_data[geo_features_cols].values, dtype=torch.float32)

# Combine scaled features with unscaled latitude and longitude (last two columns)
train_other = torch.cat([train_other, train_geo], dim=1)
val_other = torch.cat([val_other, val_geo], dim=1)

humidity_scaler = StandardScaler()
train_humidity = torch.tensor(humidity_scaler.fit_transform(train_data[['air_relative_humidity_2m']].values), dtype=torch.float32)
val_humidity = torch.tensor(humidity_scaler.transform(val_data[['air_relative_humidity_2m']].values), dtype=torch.float32)

# Normalize target labels
label_scaler = StandardScaler()
train_labels = torch.tensor(label_scaler.fit_transform(train_data[['fire_risk_score']].values), dtype=torch.float32)
val_labels = torch.tensor(label_scaler.transform(val_data[['fire_risk_score']].values), dtype=torch.float32)

# Later cells call scaler.inverse_transform(...) on model outputs, so `scaler`
# must keep referring to the label scaler.
scaler = label_scaler


# Custom Dataset class
class FireRiskDataset(Dataset):
    """Dataset yielding aligned (humidity, other_features, label) triples.

    The three inputs are stored as given and indexed with the same ``idx``; the
    dataset length is the length of the label container.
    """

    def __init__(self, humidity_tensor, other_tensor, label_tensor):
        self.humidity, self.other_features, self.labels = humidity_tensor, other_tensor, label_tensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sources = (self.humidity, self.other_features, self.labels)
        return tuple(source[idx] for source in sources)

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling are
# repeatable from run to run (some GPU kernels may still be non-deterministic).
torch.manual_seed(42)

train_dataset = FireRiskDataset(train_humidity, train_other, train_labels)
val_dataset = FireRiskDataset(val_humidity, val_other, val_labels)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# Verify tensor shapes
print("Humidity tensor shape:", train_humidity.shape)  # (num_samples, 1)
print("Other features tensor shape:", train_other.shape)  # (num_samples, 8): 6 scaled features + lat/lon
print("Labels tensor shape:", train_labels.shape)  # (num_samples, 1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Initialize the model and move it to the selected device
model = HumidityAttentionMLP()
model.to(device)

# Define loss function, optimizer and learning-rate schedule
criterion = nn.HuberLoss()
optimizer = Lamb(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)

num_epochs = 20
train_losses = []
val_losses = []

# Early stopping: stop after `patience` epochs without a new best validation loss
best_val_loss = float('inf')
patience = 5
epochs_without_improvement = 0

import copy
import time

# Snapshot of the weights from the epoch with the lowest validation loss so far.
best_state_dict = None

for epoch in range(num_epochs):
    # Start timer for the entire epoch
    epoch_start_time = time.time()

    # Initialize point counter and timer for the 1,000,000 point intervals
    point_counter = 0
    point_timer = time.time()

    model.train()
    total_train_loss = 0.0

    for batch_humidity, batch_other, batch_labels in train_loader:
        # Move data to device
        batch_humidity = batch_humidity.to(device)
        batch_other = batch_other.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs, _ = model(batch_humidity, batch_other)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        # Cap the gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is per sample.
        total_train_loss += loss.item() * batch_labels.size(0)

        # Update counter with number of points in the current batch
        point_counter += batch_labels.size(0)

        # Check if we've processed 1,000,000 points
        if point_counter >= 1_000_000:
            points_elapsed_time = time.time() - point_timer
            print(f"Processed 1,000,000 points in {points_elapsed_time:.2f} seconds")
            # Reset the point timer and counter for the next interval
            point_timer = time.time()
            point_counter = 0

    epoch_train_loss = total_train_loss / len(train_dataset)
    train_losses.append(epoch_train_loss)

    # Evaluate on validation set
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch_humidity, batch_other, batch_labels in val_loader:
            batch_humidity = batch_humidity.to(device)
            batch_other = batch_other.to(device)
            batch_labels = batch_labels.to(device)
            outputs, _ = model(batch_humidity, batch_other)
            loss = criterion(outputs, batch_labels)
            total_val_loss += loss.item() * batch_labels.size(0)

    epoch_val_loss = total_val_loss / len(val_dataset)
    val_losses.append(epoch_val_loss)

    # Check for early stopping, remembering the best weights seen so far
    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        epochs_without_improvement = 0
        # deepcopy: state_dict() returns references that later steps would overwrite.
        best_state_dict = copy.deepcopy(model.state_dict())
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print("Early stopping triggered!")
        break

    # Step the scheduler after each epoch
    scheduler.step()

    # End of epoch: calculate and print total epoch time
    epoch_elapsed_time = time.time() - epoch_start_time
    print(f"Epoch {epoch + 1} finished in {epoch_elapsed_time:.2f} seconds")

# Leave `model` holding the best-validation weights rather than the last epoch's,
# so later evaluation and torch.save use the model early stopping selected.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)

In [None]:
# Per-epoch validation losses collected by the training loop.
print(val_losses)

In [None]:
import torch

# Set the model to evaluation mode
model.eval()

# Select a single test sample (modify the index as needed). The index is clamped
# to the last row so a smaller validation set does not raise IndexError.
test_idx = min(865356, len(val_dataset) - 1)
test_humidity, test_other, test_label = val_dataset[test_idx]

# Move data to the correct device
test_humidity = test_humidity.unsqueeze(0).to(device)  # Add batch dimension
test_other = test_other.unsqueeze(0).to(device)  # Add batch dimension
test_label = test_label.unsqueeze(0).to(device)

# Disable gradient calculations for inference
with torch.no_grad():
    output, attn_weights = model(test_humidity, test_other)

# Convert output and label to CPU NumPy arrays
output_np = output.cpu().numpy()
test_label_np = test_label.cpu().numpy()

# Undo the label normalisation. This relies on `scaler` having been fitted on
# 'fire_risk_score' most recently (its last fit in the data-preparation cell).
output_original = scaler.inverse_transform(output_np)  # Convert output back to original scale
test_label_original = scaler.inverse_transform(test_label_np)  # Convert label back to original scale

# Latitude and longitude are the last two (unscaled) feature columns
test_geo_original = test_other[:, -2:].cpu().numpy()  # Extract latitude and longitude
test_other_original = test_other.cpu().numpy()

# Print the results
print(f"Predicted fire risk score (original scale): {output_original}")
print(f"Actual fire risk score (original scale): {test_label_original}")
print(f"Latitude and Longitude (original scale): {test_geo_original}")
test_other_original

In [None]:
# Save only the model's parameters (state_dict) to the working directory.
# Note: the commented-out load call later in this notebook points at
# 'models/humidity_attention_mlp.pth', which is a different path.
torch.save(model.state_dict(), 'humidity_attention_mlp.pth')

# Model Prediction

In [None]:
# Convert the file to be predicted on; the date in its name places the CSV under Cleaned_ENS/2025-02/.
convert_nc_to_csv('dfmnfdrs_202502281200Z.nc')

In [None]:
# Inspect the CSV written for 2025-02-28. Note that this rebinds `df`, which
# earlier cells used for the engineered training frame.
df = pd.read_csv('Cleaned_ENS/2025-02/2025-02-28.csv')
df

In [None]:
import glob

# Gather every CSV written for the prediction month into a single frame.
csv_folder_path = "Cleaned_ENS/2025-02"
csv_files = glob.glob(os.path.join(csv_folder_path, "*.csv"))

df_list = [pd.read_csv(file) for file in csv_files]  # Adjust 'sep' if needed
combined_df = pd.concat(df_list, ignore_index=True)

# "lat_lon" is stored as text like "(lat, lon)": strip the parentheses, split on
# the comma, and convert both halves to numbers (unparseable values become NaN).
lat_lon_parts = combined_df['lat_lon'].str.strip('()').str.split(',', expand=True)
combined_df[['latitude', 'longitude']] = lat_lon_parts.apply(pd.to_numeric, errors='coerce')

combined_df = combined_df.drop(columns='lat_lon')

# The prediction cells below read from `predict_df`.
predict_df = combined_df

predict_df

In [None]:
other_features_cols = [
    'mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr',
    'air_temperature_2m', 'wind_speed',
    'accumulated_precipitation_amount', 'surface_downwelling_shortwave_flux'
]

# Columns for latitude and longitude
geo_features_cols = ['latitude', 'longitude']

from sklearn.preprocessing import StandardScaler

# Normalise prediction inputs with the TRAINING statistics. Calling fit_transform
# on predict_df would rescale the inputs with their own mean/std and re-fit the
# label scaler on the prediction labels, so the model would see inputs on a
# different scale than it was trained on. Requires `train_data` from the training cell.
feature_scaler = StandardScaler().fit(train_data[other_features_cols].values)
humidity_scaler = StandardScaler().fit(train_data[['air_relative_humidity_2m']].values)
label_scaler = StandardScaler().fit(train_data[['fire_risk_score']].values)
# Later cells call scaler.inverse_transform(...) on model outputs.
scaler = label_scaler

predict_other = torch.tensor(feature_scaler.transform(predict_df[other_features_cols].values), dtype=torch.float32)
# Append unscaled latitude/longitude as the last two columns, as done for training.
predict_geo = torch.tensor(predict_df[geo_features_cols].values, dtype=torch.float32)
predict_other = torch.cat([predict_other, predict_geo], dim=1)

predict_humidity = torch.tensor(humidity_scaler.transform(predict_df[['air_relative_humidity_2m']].values), dtype=torch.float32)

predict_labels = torch.tensor(label_scaler.transform(predict_df[['fire_risk_score']].values), dtype=torch.float32)

predict_labels[0]

In [None]:
class FireRiskDataset(Dataset):
    """Dataset yielding aligned (humidity, other_features, label) triples.

    The three inputs are stored as given and indexed with the same ``idx``; the
    dataset length is the length of the label container.
    """

    def __init__(self, humidity_tensor, other_tensor, label_tensor):
        self.humidity, self.other_features, self.labels = humidity_tensor, other_tensor, label_tensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sources = (self.humidity, self.other_features, self.labels)
        return tuple(source[idx] for source in sources)

In [None]:
# Wrap the prediction tensors so rows can be fetched as (humidity, other_features, label).
predict_dataset = FireRiskDataset(predict_humidity, predict_other, predict_labels)

In [None]:
import joblib

other_features_cols = [
    'mean_wtd_moisture_1hr', 'mean_wtd_moisture_10hr',
    'air_temperature_2m', 'wind_speed',
    'accumulated_precipitation_amount', 'surface_downwelling_shortwave_flux'
]

# Columns for latitude and longitude
geo_features_cols = ['latitude', 'longitude']

# Normalise prediction inputs with the TRAINING statistics. Calling fit_transform
# on predict_df would rescale the inputs with their own mean/std and re-fit the
# label scaler on the prediction labels, so the model would see inputs on a
# different scale than it was trained on. Requires `train_data` from the training cell.
# type(scaler) is used so this cell needs no extra import; it yields the same
# scaler class the training cells created.
feature_scaler = type(scaler)().fit(train_data[other_features_cols].values)
humidity_scaler = type(scaler)().fit(train_data[['air_relative_humidity_2m']].values)
label_scaler = type(scaler)().fit(train_data[['fire_risk_score']].values)
# Later code calls scaler.inverse_transform(...) on model outputs.
scaler = label_scaler

predict_other = torch.tensor(feature_scaler.transform(predict_df[other_features_cols].values), dtype=torch.float32)
# Unscaled latitude/longitude are appended as the last two columns.
predict_geo = torch.tensor(predict_df[geo_features_cols].values, dtype=torch.float32)
predict_other = torch.cat([predict_other, predict_geo], dim=1)

predict_humidity = torch.tensor(humidity_scaler.transform(predict_df[['air_relative_humidity_2m']].values), dtype=torch.float32)

predict_labels = torch.tensor(label_scaler.transform(predict_df[['fire_risk_score']].values), dtype=torch.float32)

class FireRiskDataset(Dataset):
    """Dataset yielding aligned (humidity, other_features, label) triples.

    The three inputs are stored as given and indexed with the same ``idx``; the
    dataset length is the length of the label container.
    """

    def __init__(self, humidity_tensor, other_tensor, label_tensor):
        self.humidity, self.other_features, self.labels = humidity_tensor, other_tensor, label_tensor

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sources = (self.humidity, self.other_features, self.labels)
        return tuple(source[idx] for source in sources)

# Rebuild the prediction dataset from the tensors prepared above (features now include lat/lon).
predict_dataset = FireRiskDataset(predict_humidity, predict_other, predict_labels)

class HumidityAttentionMLP(nn.Module):
    """Regressor in which humidity attends over the remaining input features.

    Humidity is embedded as the attention query; every other feature is embedded
    with one shared linear layer and used as both key and value. The attended
    vector passes through LayerNorm, a two-layer GELU MLP with dropout, a second
    LayerNorm and a final linear layer that emits one value per sample.

    Args:
        num_other_features: Accepted for interface compatibility; not used by any layer.
        embed_dim: Size of the humidity/feature embeddings and of the attention.
        num_heads: Number of attention heads.
        mlp_hidden: Width of the MLP that follows the attention.
        dropout: Dropout probability used inside the MLP.
    """

    def __init__(self, num_other_features=8, embed_dim=128, num_heads=8, mlp_hidden=512, dropout=0.2):
        super().__init__()
        # Layers are created in a fixed order; attribute names are the state_dict keys.
        self.humidity_embedding = nn.Linear(1, embed_dim)
        self.other_embedding = nn.Linear(1, embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        mlp_layers = [
            nn.Linear(embed_dim, mlp_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_hidden, mlp_hidden),  # Additional layer
            nn.GELU(),
            nn.Dropout(dropout),
        ]
        self.fc1 = nn.Sequential(*mlp_layers)
        self.norm2 = nn.LayerNorm(mlp_hidden)
        self.fc2 = nn.Linear(mlp_hidden, 1)

    def forward(self, humidity, other_features):
        """Return ``(prediction, attention_weights)`` for a batch.

        For ``humidity`` of shape (batch, 1) and ``other_features`` of shape
        (batch, n) the prediction is (batch, 1) and the weights are (batch, 1, n).
        """
        # A trailing axis turns each scalar into a 1-element vector for nn.Linear.
        query = self.humidity_embedding(humidity.unsqueeze(-1))
        key_value = self.other_embedding(other_features.unsqueeze(-1))
        attended, attn_weights = self.attention(query, key_value, key_value)
        # Drop the length-1 query axis before the MLP.
        hidden = self.fc1(self.norm1(attended.squeeze(1)))
        return self.fc2(self.norm2(hidden)), attn_weights

# Load the saved state dictionary
#model.load_state_dict(torch.load('models/humidity_attention_mlp.pth'))

# Move the model to the appropriate device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set the model to evaluation mode
model.eval()

# Pick one row. Separate `sample_*` names keep the full predict_humidity /
# predict_other tensors intact for any cell that rebuilds the dataset from them.
predict_idx = 0
sample_humidity, sample_other, sample_label = predict_dataset[predict_idx]

# Add a batch dimension and move the sample to the correct device
sample_humidity = sample_humidity.unsqueeze(0).to(device)
sample_other = sample_other.unsqueeze(0).to(device)
sample_label = sample_label.unsqueeze(0).to(device)

# Disable gradient calculations for inference
with torch.no_grad():
    predict, attn_weights = model(sample_humidity, sample_other)

# Show this cell's raw (normalised) prediction, not a value left over from another cell.
print(predict)

# Convert output and label to CPU NumPy arrays
predict_np = predict.cpu().numpy()
predict_label_np = sample_label.cpu().numpy()

# Undo the label normalisation.
# NOTE(review): this assumes `scaler` was last fitted on 'fire_risk_score'.
predict_original = scaler.inverse_transform(predict_np)  # Convert output back to original scale
predict_label_original = scaler.inverse_transform(predict_label_np)  # Convert label back to original scale

# Latitude and longitude are the last two (unscaled) feature columns
predict_geo_original = sample_other[:, -2:].cpu().numpy()

# Print the results
print(f"Predicted fire risk score (original scale): {predict_original}")
print(f"Actual fire risk score (original scale): {predict_label_original}")
print(f"Latitude and Longitude (original scale): {predict_geo_original}")

predict_original

In [None]:
import pandas as pd

# Ensure the model is in evaluation mode
model.eval()

# Run inference in batches rather than one forward pass per row; shuffle=False
# keeps the output rows in the same order as predict_dataset.
predict_loader = DataLoader(predict_dataset, batch_size=1024, shuffle=False)

pred_batches = []
geo_batches = []

# Disable gradient calculations for inference
with torch.no_grad():
    for batch_humidity, batch_other, _ in predict_loader:
        batch_pred, _ = model(batch_humidity.to(device), batch_other.to(device))
        pred_batches.append(batch_pred.cpu().numpy())
        # Latitude and longitude are assumed to be the last two feature columns.
        geo_batches.append(batch_other[:, -2:].cpu().numpy())

predictions_scaled = np.concatenate(pred_batches, axis=0)
geo = np.concatenate(geo_batches, axis=0).astype(np.float64)

# Undo the label normalisation.
# NOTE(review): this assumes `scaler` was last fitted on 'fire_risk_score'.
fire_risk_pred = scaler.inverse_transform(predictions_scaled)[:, 0].astype(np.float64)

# One row per point: [latitude, longitude, predicted fire risk]
df_predictions = pd.DataFrame({
    'Latitude': geo[:, 0],
    'Longitude': geo[:, 1],
    'Predicted_Fire_Risk': fire_risk_pred,
})

# Save the DataFrame to a CSV file without the index column
df_predictions.to_csv('predictions.csv', index=False)

# The file holds predictions only; actual labels are not written.
print("CSV file 'predictions.csv' has been created with predicted fire risk scores.")