In [None]:
import warnings

import numpy as np
import pandas as pd
import networkx as nx

import geopandas as gpd
from shapely.geometry import Point

import folium
from folium.plugins import HeatMap

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides pandas/sklearn warnings raised by later cells; consider narrowing it.
warnings.filterwarnings("ignore")

## **This notebook analyzes the following variables: Conductor Material, Type, Age, and Work Order Date**

Uses the same base as 'living_life_code.ipynb' by Judy

## 1. Preprocess the datasets

In [None]:
# vri_df = pd.read_csv('data/src_vri_snapshot_2024_03_20.csv')

# Raw inputs, listed in the same order as the names they are unpacked into below.
csv_sources = (
    'data/dev_wings_agg_span_2024_01_01.csv',
    'data/gis_weatherstation_shape_2024_10_04.csv',
    'data/src_wings_meteorology_station_summary_snapshot_2023_08_02.csv',
    'data/src_wings_meteorology_windspeed_snapshot_2023_08_02.csv',
)
(
    span_df,
    gis_2024_1004,
    station_summary_2023_08_02,
    windspeed_2023_08_02,
) = map(pd.read_csv, csv_sources)

In [None]:
# Left join: keep every station-summary row and attach the GIS record whose
# 'weatherstationcode' equals the summary's 'station'.
merged_df = station_summary_2023_08_02.merge(
    gis_2024_1004,
    how='left',
    left_on='station',
    right_on='weatherstationcode',
)
merged_df.head()

In [None]:
# Non-null entry count of every column, per station.
readings_by_station = windspeed_2023_08_02.groupby('station')
windspeed_grouped_count = readings_by_station.count()
windspeed_grouped_count.head()

In [None]:
# Station codes as a standalone array (copied out of the GIS frame).
station_codes = gis_2024_1004['weatherstationcode'].to_numpy(copy=True)

# GIS rows on the left, station summary attached where a matching 'station' exists.
merged_station_df = pd.merge(
    gis_2024_1004,
    station_summary_2023_08_02,
    how='left',
    left_on='weatherstationcode',
    right_on='station',
)


In [None]:
# For every GIS weather station, collect its wind-speed readings and compute the
# percentage of readings that reach the station's 'alert' threshold.
# Each record is [station, windspeeds, threshold, count, mean, probability (%)].
prob_lst = []

for station in station_codes:
    station_windspeeds = windspeed_2023_08_02.loc[
        windspeed_2023_08_02['station'] == station, 'wind_speed'
    ].to_numpy()

    # merged_df is a left join starting from the station summary, so a station
    # that is absent from the summary has no row here at all.
    station_alerts = merged_df.loc[merged_df['weatherstationcode'] == station, 'alert']
    has_threshold = not station_alerts.empty
    # Always (re)assign the threshold so a station without one never inherits the
    # value left over from the previous iteration.
    threshold = station_alerts.iloc[0] if has_threshold else np.nan

    mean = np.nanmean(station_windspeeds)
    if has_threshold:
        # NaN readings compare False, i.e. they count as "below threshold".
        # NOTE(review): a NaN threshold also compares False for every reading and
        # therefore yields 0 % rather than NaN — confirm this is intended.
        prob = np.mean(station_windspeeds >= threshold) * 100
    else:
        prob = np.nan
    count = np.count_nonzero(~np.isnan(station_windspeeds))
    prob_lst.append([station, station_windspeeds, threshold, count, mean, prob])

In [None]:
# One row per station; column names follow the order of each record in prob_lst.
prob_columns = ['station', 'windspeeds', 'threshold', 'count', 'mean', 'probability (%)']
prob_df = pd.DataFrame(prob_lst, columns=prob_columns)
prob_df.head()

In [None]:
def _wkt_frame_to_wgs84(frame, source_epsg):
    """Parse frame['shape'] from WKT (in place) and return it as a GeoDataFrame in EPSG:4326.

    The parsed geometries are tagged with `source_epsg` before being reprojected.
    """
    frame['shape'] = gpd.GeoSeries.from_wkt(frame['shape'])
    tagged = gpd.GeoDataFrame(frame, geometry='shape').set_crs(epsg=source_epsg)
    return tagged.to_crs(epsg=4326)


gis_gdf = _wkt_frame_to_wgs84(gis_2024_1004, source_epsg=4431)

# vri_df['shape'] = gpd.GeoSeries.from_wkt(vri_df['shape'])
# vri_gdf = gpd.GeoDataFrame(vri_df, geometry='shape').set_crs(epsg=4326)

span_gdf = _wkt_frame_to_wgs84(span_df, source_epsg=2230)

In [None]:
# The SRID column is dropped from both geo frames.
gis_gdf = gis_gdf.drop('shape_srid', axis=1)
# vri_gdf = vri_gdf.drop(columns=['shape_srid'])
span_gdf = span_gdf.drop('shape_srid', axis=1)


In [None]:
# gis_vri_merge = gis_gdf.merge(vri_gdf, left_on='weatherstationcode', right_on='anemometercode')
# vri_gdf['centroid'] = vri_gdf['shape'].centroid
# vri_gis_sjoin = vri_gdf.sjoin(gis_gdf, how='inner')

In [None]:
# Step 1: attach the per-station exceedance statistics to the GIS stations.
stations_with_prob = gis_gdf.merge(
    prob_df,
    left_on='weatherstationcode',
    right_on='station',
)
# Step 2: attach the station summary; both merges use the default inner join.
prob_merge = stations_with_prob.merge(
    station_summary_2023_08_02,
    left_on='weatherstationcode',
    right_on='station',
)
prob_merge

In [None]:
# vri_mapping = {'H': 2, 'M': 1, 'L': 0}
# prob_merge['vri_numeric'] = prob_merge['vri'].map(vri_mapping)

In [None]:
# prob_merge.head()

In [None]:
# Spans without a station code cannot be matched to a weather station.
has_station = span_df['station'].notna()
span_df_cleaned = span_df[has_station]

span_prob_merge_df = prob_merge.merge(
    span_df_cleaned,
    left_on='weatherstationcode',
    right_on='station',
)
span_prob_merge_df

In [None]:
# Print the full column list (a plain list is not truncated the way an Index repr is).
print(span_prob_merge_df.columns.tolist())

In [None]:
# Target ('probability (%)') plus the span / structure attributes used in this notebook.
columns_to_keep = [
    'probability (%)',
    'hardened_state', 'miles',
    'upstream_struct_age', 'upstream_struct_hftd', 'upstream_struct_material', 'upstream_struct_type', 'upstream_struct_workorderdate',
    'downstream_struct_age', 'downstream_struct_hftd', 'downstream_struct_material', 'downstream_struct_type', 'downstream_struct_workorderdate',
    'wire_risk',
]

# .copy() makes df_filtered an independent frame, so the column assignments in the
# following cells do not operate on a slice of span_prob_merge_df.
df_filtered = span_prob_merge_df[columns_to_keep].copy()

df_filtered.shape

In [None]:
# Convert both 'workorderdate' columns from str to datetime: the literal string
# 'NaN' is first turned into a real missing value, then the column is parsed as YYYY-MM-DD.
for date_col in ('upstream_struct_workorderdate', 'downstream_struct_workorderdate'):
    cleaned_dates = df_filtered[date_col].replace('NaN', np.nan)
    df_filtered[date_col] = pd.to_datetime(cleaned_dates, format='%Y-%m-%d')

In [None]:
# Calculate the whole days elapsed since each structure's work order date.
# A single reference timestamp is used so both columns are measured from the same instant.
# NOTE: the result depends on the day the notebook is run.
reference_time = pd.Timestamp.now()
upstream_workorder_days = reference_time - df_filtered['upstream_struct_workorderdate']
downstream_workorder_days = reference_time - df_filtered['downstream_struct_workorderdate']

# .dt.days is the vectorised day count (missing dates stay missing); assign() returns
# a new frame, and the original date columns are dropped once the derived ones exist.
df_filtered = df_filtered.assign(
    days_since_upstream_workorder=upstream_workorder_days.dt.days,
    days_since_downstream_workorder=downstream_workorder_days.dt.days,
).drop(columns=['upstream_struct_workorderdate', 'downstream_struct_workorderdate'])

In [None]:
# One-hot encode the categorical columns and append the indicator columns to the
# remaining columns of df_filtered.
categorical_columns = ['hardened_state', 'wire_risk',
                       'upstream_struct_type', 'downstream_struct_type', 'downstream_struct_material', 'upstream_struct_material']

encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(df_filtered[categorical_columns])

# Reuse df_filtered's index: pd.concat(axis=1) aligns on index labels, so a fresh
# 0..n-1 index would misalign rows whenever df_filtered is not 0..n-1 itself.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df_filtered.index,
)

df_encoded = pd.concat([df_filtered.drop(columns=categorical_columns), one_hot_df], axis=1)

In [None]:
# Display the encoded table: the non-categorical df_filtered columns plus the one-hot indicator columns.
df_encoded

In [None]:
# List the column names of the encoded table, including those generated by the encoder.
df_encoded.columns

## 2. EDA

In [None]:
# Histogram (30 bins, KDE overlay) of the target column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df_filtered['probability (%)'], bins=30, kde=True, color='blue', ax=ax)
ax.set_title("Distribution of Wildfire Probability (%)")
ax.set_xlabel("Probability (%)")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Bar plot of the target by upstream structure type, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.barplot(x='upstream_struct_type', y='probability (%)', data=df_filtered, ax=ax)

In [None]:
# Bar plot of the target by downstream structure type, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.barplot(x='downstream_struct_type', y='probability (%)', data=df_filtered, ax=ax)

In [None]:
# Distinct upstream materials in order of first appearance (not alphabetically sorted).
upstream_materials = df_filtered['upstream_struct_material']
sorted_material = pd.unique(upstream_materials)
sorted_material

In [None]:
# Bar plot of the target by upstream structure material, bars ordered as in sorted_material.
fig, ax = plt.subplots()
sns.barplot(
    x='upstream_struct_material',
    y='probability (%)',
    order=sorted_material,
    data=df_filtered,
    ax=ax,
)

In [None]:
# Bar plot of the target by downstream structure material.
# The bar order is the non-null union of upstream and downstream materials (upstream
# values first, in first-seen order), so materials that occur only downstream are
# not silently left out of the plot.
material_order = pd.concat(
    [df_filtered['upstream_struct_material'], df_filtered['downstream_struct_material']]
).dropna().unique().tolist()
sns.barplot(data=df_filtered, x='downstream_struct_material', y='probability (%)', order=material_order)

In [None]:
# 2x2 grid: structure type (left column) and material (right column) against the
# target, upstream on the top row and downstream on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))


def _shared_order(upstream_col, downstream_col):
    """Return the non-null categories seen in either column, in first-seen order."""
    both = pd.concat([df_filtered[upstream_col], df_filtered[downstream_col]])
    return both.dropna().unique().tolist()


# Upstream and downstream panels share one category order so the rows are directly
# comparable, and categories that occur only downstream are still drawn.
sorted_type = _shared_order('upstream_struct_type', 'downstream_struct_type')
sorted_material = _shared_order('upstream_struct_material', 'downstream_struct_material')

bar_panels = [
    (axes[0, 0], 'upstream_struct_type', sorted_type, 'blue',
     "Upstream Structure Type vs Wildfire Probability"),
    (axes[1, 0], 'downstream_struct_type', sorted_type, 'red',
     "Downstream Struct Type vs Wildfire Probability"),
    (axes[0, 1], 'upstream_struct_material', sorted_material, 'blue',
     "Upstream Structure Material vs Wildfire Probability"),
    (axes[1, 1], 'downstream_struct_material', sorted_material, 'red',
     "Downstream Struct Material vs Wildfire Probability"),
]
for panel_ax, column, category_order, bar_color, panel_title in bar_panels:
    sns.barplot(data=df_filtered, x=column, y='probability (%)',
                order=category_order, ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# 2x2 grid of scatter plots: one feature per panel against the target.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (grid position, x column, marker colour, panel title), in drawing order
scatter_panels = [
    ((0, 0), 'wire_risk', 'orange', "Wire Risk vs Wildfire Probability"),
    ((0, 1), 'upstream_struct_age', 'blue', "Upstream Struct Age vs Wildfire Probability"),
    ((1, 0), 'miles', 'green', "Miles vs Wildfire Probability"),
    ((1, 1), 'downstream_struct_age', 'red', "Downstream Struct Age vs Wildfire Probability"),
]
for position, feature, marker_color, panel_title in scatter_panels:
    panel_ax = axes[position]
    sns.scatterplot(data=df_filtered, x=feature, y='probability (%)', ax=panel_ax, color=marker_color)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 3. ML models

In [None]:
# Build the feature matrix / target, split into train and test, then impute and scale.
X = df_encoded.drop(columns=['probability (%)'])
y = df_encoded['probability (%)']

# Rows without a target value cannot be used for fitting or scoring.
has_target = y.notna()
X, y = X.loc[has_target], y.loc[has_target]

# Split BEFORE imputing so the imputation means are learned from training rows only
# (fitting the imputer on all rows would leak test-set statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train_raw)


def _impute(features):
    """Apply the train-fitted imputer and return a DataFrame with the same row index.

    Column names are taken from the imputer so they always match its output.
    """
    return pd.DataFrame(
        imputer.transform(features),
        columns=imputer.get_feature_names_out(),
        index=features.index,
    )


X_train = _impute(X_train_raw)
X_test = _impute(X_test_raw)
# Full imputed matrix (filled with the training means), kept under its original name.
X_imputed = _impute(X)

# The scaler is likewise fit on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate regressors, keyed by the label shown in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=25, random_state=30),
    # "Support Vector Regressor": SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1),
    # "MLP Regressor": MLPRegressor(hidden_layer_sizes=(50, 50), learning_rate_init=0.01, max_iter=300, random_state=42)
}

# These models are trained on the unscaled features; any other model gets the scaled ones.
unscaled_models = ("Random Forest", "Linear Regression")

results = []
for name, model in models.items():
    use_raw = name in unscaled_models
    train_features = X_train if use_raw else X_train_scaled
    test_features = X_test if use_raw else X_test_scaled

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)

    mae, mse, r2 = (
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )
    results.append({"Model": name, "MAE": mae, "MSE": mse, "R² Score": r2})

results_df = pd.DataFrame(results)
results_df

In [None]:
# Sweep the random forest's max_depth and record the test-set metrics for each value.
# NOTE: this reuses (and overwrites) `results` / `results_df` from the model comparison above.
results = []
for depth in np.arange(1, 50, 5):
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=int(depth), random_state=30)
    rf_model.fit(X_train_scaled, y_train)
    y_pred = rf_model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Label each row explicitly and keep the depth that produced it, instead of
    # relying on a loop variable left over from another cell.
    results.append({"Model": "Random Forest", "max_depth": int(depth), "MAE": mae, "MSE": mse, "R² Score": r2})

results_df = pd.DataFrame(results)
results_df

In [None]:
# Values produced by this range: 1, 6, 11, ..., 46 (the same call drives the max_depth sweep in the previous cell).
np.arange(1, 50, 5)

In [None]:
# svr_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
# svr_model.fit(X_train_scaled, y_train)
# y_pred = svr_model.predict(X_test_scaled)
# mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print('SVR model')
# print(f'MAE: {mae}, MSE: {mse}, R2: {r2}')

## Using the fire index from `ens_gfs` data

In [None]:
# Inspect which columns are currently available in df_filtered.
df_filtered.columns