In [2]:
import numpy as np
import pandas as pd
import networkx as nx

import geopandas as gpd
from shapely.geometry import Point

import folium
from folium.plugins import HeatMap

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.impute import SimpleImputer


import warnings
# Silence every Python warning for the rest of the session. This also hides
# pandas/geopandas/sklearn warnings raised by the cells below, so comment this
# line out when debugging unexpected results.
warnings.filterwarnings("ignore")



In [3]:
# --- Load raw inputs ---------------------------------------------------------
# All paths are relative to the notebook's working directory.
vri_df = pd.read_csv('src_vri_snapshot_2024_03_20.csv')
span_df = pd.read_csv('dev_wings_agg_span_2024_01_01.csv')
gis_2024_1004 = pd.read_csv('gis_weatherstation_shape_2024_10_04.csv')
station_summary_2023_08_02 = pd.read_csv('src_wings_meteorology_station_summary_snapshot_2023_08_02.csv')
windspeed_2023_08_02 = pd.read_csv('src_wings_meteorology_windspeed_snapshot_2023_08_02.csv')

# Station summary enriched with GIS attributes. The left join keeps every
# station-summary row, including those with no matching GIS station code.
merged_df = pd.merge(
    station_summary_2023_08_02,
    gis_2024_1004,
    left_on='station',
    right_on='weatherstationcode',
    how='left',
)

# Non-null entry count per column for each station in the wind-speed snapshot.
windspeed_grouped_count = windspeed_2023_08_02.groupby(by='station').count()

# Station codes known to the GIS layer; these drive the per-station loop below.
station_codes = np.array(gis_2024_1004['weatherstationcode'])

# The reverse join: every GIS station, with summary columns where available.
merged_station_df = gis_2024_1004.merge(
    station_summary_2023_08_02,
    left_on='weatherstationcode',
    right_on='station',
    how='left',
)

# --- Per-station exceedance probability --------------------------------------
# For every GIS station, compute the percentage of wind-speed readings that are
# at or above the station's "alert" threshold, together with the reading count
# and the mean wind speed.
prob_lst = []

for station in station_codes:
    station_windspeeds = np.array(
        windspeed_2023_08_02.loc[windspeed_2023_08_02['station'] == station, 'wind_speed']
    )
    station_alerts = merged_df.loc[merged_df['weatherstationcode'] == station, 'alert']

    if station_alerts.empty:
        # The station has no row in the summary join. Record an explicit NaN
        # threshold instead of silently reusing the previous station's value.
        threshold = np.nan
        prob = np.nan
    else:
        threshold = station_alerts.iloc[0]
        # NaN readings compare False, so they count as "below threshold" while
        # still contributing to the denominator.
        # NOTE(review): a NaN alert value also compares False for every reading
        # and therefore yields 0 % rather than NaN -- confirm this is intended.
        prob = np.mean(station_windspeeds >= threshold) * 100

    mean = np.nanmean(station_windspeeds)
    count = np.count_nonzero(~np.isnan(station_windspeeds))
    prob_lst.append([station, station_windspeeds, threshold, count, mean, prob])


# Passing the column names to the constructor also works when prob_lst is empty.
prob_df = pd.DataFrame(
    prob_lst,
    columns=['station', 'windspeeds', 'threshold', 'count', 'mean', 'probability (%)'],
)

# --- Geometry parsing and spatial joins --------------------------------------
# Parse the WKT text in each raw frame's 'shape' column into geometry objects.
# The raw frames are updated in place.
for wkt_frame in (gis_2024_1004, vri_df, span_df):
    wkt_frame['shape'] = gpd.GeoSeries.from_wkt(wkt_frame['shape'])

# Wrap each frame as a GeoDataFrame, declare its source CRS, reproject to
# EPSG:4326 where the source differs, and drop the 'shape_srid' column.
gis_gdf = (
    gpd.GeoDataFrame(gis_2024_1004, geometry='shape')
    .set_crs(epsg=4431)
    .to_crs(epsg=4326)
    .drop(columns=['shape_srid'])
)
vri_gdf = (
    gpd.GeoDataFrame(vri_df, geometry='shape')
    .set_crs(epsg=4326)
    .drop(columns=['shape_srid'])
)
span_gdf = (
    gpd.GeoDataFrame(span_df, geometry='shape')
    .set_crs(epsg=2230)
    .to_crs(epsg=4326)
    .drop(columns=['shape_srid'])
)

# Attribute join of stations to VRI rows via the anemometer code. This runs
# before the centroid column is added so that column is not part of the result.
gis_vri_merge = gis_gdf.merge(vri_gdf, left_on='weatherstationcode', right_on='anemometercode')

# Spatial join on the active geometry ('shape') with the default predicate.
vri_gdf['centroid'] = vri_gdf.geometry.centroid
vri_gis_sjoin = gpd.sjoin(vri_gdf, gis_gdf, how='inner')

# Attach the per-station probabilities, then the station summary columns.
vri_with_prob = vri_gis_sjoin.merge(prob_df, left_on='weatherstationcode', right_on='station')
prob_merge = vri_with_prob.merge(
    station_summary_2023_08_02, left_on='weatherstationcode', right_on='station'
)

# Encode the VRI letter as an ordinal; values outside the mapping become NaN.
vri_mapping = {'H': 2, 'M': 1, 'L': 0}
prob_merge['vri_numeric'] = prob_merge['vri'].map(vri_mapping)

# Keep only span rows that name a station, then join them on the anemometer code.
span_df_cleaned = span_df[span_df['station'].notna()]
span_vri_prob_merge_df = prob_merge.merge(
    span_df_cleaned, left_on='anemometercode', right_on='station'
)

# --- Select modelling columns and fill missing tree counts -------------------
columns_to_keep = [
    'probability (%)',
    'vri_numeric', 'elevation', 'longitude', 'latitude',
    'cust_total', 'cust_lifesupport', 'cust_urgent', 'cust_medicalcert', 'cust_essential',
    'cust_sensitive', 'cust_residential', 'cust_commercial', 'cust_industrial',
    'num_strike_trees', 'buffered_tree_counts', 'exclusive_tree_counts'
]

# Take an explicit copy so the assignments below modify df_filtered itself and
# not a view/slice of span_vri_prob_merge_df.
df_filtered = span_vri_prob_merge_df[columns_to_keep].copy()

columns_to_impute = ['num_strike_trees', 'buffered_tree_counts', 'exclusive_tree_counts']

# Seeded generator so the random fill is reproducible on Restart & Run All
# (42 matches the random_state used for the split and the model below).
impute_rng = np.random.default_rng(42)

for col in columns_to_impute:
    missing = df_filtered[col].isna()
    observed = df_filtered.loc[~missing, col]
    if observed.empty or not missing.any():
        # Nothing to draw a range from, or nothing to fill.
        continue
    # Each missing cell gets a uniform random integer in [min, max] of the
    # observed values (the upper bound passed to integers() is exclusive).
    min_val, max_val = int(observed.min()), int(observed.max())
    df_filtered.loc[missing, col] = impute_rng.integers(
        min_val, max_val + 1, size=int(missing.sum())
    )


# --- Random-forest regression on the selected features -----------------------
important_features = ['vri_numeric', 'elevation', 'longitude']

# Rows without a target cannot be used for fitting or scoring; the target is
# NaN when a station has no wind-speed readings.
model_rows = df_filtered.dropna(subset=['probability (%)'])
X_important = model_rows[important_features]
y = model_rows['probability (%)']

# Split first, then fit the imputer on the training fold only, so that no
# test-set statistics leak into the features the model is trained on.
# NOTE(review): rows come from a per-station join, so several rows may share
# the same station and target; a random row split can then put the same
# station in both folds and inflate the scores. Consider a group-wise split
# by station -- confirm against the data.
X_train_raw, X_test_raw, y_train_imp, y_test_imp = train_test_split(
    X_important, y, test_size=0.2, random_state=42
)

imputer = SimpleImputer(strategy="mean")
X_train_imp = imputer.fit_transform(X_train_raw)
X_test_imp = imputer.transform(X_test_raw)

# Full imputed matrix (using training-fold means), kept for any later cell
# that still refers to this name.
X_important_imputed = imputer.transform(X_important)

rf_model_imp = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf_model_imp.fit(X_train_imp, y_train_imp)

y_pred_imp = rf_model_imp.predict(X_test_imp)

results_imp = {
    "MAE": mean_absolute_error(y_test_imp, y_pred_imp),
    "MSE": mean_squared_error(y_test_imp, y_pred_imp),
    "R² Score": r2_score(y_test_imp, y_pred_imp)
}

# Binarise targets and predictions at the training-set median to report a
# classification-style accuracy alongside the regression metrics.
# Note: this reuses (and overwrites) the name `threshold` from the per-station
# loop earlier in the notebook.
threshold = y_train_imp.median()
y_test_class = (y_test_imp >= threshold).astype(int)
y_pred_class = (y_pred_imp >= threshold).astype(int)

accuracy = accuracy_score(y_test_class, y_pred_class)

results_imp["Accuracy Score"] = accuracy

print("Model Performance with Selected Features:")
for metric, value in results_imp.items():
    print(f"{metric}: {value:.4f}")


Model Performance with Selected Features:
MAE: 0.0639
MSE: 0.1120
R² Score: 0.9954
Accuracy Score: 0.9116
