In [None]:
import geopandas as gpd
import pandas as pd
from datetime import date
import os
import numpy as np


# Root folder of the shared project. Every input and output of this notebook is
# resolved relative to <SHARED_PROJECT_PATH>/data/OSM_road_network/<dd-mm-YYYY>/.
# NOTE(review): '...' looks like a redacted placeholder - set it to the real
# project root before running.
SHARED_PROJECT_PATH = '...'

In [None]:
# The run folder and the file names carry the current date (dd-mm-YYYY), so this
# cell looks for a network export stamped with today's date.
today = format(date.today(), '%d-%m-%Y')
network_json_path = os.path.join(
    SHARED_PROJECT_PATH, 'data', 'OSM_road_network', today, f'BP_safety-network_{today}.json'
)
# read the road-network edges (GeoJSON) into a GeoDataFrame
gdf_graph_edges = gpd.read_file(network_json_path, driver='GeoJSON')

In [None]:
# 'maxspeed' is read in as strings; cast it to floating point so it can be
# used in the numerical steps below
maxspeed_numeric = gdf_graph_edges['maxspeed'].astype('float64')
gdf_graph_edges['maxspeed'] = maxspeed_numeric

In [None]:
# add normalized accident numbers by segment length for later sampling -> mean_acc_no_per_m
# Select every column whose name contains 'acc_no', but explicitly leave out the
# column this cell creates: 'mean_acc_no_per_m' also contains 'acc_no', so without
# the exclusion a re-run of the cell would average the previous result into the new one.
is_acc_col = gdf_graph_edges.columns.str.contains('acc_no') & (gdf_graph_edges.columns != 'mean_acc_no_per_m')
# divide each accident column row-wise by the segment length, then average across columns
acc_no_per_m = gdf_graph_edges.loc[:, is_acc_col].div(gdf_graph_edges['length'], axis=0)
gdf_graph_edges['mean_acc_no_per_m'] = acc_no_per_m.mean(axis=1)

In [None]:
# define dataset configuration file and min-max normalize the numerical inputs
numeric_feats = ['maxspeed', 'length', 'nearby_station_no', 'repr_lat', 'repr_lon']

# Accident columns are the ones named 'acc_no_<digit>...'. The SAME selection is
# used for the stored config values and for the normalization itself; a looser
# pattern such as 'acc_no_' would also match the helper column 'mean_acc_no_per_m'
# and let it leak into the min/max that is applied.
acc_cols = gdf_graph_edges.filter(regex=r'acc_no_\d').columns

# accident columns share one global min/max taken over all of them together
acc_no_min = gdf_graph_edges[acc_cols].min().min()
acc_no_max = gdf_graph_edges[acc_cols].max().max()

# Collect the normalization parameters (computed BEFORE the data is rescaled)
# into a single-row frame: '<name>_min' / '<name>_max' columns.
conf_row = {'acc_no_norm_min': acc_no_min, 'acc_no_norm_max': acc_no_max}
for feat in numeric_feats:
    conf_row[f'{feat}_min'] = gdf_graph_edges[feat].min()
    conf_row[f'{feat}_max'] = gdf_graph_edges[feat].max()
df_dataset_conf = pd.DataFrame([conf_row])

# min-max normalization of numerical values (per column)
feat_min = gdf_graph_edges[numeric_feats].min()
feat_range = gdf_graph_edges[numeric_feats].max() - feat_min
# Plain column assignment replaces the columns outright, so integer-typed inputs
# can become float without relying on in-place upcasting through `.loc`.
gdf_graph_edges[numeric_feats] = (gdf_graph_edges[numeric_feats] - feat_min) / feat_range

# min-max normalization of accident numbers (global min/max, identical to the config values)
gdf_graph_edges[acc_cols] = (gdf_graph_edges[acc_cols] - acc_no_min) / (acc_no_max - acc_no_min)

In [None]:
# one-hot encoding categorical variables
categorical_cols = ['lit', 'highway', 'surface']
# one indicator frame per categorical column, each prefixed with the column name
indicator_frames = [pd.get_dummies(gdf_graph_edges[col], prefix=col) for col in categorical_cols]
# append the indicators on the right (in the order above) and drop the raw columns
gdf_graph_edges = pd.concat([gdf_graph_edges, *indicator_frames], axis=1).drop(columns=categorical_cols)

In [None]:
# boolean columns to float (True -> 1.0, False -> 0.0)
bool_cols = gdf_graph_edges.select_dtypes('bool').columns
gdf_graph_edges[bool_cols] = gdf_graph_edges[bool_cols].astype(float)

In [None]:
# undersampling - thin out the segments with very small (< 0.05) or very large (> 0.4)
# mean accidents per meter, so that at most as many of them remain as there are
# segments inside that range
acc_per_m = gdf_graph_edges['mean_acc_no_per_m']
is_outl = (acc_per_m < 0.05) | (acc_per_m > 0.4)
# copy the ids so the in-place shuffle below works on its own array
ids_outl = gdf_graph_edges[is_outl].index.to_numpy(copy=True)
nr_non_outl = gdf_graph_edges.shape[0] - len(ids_outl)
np.random.seed(42)
np.random.shuffle(ids_outl)
# Number of outlier rows to drop. Clamp at 0: when outliers are already the
# minority the difference is negative, and a negative slice end (ids[:-k]) would
# still select - and therefore drop - rows instead of dropping none.
nr_to_remove = max(len(ids_outl) - nr_non_outl, 0)
ids_to_remove = ids_outl[:nr_to_remove]
gdf_graph_edges_resmpl = gdf_graph_edges.drop(ids_to_remove)
# visual check of the resampled distribution (log-scaled counts)
gdf_graph_edges_resmpl['mean_acc_no_per_m'].hist(log=True)
# the helper column was only needed for sampling; remove it from both datasets
gdf_graph_edges_resmpl = gdf_graph_edges_resmpl.drop(columns='mean_acc_no_per_m')
gdf_graph_edges = gdf_graph_edges.drop(columns='mean_acc_no_per_m')

In [None]:
# save dataset configuration file to CSV (single row, no index column)
config_csv_path = os.path.join(
    SHARED_PROJECT_PATH, 'data', 'OSM_road_network', today, f'BP_safety-network_{today}_config.csv'
)
df_dataset_conf.to_csv(config_csv_path, index=False)

In [None]:
# save NN input data to geojson
nn_geojson_path = os.path.join(
    SHARED_PROJECT_PATH, 'data', 'OSM_road_network', today, f'BP_safety-network_{today}_NN.json'
)
gdf_graph_edges.to_file(nn_geojson_path, driver="GeoJSON")

In [None]:
# 9-1 train-test split dataset and save to CSVs
# identifier and geometry columns are not written to the CSV outputs
gdf_graph_edges.drop(columns=['gnx_edge_id', 'geometry'], inplace=True)
nn_out_dir = os.path.join(SHARED_PROJECT_PATH, 'data', 'OSM_road_network', today)
gdf_graph_edges.to_csv(os.path.join(nn_out_dir, f'BP_safety-network_{today}_NN_all.csv'), index=False)

# Shuffle reproducibly, then cut at 90 %. Positional slicing yields the same two
# partitions as np.split(frame, [cut]) but stays within pandas instead of going
# through the deprecated DataFrame.swapaxes that np.split relies on.
nn_shuffled = gdf_graph_edges.sample(frac=1, random_state=42)
nn_cut = int(.9 * len(nn_shuffled))
train, test = nn_shuffled.iloc[:nn_cut], nn_shuffled.iloc[nn_cut:]
train.to_csv(os.path.join(nn_out_dir, f'BP_safety-network_{today}_NN_train.csv'), index=False)
test.to_csv(os.path.join(nn_out_dir, f'BP_safety-network_{today}_NN_test.csv'), index=False)

In [None]:
# save resampled NN input data to geojson
resmpl_geojson_path = os.path.join(
    SHARED_PROJECT_PATH, 'data', 'OSM_road_network', today, f'BP_safety-network_{today}_NN-resmpl.json'
)
gdf_graph_edges_resmpl.to_file(resmpl_geojson_path, driver="GeoJSON")

In [None]:
# 9-1 train-test split resampled dataset and save to CSVs
# identifier and geometry columns are not written to the CSV outputs
gdf_graph_edges_resmpl.drop(columns=['gnx_edge_id', 'geometry'], inplace=True)
resmpl_out_dir = os.path.join(SHARED_PROJECT_PATH, 'data', 'OSM_road_network', today)
gdf_graph_edges_resmpl.to_csv(os.path.join(resmpl_out_dir, f'BP_safety-network_{today}_NN-resmpl_all.csv'), index=False)

# Shuffle reproducibly, then cut at 90 %. Positional slicing yields the same two
# partitions as np.split(frame, [cut]) but stays within pandas instead of going
# through the deprecated DataFrame.swapaxes that np.split relies on.
resmpl_shuffled = gdf_graph_edges_resmpl.sample(frac=1, random_state=42)
resmpl_cut = int(.9 * len(resmpl_shuffled))
train, test = resmpl_shuffled.iloc[:resmpl_cut], resmpl_shuffled.iloc[resmpl_cut:]
train.to_csv(os.path.join(resmpl_out_dir, f'BP_safety-network_{today}_NN-resmpl_train.csv'), index=False)
test.to_csv(os.path.join(resmpl_out_dir, f'BP_safety-network_{today}_NN-resmpl_test.csv'), index=False)