In [None]:
# Predict high risk areas 

# Import libraries

import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from shapely.geometry import Point
from geopandas.tools import sjoin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_curve, auc, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN
from sklearn.utils.class_weight import compute_class_weight
import shap
import os

base_path = "/Users/leonardo/Desktop/Tesi/LTSBikePlan/images"
city_name = "Trento"

# Folder that collects every map exported for this city. It is created up front
# because the later `m.save(...)` calls write into it and do not create missing
# directories themselves.
city_folder_path = os.path.join(base_path, city_name)
os.makedirs(city_folder_path, exist_ok=True)

# Load the hexagon scores, the per-node centrality table and the accident points
hex_totalscore = gpd.read_file('../data/hex_totalscore.shp')
centrality_measures = pd.read_csv('../data/network_centrality_measures.csv')
accidents = gpd.read_file('../data/accidents_trento.geojson')
# Drop the 'index' column stored in the shapefile; the DataFrame index is used instead
hex_totalscore.drop(columns=['index'], inplace=True)
# Set 'geometry' as the geometry column
hex_totalscore = hex_totalscore.set_geometry('geometry')
# Label the coordinates as EPSG:25832 (this overrides the CRS tag, it does not reproject)
hex_totalscore = hex_totalscore.set_crs("EPSG:25832", allow_override=True)

# Convert 'coordinates' column to Shapely Point objects
def create_point(row):
    """Build a Shapely ``Point`` from the textual ``coordinates`` field of *row*.

    The field is expected to look like ``"(x, y)"``. The text is split on the
    bare comma and ``float`` discards surrounding blanks, so ``"(x,y)"`` and
    ``"( x , y )"`` are accepted as well as the ``"(x, y)"`` spelling.

    Raises:
        ValueError: if the field does not contain exactly two numbers.
    """
    x, y = map(float, row['coordinates'].strip().strip('()').split(','))
    return Point(x, y)
# Attach one point geometry per centrality record, then wrap the table in a
# GeoDataFrame labelled with the same CRS as the hexagons (EPSG:25832).
node_points = centrality_measures.apply(create_point, axis=1)
centrality_measures['geometry'] = node_points
centrality_gdf = gpd.GeoDataFrame(centrality_measures, geometry='geometry').set_crs(
    "EPSG:25832", allow_override=True
)

# Keep only accidents recorded from 2018 onwards, reduced to year + location
accidents_filtered = accidents[accidents['anno'] >= 2018].copy()
accidents_filtered['anno'] = accidents_filtered['anno'].astype(int)
accidents_simplified = gpd.GeoDataFrame(accidents_filtered[['anno', 'geometry']])

# NOTE(review): sjoin compares raw coordinates and only warns on a CRS mismatch;
# confirm the accident points are expressed in the hexagons' EPSG:25832.
joined = gpd.sjoin(hex_totalscore, accidents_simplified, how='left', predicate='contains')

# A left join keeps every hexagon: one row per contained accident, or a single
# row with a missing `index_right` when the hexagon contains none. Counting rows
# (`size()`) would therefore report 1 accident for empty hexagons; counting the
# non-null matches yields the true number (0 for empty hexagons).
accident_counts = joined.groupby(joined.index)['index_right'].count()
accident_counts_df = accident_counts.to_frame(name='accidents_count')
hex_totalscore_with_accidents = hex_totalscore.merge(accident_counts_df, left_index=True, right_index=True, how='left')
hex_totalscore_with_accidents['accidents_count'] = hex_totalscore_with_accidents['accidents_count'].fillna(0).astype(int)

#print(hex_totalscore_with_accidents['accidents_count'].value_counts())

# This joins the centrality data to the hexagons they fall within
joined_with_centrality = gpd.sjoin(hex_totalscore_with_accidents, centrality_gdf, how='left', predicate='contains')

# Average each centrality measure over the nodes of a hexagon
# (hexagons without nodes yield NaN here and are filled below)
centrality_means = joined_with_centrality.groupby(joined_with_centrality.index).agg({
    'degree_centrality': 'mean',
    'betweenness_centrality': 'mean',
    'closeness_centrality': 'mean'
})

# Rename columns
centrality_means.columns = ['degree_centrality_mean', 'betweenness_centrality_mean', 'closeness_centrality_mean']

# Merge the centrality means back into the original hex_totalscore_with_accidents DataFrame
hex_totalscore_with_centrality = hex_totalscore_with_accidents.merge(centrality_means, left_index=True, right_index=True, how='left')

# Fill NaN values with 0 (assuming no centrality data in those hexagons)
hex_totalscore_with_centrality.fillna(0, inplace=True)

df = hex_totalscore_with_centrality.copy()

In [None]:
# Turn the accident counts into a binary target: hexagons whose count lies above
# the chosen percentile are 'high-risk' (1), all others 'low-risk' (0).
percentile_threshold = 85  # Top 15%
threshold = df['accidents_count'].quantile(percentile_threshold / 100)
df['risk_category'] = (df['accidents_count'] > threshold).astype(int)

# Selecting features for the model
features = ['total_scor', 'degree_centrality_mean', 'betweenness_centrality_mean', 'closeness_centrality_mean']
X = df[features]
y = df['risk_category']

# Hold out 20% for testing. Stratifying keeps the share of the rare high-risk
# class the same in both splits, so the test metrics do not depend on how many
# positives the random split happened to draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardizing the features (statistics are learned on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate resampling strategies for the class imbalance
smote_methods = {
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42),
    'BorderlineSMOTE': BorderlineSMOTE(random_state=42),
    'SMOTEENN': SMOTEENN(random_state=42)
}

# 'balanced' class weights of the training split, keyed by the actual class
# label rather than by array position.
# NOTE(review): the models are weighted AND trained on resampled data, which
# compensates for the imbalance twice; confirm this is intended.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weights = {int(label): weight for label, weight in zip(classes, class_weights)}

# Parameter grid for GridSearchCV
param_grid = {
    'logreg': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'rfc': {
        'n_estimators': [10, 50, 100, 200],
        'max_features': ['sqrt', 'log2']
    },
    'svm': {
        'C': [0.01, 0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly']
    }
}

# Initialize models with balanced class weights (seeded where the estimator is
# randomized, so repeated runs give the same result)
models = {
    'logreg': LogisticRegression(class_weight=weights),
    'rfc': RandomForestClassifier(class_weight=weights, random_state=42),
    'svm': SVC(class_weight=weights, probability=True, random_state=42)
}

# Hyperparameter tuning with stratified cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The resampler is placed inside an imbalanced-learn pipeline so that it is
# fitted on the training part of every CV fold only. Resampling the whole
# training set before cross-validation would leak synthetic neighbours of
# validation points into the training folds and inflate the scores.
from imblearn.pipeline import Pipeline as ImbPipeline

# Every resampler is tuned from the same untouched estimators, and for each
# model only the best-scoring resampler/parameter combination is kept.
# Previously each iteration overwrote `models`, so the last resampler always won.
base_models = dict(models)
best_cv_scores = {model_key: -np.inf for model_key in base_models}
best_resamplers = {}

for smote_key, resampler in smote_methods.items():
    print(f"Using {smote_key} resampling technique:")
    for model_key, base_model in base_models.items():
        pipeline = ImbPipeline([('resample', resampler), ('clf', base_model)])
        # Route the grid to the classifier step of the pipeline
        pipeline_grid = {f'clf__{name}': values for name, values in param_grid[model_key].items()}
        grid_search = GridSearchCV(pipeline, pipeline_grid, cv=skf, scoring='roc_auc')
        grid_search.fit(X_train, y_train)
        best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
        print(f"Best parameters for {model_key}: {best_params} (CV ROC-AUC: {grid_search.best_score_:.3f})")
        if grid_search.best_score_ > best_cv_scores[model_key]:
            best_cv_scores[model_key] = grid_search.best_score_
            best_resamplers[model_key] = smote_key
            # The stored estimator is the refitted pipeline; resampling is applied
            # during fit only, so predict / predict_proba take plain scaled features.
            models[model_key] = grid_search.best_estimator_

for model_key, smote_key in best_resamplers.items():
    print(f"Selected resampler for {model_key}: {smote_key} (CV ROC-AUC: {best_cv_scores[model_key]:.3f})")

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    """Print hold-out diagnostics for a fitted binary classifier.

    Prints the classification report and confusion matrix of the hard
    predictions, followed by the area under the precision-recall curve and the
    ROC-AUC, both computed from the predicted probability of the second class
    (column 1 of ``predict_proba``).

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: feature matrix of the evaluation set.
        y_test: true labels of the evaluation set.
    """
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    # Score the test set once and reuse the probabilities for both curve metrics
    positive_scores = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)
    auc_score = auc(recall, precision)
    roc_auc = roc_auc_score(y_test, positive_scores)
    print("Precision-Recall AUC:", auc_score)
    print("ROC-AUC Score:", roc_auc)

# Report the hold-out metrics of every tuned model in turn
for model_key, fitted_model in models.items():
    print(f"Evaluating {model_key}:")
    evaluate_model(fitted_model, X_test, y_test)

Given the differences in precision and recall across models, the choice of the best model may depend on what is more critical for your application: reducing false positives or false negatives. If it's more important to not miss high-risk areas, logreg might be preferable despite its lower precision. If a balance is needed, rfc seems a better choice.

In [None]:
# Refit the selected model (Random Forest) with fixed hyperparameters.
# NOTE: the model is fitted on the resampled *training split* only, not on the
# full dataset, so X_test / y_test stay held out for the evaluation below.

# Hyperparameters copied by hand from the GridSearchCV output; update them if
# the search is re-run and selects different values.
best_params_rfc = {'max_features': 'sqrt', 'n_estimators': 100}

# Initialize the Random Forest model with the best parameters
rfc_model = RandomForestClassifier(max_features=best_params_rfc['max_features'],
                                   n_estimators=best_params_rfc['n_estimators'],
                                   class_weight=weights, random_state=42)

# Oversample the minority class of the training split
smote_method = SMOTE(random_state=42)  # Replace with the resampling method that worked best for you
X_train_smote, y_train_smote = smote_method.fit_resample(X_train, y_train)

# Fit the model
rfc_model.fit(X_train_smote, y_train_smote)

# Predictions and evaluation on the held-out split, using the same report as
# the model comparison so the numbers are directly comparable
evaluate_model(rfc_model, X_test, y_test)

# Using SHAP for model interpretation
# NOTE(review): for a classifier, `shap_values` holds one set of values per class
# and its container type depends on the installed shap version; confirm the
# summary plot shows what is intended.
explainer = shap.TreeExplainer(rfc_model)
shap_values = explainer.shap_values(X_train_smote)

# Plot summary
shap.summary_plot(shap_values, X_train_smote, feature_names=features)

In [None]:
# Predict the risk categories for the entire dataset
df['predicted_risk_category'] = rfc_model.predict(scaler.transform(df[features]))

# Merging the predictions back into the original geospatial dataset
hex_totalscore_with_predictions = hex_totalscore.merge(df[['predicted_risk_category']], left_index=True, right_index=True, how='left')

# Fill NaN values for areas that were not included in the prediction (if any).
# The result is assigned back: an in-place fillna on a selected column acts on a
# temporary object under pandas Copy-on-Write and would leave the frame unchanged.
hex_totalscore_with_predictions['predicted_risk_category'] = (
    hex_totalscore_with_predictions['predicted_risk_category'].fillna(0)
)

# folium expects geographic coordinates (WGS84)
hex_totalscore_with_predictions = hex_totalscore_with_predictions.to_crs(epsg=4326)


import folium

# Convert your geodataframe to GeoJSON format
hex_geojson = hex_totalscore_with_predictions.to_json()

# Create a base map centered around Trento
m = folium.Map(location=[46.0667, 11.1333], zoom_start=12)

# Add the GeoJSON overlay to the map: red = predicted high-risk, green = otherwise
folium.GeoJson(
    hex_geojson,
    style_function=lambda feature: {
        'fillColor': 'red' if feature['properties']['predicted_risk_category'] == 1 else 'green',
        'color': 'black',
        'weight': 1,
        'dashArray': '5, 5',
        'fillOpacity': 0.5,
    }
).add_to(m)

file_path = os.path.join(city_folder_path, 'risk_accidents_hexagon.html')

# Write the map to a standalone HTML file
m.save(file_path)

# Display the map
m

In [None]:
# 2) Spatial autocorrelation between gaps and accidents
import json
import pandas as pd
import geopandas as gpd
import networkx as nx
from shapely.geometry import Point, LineString
import matplotlib.pyplot as plt
from esda.moran import Moran_Local
import libpysal as lps
import osmnx as ox
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from geopandas.tools import sjoin_nearest
from libpysal.weights import Queen
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from folium.map import FeatureGroup

# Step 1: Load the Data

# Gaps were serialised as JSON, which stores the tuple keys as text
with open('../data/filtered_gaps.json', 'r') as file:
    data_read = json.load(file)

# Rebuild the integer-tuple keys from their "(a, b)" string form
filtered_gaps = {}
for raw_key, gap_path in data_read.items():
    node_ids = tuple(int(part) for part in raw_key[1:-1].split(', '))
    filtered_gaps[node_ids] = gap_path

# Load the LTS network, express it in WGS84 and drop nodes without any edge
filepath = "/Users/leonardo/Desktop/Tesi/LTSBikePlan/data/Trento_lts.graphml"
G_lts = ox.project_graph(ox.load_graphml(filepath), to_crs='EPSG:4326')
isolated_nodes = list(nx.isolates(G_lts))
G_lts.remove_nodes_from(isolated_nodes)

edges = ox.graph_to_gdfs(G_lts, nodes=False)

# Turn every gap (a sequence of graph node ids) into a line through the
# coordinates of its nodes.
high_stress_lines = []
for key, path in filtered_gaps.items():
    points = []
    for node in path:
        # Check if the node exists in the graph and has the required data
        if node in G_lts.nodes and 'x' in G_lts.nodes[node] and 'y' in G_lts.nodes[node]:
            x = G_lts.nodes[node]['x']
            y = G_lts.nodes[node]['y']
            points.append(Point(x, y))
        else:
            print(f"Node {node} data is missing or incomplete.")
    # A LineString needs at least two points; a gap with a single usable node
    # would make the constructor raise, so it is reported and skipped instead.
    if len(points) >= 2:
        high_stress_lines.append(LineString(points))
    elif points:
        print(f"Gap {key} skipped: fewer than two nodes with coordinates.")

# Node coordinates are in WGS84; reproject to the metric CRS used for the joins
high_stress_gdf = gpd.GeoDataFrame(geometry=high_stress_lines, crs='EPSG:4326')
high_stress_gdf = high_stress_gdf.to_crs('EPSG:25832')
# Assign every accident to its nearest gap line and count accidents per line
# NOTE(review): assumes accidents_simplified shares the lines' EPSG:25832 — confirm
accidents_near_lines = sjoin_nearest(accidents_simplified, high_stress_gdf, how='left')
accidents_per_line = accidents_near_lines.groupby('index_right').size()

# Map centre = arithmetic mean of all node coordinates of the graph
lats = [attrs['y'] for _, attrs in G_lts.nodes(data=True)]
lngs = [attrs['x'] for _, attrs in G_lts.nodes(data=True)]

center_lat = sum(lats) / len(lats)
center_lng = sum(lngs) / len(lngs)

# Create a row-standardised Queen contiguity weights matrix over the gap lines.
# Islands (lines without neighbours) are not dropped beforehand: they end up as
# single-element components and are skipped by the size check in the loop below.
w = Queen.from_dataframe(high_stress_gdf, use_index=True)
w.transform = 'r'

# Ensure that the index of accidents_per_line matches high_stress_gdf
# (lines that attracted no accident get a count of 0)
accidents_per_line = accidents_per_line.reindex(high_stress_gdf.index, fill_value=0)

# Run Local Moran's I separately inside every connected component of the weights graph
components = w.component_labels
unique_components = pd.unique(components)
results = []
for comp in unique_components:
    comp_indexes = high_stress_gdf.index[components == comp]
    comp_high_stress_gdf = high_stress_gdf.loc[comp_indexes]
    comp_data = accidents_per_line.loc[comp_indexes]
    # The statistic needs more than two segments and a non-constant variable
    if len(comp_data) > 2 and comp_data.var() != 0:
        comp_weights = Queen.from_dataframe(comp_high_stress_gdf, use_index=True)
        comp_weights.transform = 'r'
        local_moran_comp = Moran_Local(comp_data, comp_weights)

        for segment, p_value, z_score, quadrant in zip(
            comp_indexes, local_moran_comp.p_sim, local_moran_comp.z_sim, local_moran_comp.q
        ):
            results.append({'Component': comp, 'Segment': segment, 'P-value': p_value, 'Z-score': z_score, 'Quadrant': quadrant})
    else:
        print(f"Component {comp} skipped due to insufficient data points or no variation.")

# Naming the columns explicitly keeps 'Segment' available for the merge below
# even when every component was skipped and `results` is empty.
result_columns = ['Component', 'Segment', 'P-value', 'Z-score', 'Quadrant']
results_df = pd.DataFrame(results, columns=result_columns)
if results_df.empty:
    print("No component was suitable for Local Moran's I; no segment will be mapped.")

# Keep only the segments that received a result, then go back to WGS84 for mapping
high_stress_gdf = high_stress_gdf.merge(results_df, left_on=high_stress_gdf.index, right_on='Segment')
high_stress_gdf = high_stress_gdf.to_crs('EPSG:4326')

# Quadrant colours. esda's Moran_Local codes the quadrants as
#   1 = High-high, 2 = Low-high, 3 = Low-low, 4 = High-low,
# so the names, colours and legend below follow that coding
# (dark red / red for segments with many accidents, orange / yellow for few).
quadrant_colors = {
    1: 'darkred',
    2: 'orange',
    3: 'yellow',
    4: 'red'
}

m = folium.Map(location=[center_lat, center_lng], zoom_start=13)

# One toggleable layer per quadrant
quadrant_1 = FeatureGroup(name='Quadrant 1: High-high').add_to(m)
quadrant_2 = FeatureGroup(name='Quadrant 2: Low-high').add_to(m)
quadrant_3 = FeatureGroup(name='Quadrant 3: Low-low').add_to(m)
quadrant_4 = FeatureGroup(name='Quadrant 4: High-low').add_to(m)
quadrant_layers = {1: quadrant_1, 2: quadrant_2, 3: quadrant_3, 4: quadrant_4}

for _, row in high_stress_gdf.iterrows():
    layer = quadrant_layers.get(row['Quadrant'])
    if layer is None:
        # Segments without a valid quadrant code are not drawn
        continue
    # folium expects (lat, lon) pairs, shapely stores (x, y) = (lon, lat)
    folium.PolyLine(
        locations=[(y, x) for x, y in row['geometry'].coords],
        color=quadrant_colors[row['Quadrant']],
        weight=3,
    ).add_to(layer)

legend_html = '''
<div style="position: fixed; 
     top: 10px; left: 50px; width: 230px; height: 120px; 
     border:2px solid grey; z-index:9999; font-size:14px; background: white;
     padding: 5px;">
     <b>Quadrants - Risk of Accidents</b> <br>
     &nbsp; Quadrant 1: <i style="background:darkred;width:12px;height:12px;display:inline-block;"></i> High-high <br>
     &nbsp; Quadrant 2: <i style="background:orange;width:12px;height:12px;display:inline-block;"></i> Low-high <br>
     &nbsp; Quadrant 3: <i style="background:yellow;width:12px;height:12px;display:inline-block;"></i> Low-low <br>
     &nbsp; Quadrant 4: <i style="background:red;width:12px;height:12px;display:inline-block;"></i> High-low
</div>
'''

from branca.element import Element

legend = Element(legend_html)
m.get_root().html.add_child(legend)


folium.LayerControl(collapsed=False).add_to(m)

file_path = os.path.join(city_folder_path, 'gap_quadrants.html')

# Write the map to a standalone HTML file
m.save(file_path)

m


The output of the code provides a detailed analysis of the spatial autocorrelation of accidents along different segments of the road network (referred to as 'high-stress lines') using Local Moran's I (LISA). Here's a breakdown of what the output signifies:

### Understanding the Output for Each Segment
- **P-value**: Indicates the probability that the observed spatial pattern (in this case, the concentration of accidents on a particular road segment) could have occurred by random chance. Lower p-values (typically below 0.05) suggest that the pattern is unlikely to be random and is statistically significant.
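- **Z-score**: Measures how many standard deviations the segment's local Moran statistic lies from the value expected under spatial randomness (estimated from random permutations). A large positive Z-score indicates clustering of similar values that is stronger than expected by chance; a large negative Z-score indicates a spatial outlier, i.e. a segment whose value differs markedly from its neighbours.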
- **Quadrant**: Indicates the type of spatial correlation:
    - **Quadrant 1 (High-High)**: The segment has a high number of accidents and is surrounded by segments with high numbers of accidents.
    - **Quadrant 2 (Low-High)**: The segment has a low number of accidents but is surrounded by segments with high numbers of accidents.
    - **Quadrant 3 (Low-Low)**: The segment has a low number of accidents and is surrounded by segments with low numbers of accidents.
    - **Quadrant 4 (High-Low)**: The segment has a high number of accidents but is surrounded by segments with low numbers of accidents.

### Interpreting Results

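**Significant clusters:** Several segments have low p-values and high z-scores in Quadrants 1 and 3, indicating statistically significant spatial clustering: clusters of high accident counts in Quadrant 1 and clusters of low accident counts in Quadrant 3. The Quadrant 1 clusters are the areas of particular concern for road safety.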
- **High-High Clusters (Quadrant 1):** Segments in this quadrant suggest areas where accidents are consistently high, and neighboring segments also have high accident rates. These might be hotspots needing targeted interventions.
- **Low-Low Clusters (Quadrant 3):** These segments indicate areas with fewer accidents, surrounded by areas with similarly low accident rates. These areas might be considered safer or less prone to accidents.
- **High-Low and Low-High Clusters (Quadrants 2 and 4):** These segments indicate spatial outliers where the accident rate significantly differs from neighboring areas. They could point to unique local factors affecting road safety.

### Overall Interpretation
- The segments with low P-values and high absolute Z-scores indicate areas where the accident distribution is not random but shows a significant spatial pattern.
- Quadrants help identify the nature of these patterns, whether they are clusters of high accident areas or isolated high accident segments surrounded by lower accident segments.

### Additional Observations
- **Disconnected Components**: The warning about disconnected components and islands in the weights matrix suggests that certain segments of the network do not connect to others, which could affect the global spatial autocorrelation analysis. These disconnected components might need separate analysis or consideration.
- **Islands**: These are segments without neighboring segments in the spatial weights matrix, likely due to the way the road network is structured or due to data processing steps.