In [1]:
import os
import glob
import math
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd
import torch
from collections import defaultdict

import processing_io as pio
from torch_geometric.transforms import LineGraph

from torch_geometric.data import Data, Batch
import shapely.wkt as wkt
from tqdm import tqdm
import fiona
import os

import alphashape
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
from shapely.geometry import Point
import random

# Name of the output dataset folder; the commented line is the earlier preliminary run.
# result_df_name = 'sim_output_1pm_capacity_reduction_10k_PRELIMINARY'
result_df_name = 'sim_output_1pm_capacity_reduction_10k_REALLY'
result_path = '../../../../data/datasets_simulation_outputs/' + result_df_name
string_is_for_1pm = "pop_1pm"

# Folder holding one 'output_networks_*' sub-folder per chunk of simulated policies.
base_dir_sample_sim_input = (
    '../../../../data/' + string_is_for_1pm + '_simulations/'
    + string_is_for_1pm + '_policies_combinations_with_normal_dist/'
)
subdirs_pattern = os.path.join(base_dir_sample_sim_input, 'output_networks_*')
# De-duplicate the glob matches and keep them in a stable, sorted order.
subdirs = sorted(set(glob.glob(subdirs_pattern)))

# Base-case (no policy) averages and the district polygons used further down.
gdf_basecase_output_links = gpd.read_file(
    'results/' + string_is_for_1pm + '_basecase_average_output_links.geojson'
)
gdf_basecase_average_mode_stats = pd.read_csv(
    'results/' + string_is_for_1pm + '_basecase_average_mode_stats.csv',
    delimiter=';',
)
districts = gpd.read_file("../../../../data/visualisation/districts_paris.geojson")

## Abstract

This is the current notebook (as of 9.10.2024) that processes the outputs of the simulations for further use by the GNN.

It differs from `process_output_of_simulations_with_all_output_links_and_eqasim_info.ipynb` in that it also includes more input information.


## Process results

Process the outputs of the simulations for further usage by GNN.

In [2]:
# Read all network data into a dictionary of GeoDataFrames
def compute_result_dic():
    """Collect the simulation outputs of every network folder, keyed by policy.

    Iterates over the module-level ``subdirs`` list and, for each entry found
    inside a sub-folder (``.DS_Store`` entries are ignored), reads the output
    links and the eqasim trips through the ``pio`` helpers. Entries for which
    either reader returns ``None`` are skipped.

    Returns:
        tuple: ``(result_dic_output_links, result_dic_eqasim_trips)``. The
        first maps a policy key to the output-links frame extended with the
        base-case ``highway`` column and a ``vol_car_base_case`` column, and
        additionally holds the base case under ``"base_network_no_policies"``.
        The second maps a policy key to the result of
        ``pio.calculate_averaged_results``.
    """
    links_by_policy = {"base_network_no_policies": gdf_basecase_output_links}
    mode_stats_by_policy = {}

    for subdir in tqdm(subdirs, desc="Processing subdirs", unit="subdir"):
        for network in os.listdir(subdir):
            if network.endswith(".DS_Store"):
                continue

            file_path = os.path.join(subdir, network)
            policy_key = pio.create_policy_key_1pm(network)
            df_output_links = pio.read_output_links(file_path)
            df_eqasim_trips = pio.read_eqasim_trips(file_path)
            if df_output_links is None or df_eqasim_trips is None:
                continue

            # The geometry column is dropped from the simulation output before
            # the base-case columns are attached.
            df_output_links.drop(columns=['geometry'], inplace=True)
            with_highway = pio.extend_geodataframe(
                gdf_base=gdf_basecase_output_links,
                gdf_to_extend=df_output_links,
                column_to_extend='highway',
                new_column_name='highway',
            )
            with_base_volume = pio.extend_geodataframe(
                gdf_base=gdf_basecase_output_links,
                gdf_to_extend=with_highway,
                column_to_extend='vol_car',
                new_column_name='vol_car_base_case',
            )
            links_by_policy[policy_key] = with_base_volume
            mode_stats_by_policy[policy_key] = pio.calculate_averaged_results(df_eqasim_trips)

    return links_by_policy, mode_stats_by_policy

result_dic_output_links, result_dic_eqasim_trips = compute_result_dic()
print(len(result_dic_eqasim_trips.keys()))
print(len(result_dic_output_links.keys()))

base_gdf = result_dic_output_links["base_network_no_policies"]
links_gdf_base = gpd.GeoDataFrame(base_gdf, geometry='geometry')
links_gdf_base.crs = "EPSG:2154"  # Assuming the original CRS is EPSG:2154
links_gdf_base.to_crs("EPSG:4326", inplace=True)
# NOTE(review): the captured cell output points at this line; it looks like the
# GeoPandas geographic-CRS centroid warning. Confirm the centroids are accurate enough.
districts['district_centroid'] = districts['geometry'].centroid
# `predicate=` replaces the deprecated `op=` keyword of gpd.sjoin, which newer
# GeoPandas releases no longer accept.
links_gdf_with_districts = gpd.sjoin(links_gdf_base, districts, how='left', predicate='intersects')

# A link intersecting several districts appears once per district after the
# join: collapse back to one row per link, keeping the link attributes from the
# first row and collecting the district attributes into lists.
links_gdf_with_districts = links_gdf_with_districts.groupby('link').agg({
    'from_node': 'first',
    'to_node': 'first',
    'length': 'first',
    'freespeed': 'first',
    'capacity': 'first',
    'lanes': 'first',
    'modes': 'first',
    'vol_car': 'first',
    'highway': 'first',
    'geometry': 'first',
    'osm:way:oneway': 'first',
    'c_ar': lambda x: list(x.dropna()),
    'district_centroid': lambda x: list(x.dropna()),
    'perimetre': lambda x: list(x.dropna()),
    'surface': lambda x: list(x.dropna()),
}).reset_index()

Processing subdirs:  39%|███▉      | 39/100 [06:16<10:10, 10.00s/subdir]

empty data error../../../../data/pop_1pm_simulations/pop_1pm_policies_combinations_with_normal_dist/output_networks_4400/network_d_3_8_9_13_17_19/output_links.csv.gz
empty data error../../../../data/pop_1pm_simulations/pop_1pm_policies_combinations_with_normal_dist/output_networks_4400/network_d_3_8_9_13_17_19/eqasim_trips.csv


Processing subdirs: 100%|██████████| 100/100 [15:38<00:00,  9.38s/subdir]


9912
9913



  districts['district_centroid'] = districts['geometry'].centroid
  if await self.run_code(code, result, async_=asy):
  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():


In [12]:
# Spot-check: display the averaged mode statistics stored for one policy key.
# NOTE(review): this cell carries execution count 12 while its neighbours are 2
# and 3, i.e. it was run out of order; it only needs `result_dic_eqasim_trips`.
result_dic_eqasim_trips['Policy introduced in Arrondissement(s) d, 1, 10']

Unnamed: 0,mode,total_travel_time,total_routed_distance
0,bike,1167.557692,3621.016292
1,car,991.253319,4845.071564
2,car_passenger,425.329424,4389.150233
3,outside,0.789321,1058.270423
4,pt,1609.697548,5478.875497
5,walk,1017.318739,1221.339026


In [3]:
def find_duplicate_edges_in_gdf(gdf):
    """Return the directed edges that occur on more than one row.

    Args:
        gdf: frame with ``from_node`` and ``to_node`` columns.

    Returns:
        dict: ``(from_node, to_node)`` -> list of row index labels, restricted
        to pairs found on at least two rows. Direction matters: ``(a, b)`` and
        ``(b, a)`` are different edges.
    """
    rows_by_edge = {}
    for row_label, record in gdf.iterrows():
        directed_edge = (record['from_node'], record['to_node'])
        rows_by_edge.setdefault(directed_edge, []).append(row_label)

    repeated = {}
    for directed_edge, row_labels in rows_by_edge.items():
        if len(row_labels) > 1:
            repeated[directed_edge] = row_labels
    return repeated

def summarize_duplicate_edges(gdf):
    """Collapse rows that share the same directed edge into a single row.

    For each ``(from_node, to_node)`` pair one representative row is kept:
    the row with the highest ``vol_car`` among the rows whose ``vol_car`` is
    not 0, or the first row of the group when every ``vol_car`` is 0.
    Volumes are not summed. The kept row gets an ``original_directions``
    entry listing the ``(from_node, to_node)`` tuple of every row in its group.

    The input frame is left untouched: the grouping key is held in a separate
    Series instead of being written into ``gdf`` as a temporary column.

    Args:
        gdf: frame with ``from_node``, ``to_node`` and ``vol_car`` columns.

    Returns:
        A frame with one row per directed edge, ordered by the grouping key
        and re-indexed from 0. If ``vol_car`` is missing, a message is printed
        and ``gdf`` itself is returned unchanged.
    """
    if 'vol_car' not in gdf.columns:
        print("'vol_car' column does not exist in the dataframe")
        return gdf

    # External grouping key: it avoids mutating the caller's frame, and it
    # does not depend on groupby.apply handing the grouping column to the
    # callback (deprecated since pandas 2.2).
    edge_keys = pd.Series(
        list(zip(gdf['from_node'], gdf['to_node'])),
        index=gdf.index,
        dtype=object,
    )

    def aggregate_edges(group):
        non_zero_vol = group[group['vol_car'] != 0]
        if len(non_zero_vol) > 1:
            # Several non-zero entries: keep the one with the highest vol_car.
            combined = non_zero_vol.loc[non_zero_vol['vol_car'].idxmax()].copy()
        elif not non_zero_vol.empty:
            combined = non_zero_vol.iloc[0].copy()
        else:
            combined = group.iloc[0].copy()

        # vol_car is taken from the selected row as-is; only remember which
        # rows were merged into it.
        combined['original_directions'] = list(
            group[['from_node', 'to_node']].itertuples(index=False, name=None)
        )
        return combined

    summarized_gdf = gdf.groupby(edge_keys).apply(aggregate_edges)
    return summarized_gdf.reset_index(drop=True)

# Wrap the aggregated frame as a GeoDataFrame again (same CRS as the base
# links) and give the district columns descriptive names.
links = gpd.GeoDataFrame(
    links_gdf_with_districts,
    geometry='geometry',
    crs=links_gdf_base.crs,
).rename(
    columns={
        'c_ar': 'district',
        'perimetre': 'district_perimeter',
        'surface': 'district_surface',
    }
)

# Report how many directed edges are duplicated before the clean-up ...
remaining_duplicates = find_duplicate_edges_in_gdf(links)
print(f"Number of duplicate edges: {len(remaining_duplicates)}")

# ... and how many are left once each duplicate group is collapsed to one row.
links_without_duplicates = summarize_duplicate_edges(links)
print(f"Number of remaining duplicate edges after summarizing: {len(find_duplicate_edges_in_gdf(links_without_duplicates))}")

Number of duplicate edges: 74
Number of remaining duplicate edges after summarizing: 0


In [4]:
def identify_summarized_entries_detailed(original_gdf, summarized_gdf):
    """Pair every summarized edge with the original rows it was built from.

    Both frames receive (or have overwritten) an ``edge_id`` column of the
    form ``"<from_node>_<to_node>"``; this modifies the frames passed in.

    Args:
        original_gdf: frame before de-duplication (needs ``from_node``,
            ``to_node`` and ``vol_car``).
        summarized_gdf: frame after de-duplication (needs ``from_node`` and
            ``to_node``).

    Returns:
        tuple of four lists ``(detailed_entries, both_zero, both_nonzero,
        one_zero_one_nonzero)``. Each element is a dict with the keys
        ``'summarized'`` (row of ``summarized_gdf``), ``'original'`` (matching
        rows of ``original_gdf``) and ``'count'``. ``detailed_entries`` holds
        every entry; the other three split them by whether the original
        ``vol_car`` values are all zero, all non-zero, or mixed.
    """
    for frame in (original_gdf, summarized_gdf):
        frame['edge_id'] = frame['from_node'].astype(str) + '_' + frame['to_node'].astype(str)

    # Edge ids that occur more than once in the original frame.
    occurrences = original_gdf['edge_id'].value_counts()
    repeated_ids = occurrences[occurrences > 1].index
    merged_rows = summarized_gdf[summarized_gdf['edge_id'].isin(repeated_ids)]

    detailed_entries, both_zero, both_nonzero, one_zero_one_nonzero = [], [], [], []

    for _, merged_row in merged_rows.iterrows():
        source_rows = original_gdf[original_gdf['edge_id'] == merged_row['edge_id']]
        entry = {
            'summarized': merged_row,
            'original': source_rows,
            'count': len(source_rows)
        }

        volumes = source_rows['vol_car'].values
        if all(volume == 0 for volume in volumes):
            bucket = both_zero
        elif all(volume != 0 for volume in volumes):
            bucket = both_nonzero
        else:
            bucket = one_zero_one_nonzero

        bucket.append(entry)
        detailed_entries.append(entry)

    return detailed_entries, both_zero, both_nonzero, one_zero_one_nonzero

# Compare the de-duplicated links with the original ones. Note that the call
# adds an 'edge_id' column to both `links` and `links_without_duplicates`.
detailed_entries, both_zero, both_nonzero, one_zero_one_nonzero = identify_summarized_entries_detailed(links, links_without_duplicates)

# Size of each category of merged edges.
print(f"Number of summarized entries: {len(detailed_entries)}")
print(f"Entries where both 'vol_car' are zero: {len(both_zero)}")
print(f"Entries where both 'vol_car' are non-zero: {len(both_nonzero)}")
print(f"Entries where one 'vol_car' is zero and one is non-zero: {len(one_zero_one_nonzero)}")

def print_examples(category, entries, num_examples=2):
    """Print the first few entries of one category of summarized edges.

    Args:
        category: heading text for the printed section.
        entries: list of dicts with the keys ``'summarized'``, ``'original'``
            and ``'count'``.
        num_examples: maximum number of entries to print.
    """
    shown = entries[:num_examples]
    separator = "-" * 50

    print(f"\n{category} (showing {min(num_examples, len(entries))} examples):")
    for number, entry in enumerate(shown, start=1):
        print(f"\nExample {number}:")
        print("Summarized row:")
        print(entry['summarized'])
        print("\nOriginal rows:")
        print(entry['original'])
        print(f"Count: {entry['count']}")
        print(separator)

# Show up to two examples (the default of print_examples) for each category.
print_examples("Both 'vol_car' are zero", both_zero)
print_examples("Both 'vol_car' are non-zero", both_nonzero)
print_examples("One 'vol_car' is zero and one is non-zero", one_zero_one_nonzero)

Number of summarized entries: 74
Entries where both 'vol_car' are zero: 25
Entries where both 'vol_car' are non-zero: 9
Entries where one 'vol_car' is zero and one is non-zero: 40

Both 'vol_car' are zero (showing 2 examples):

Example 1:
Summarized row:
link                                                              412904
from_node                                                      112017915
to_node                                                        112017915
length                                                         39.819261
freespeed                                                       8.333333
capacity                                                           480.0
lanes                                                                1.0
modes                                                  car,car_passenger
vol_car                                                              0.0
highway                                                      residential
geometry       

In [5]:
# Number of links left after collapsing duplicate directed edges.
len(links_without_duplicates)

31140

In [6]:
# PROCESS LINK GEOMETRIES
# The returned values are module-level inputs of process_result_dic below, which
# reads edges_base, nodes, stacked_edge_geometries_tensor and
# district_centroids_tensor_padded; edge_start_point_tensor is not used in the
# code shown in this notebook.
edge_start_point_tensor, stacked_edge_geometries_tensor, district_centroids_tensor_padded, edges_base, nodes = pio.get_link_geometries(links_without_duplicates, districts)

# Analyze results and save to file

In [7]:
# pio.analyze_geodataframes(result_dic=result_dic, consider_only_highway_edges=True)

In [8]:
# pio.analyze_geodataframes(result_dic=result_dic, consider_only_highway_edges=False)

In [11]:
def process_result_dic(result_dic, result_dic_mode_stats, save_path=None, batch_size=500, gdf_input=None):
    """Turn every simulated network into a line-graph ``Data`` object and save them in batches.

    For each DataFrame in ``result_dic`` (the ``"base_network_no_policies"``
    entry is skipped) the function builds the node features, edge attributes,
    positions and targets through the ``pio`` helpers, attaches the numeric
    mode statistics when available, validates the graph and appends it to the
    current batch. Every ``batch_size`` graphs the batch is written to
    ``save_path/datalist_batch_<i>.pt``; a final partial batch is written at
    the end.

    Besides its arguments the function reads these module-level names:
    ``links_without_duplicates``, ``edges_base``, ``nodes``,
    ``stacked_edge_geometries_tensor`` and ``district_centroids_tensor_padded``.

    Args:
        result_dic: mapping policy key -> output-links DataFrame.
        result_dic_mode_stats: mapping policy key -> mode-statistics DataFrame.
        save_path: directory the batches are written to (created if missing).
            Required, despite the ``None`` default kept for compatibility.
        batch_size: number of graphs per saved file; must be at least 1.
        gdf_input: frame forwarded to ``pio.prepare_gdf`` for every scenario.

    Returns:
        None. The results are only written to disk.

    Raises:
        ValueError: if ``save_path`` is ``None``, ``batch_size`` is smaller
            than 1, or a produced graph fails ``Data.validate``.
    """
    # Fail early with a clear message instead of a TypeError inside os.makedirs.
    if save_path is None:
        raise ValueError("save_path must be provided: the processed batches are only written to disk")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    os.makedirs(save_path, exist_ok=True)
    datalist = []

    vol_base_case = links_without_duplicates['vol_car'].values
    # Links whose modes do not contain 'car' get a base capacity of 0.
    capacity_base_case = np.where(links_without_duplicates['modes'].str.contains('car'), links_without_duplicates['capacity'], 0)
    length = links_without_duplicates['length'].values
    freespeed_base_case = links_without_duplicates['freespeed'].values
    allowed_modes = pio.encode_modes(links_without_duplicates)
    edge_index = torch.tensor(edges_base, dtype=torch.long).t().contiguous()
    x = torch.zeros((len(nodes), 1), dtype=torch.float)
    data = Data(edge_index=edge_index, x=x)

    # The input graph is the same for every scenario, so the line graph is
    # built once and cloned per scenario instead of being recomputed inside
    # the loop. Cloning also keeps the scenarios independent of each other.
    base_linegraph = LineGraph()(data)

    batch_counter = 0
    for key, df in tqdm(result_dic.items(), desc="Processing result_dic", unit="dataframe"):
        if not isinstance(df, pd.DataFrame) or key == "base_network_no_policies":
            continue

        gdf = pio.prepare_gdf(df, gdf_input)
        len_edges = len(gdf)
        capacities_new, capacity_reduction, highway, freespeed = pio.get_basic_edge_attributes(capacity_base_case, gdf)
        district_info, combined_tensor = pio.compute_combined_tensor(vol_base_case, capacity_base_case, length, freespeed_base_case, allowed_modes, gdf, capacities_new, capacity_reduction, highway, freespeed)

        linegraph_data = base_linegraph.clone()
        linegraph_data.x = combined_tensor
        linegraph_data.num_nodes = combined_tensor.shape[0]

        # add edge attributes: 1 if edge to district, 0 if edge to edge
        edge_to_district_index, edge_to_district_attr = pio.compute_edge_attributes(district_info, linegraph_data, len_edges, gdf)
        if linegraph_data.edge_attr is None:
            linegraph_data.edge_attr = torch.zeros((linegraph_data.edge_index.shape[1] - edge_to_district_index.shape[1], 1), dtype=torch.long)
        linegraph_data.edge_attr = torch.cat([linegraph_data.edge_attr, edge_to_district_attr], dim=0)

        # add node attributes: 1 if district, 0 if street
        node_type_feature = pio.compute_node_attributes(district_info, len_edges)
        linegraph_data.x = torch.cat([linegraph_data.x, node_type_feature], dim=1)
        linegraph_data.pos = torch.cat([stacked_edge_geometries_tensor, district_centroids_tensor_padded], dim=0)
        linegraph_data.y = pio.compute_target_tensor(vol_base_case, gdf, district_info)

        # Attach the numeric columns of the mode statistics, when present.
        df_mode_stats = result_dic_mode_stats.get(key)
        if df_mode_stats is not None:
            numeric_cols = df_mode_stats.select_dtypes(include=[np.number]).columns
            mode_stats_numeric = df_mode_stats[numeric_cols].astype(float)
            linegraph_data.mode_stats = torch.tensor(mode_stats_numeric.values, dtype=torch.float)

        # With raise_on_error=True an invalid graph raises here, so the
        # previous "Invalid line graph data" branch could never run and has
        # been removed; the fail-fast behaviour is unchanged.
        linegraph_data.validate(raise_on_error=True)
        datalist.append(linegraph_data)
        batch_counter += 1

        # Save intermediate result every batch_size data points
        if batch_counter % batch_size == 0:
            batch_index = batch_counter // batch_size
            torch.save(datalist, os.path.join(save_path, f'datalist_batch_{batch_index}.pt'))
            datalist = []  # Reset datalist for the next batch

    # Save any remaining data points
    if datalist:
        batch_index = (batch_counter // batch_size) + 1
        torch.save(datalist, os.path.join(save_path, f'datalist_batch_{batch_index}.pt'))

# Call the function
# process_result_dic(result_dic=result_dic_output_links, result_dic_mode_stats=result_dic_eqasim_trips, save_path=result_path, batch_size=100, gdf_input=links_without_duplicates)

6


Processing result_dic: 100%|██████████| 9913/9913 [17:22:52<00:00,  6.31s/dataframe]     
