In [1]:
import os
import glob
import math
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd
import torch

import os
import pandas as pd
import geopandas as gpd
import xml.etree.ElementTree as ET
import gzip
import json
import fiona
from collections import defaultdict
from torch_geometric.transforms import LineGraph

from torch_geometric.data import Data, Batch
import gzip
import xml.etree.ElementTree as ET
import pandas as pd

# Integer code per highway type (presumably OSM `highway` tag values — TODO confirm);
# link variants share the code of their parent class.
# NOTE(review): the last key is the literal string 'np.nan', not the float NaN —
# confirm that this is what the lookup code actually passes in.
highway_mapping = {
    tag: code
    for code, tags in (
        (0, ('trunk', 'trunk_link', 'motorway_link')),
        (1, ('primary', 'primary_link')),
        (2, ('secondary', 'secondary_link')),
        (3, ('tertiary', 'tertiary_link')),
        (4, ('residential',)),
        (5, ('living_street',)),
        (6, ('pedestrian',)),
        (7, ('service',)),
        (8, ('construction',)),
        (9, ('unclassified',)),
        (-1, ('np.nan',)),
    )
    for tag in tags
}

# Directory that the input file names below are appended to (note the trailing slash).
base_dir = '../../../../data/pop_1pm_simulations/idf_1pm/'

## Abstract

This notebook only needs to be executed once. Its outputs (the CSV files written to `intermediate_results/`) are then used in `understand_simulation_input_data.ipynb`.

In [2]:
# Function to parse the XML and create a DataFrame
def parse_facilities(file_path):
    """Parse a gzip-compressed facilities XML file into a flat DataFrame.

    One row is emitted per ``<activity>`` element found directly under each
    ``<facility>`` child of the XML root. A facility without any activity
    contributes no rows.

    Parameters
    ----------
    file_path : str or path-like
        Path to the ``.xml.gz`` facilities file.

    Returns
    -------
    pandas.DataFrame
        Columns ``facility_id``, ``link_id``, ``x``, ``y``, ``activity_type``.
        Values are the raw XML attribute strings (``None`` when an attribute
        is absent); no numeric conversion is performed. The columns are
        present even when the file yields no rows.
    """
    # Fixed schema so an empty result still exposes the expected columns.
    columns = ['facility_id', 'link_id', 'x', 'y', 'activity_type']

    # Decompress and parse the XML
    with gzip.open(file_path, 'rb') as f:
        root = ET.parse(f).getroot()

    # Facility attributes are repeated on every activity row of that facility.
    data = [
        {
            'facility_id': facility.get('id'),
            'link_id': facility.get('linkId'),
            'x': facility.get('x'),
            'y': facility.get('y'),
            'activity_type': activity.get('type'),
        }
        for facility in root.findall('facility')
        for activity in facility.findall('activity')
    ]

    return pd.DataFrame(data, columns=columns)

# Location of the gzip-compressed facilities file inside base_dir
path_facilities = f"{base_dir}idf_1pm_facilities.xml.gz"

# Load the facilities into a DataFrame and print it
facilities_df = parse_facilities(path_facilities)
print(facilities_df)

        facility_id link_id          x           y activity_type
0           edu_100  282093   652608.3   6861929.0     education
1         edu_10001  446367   647403.7   6876773.0     education
2         edu_10006  712030   727470.7   6825852.0     education
3         edu_10012  446110   652276.8   6864355.0     education
4         edu_10014  253210   654721.8   6859894.0     education
...             ...     ...        ...         ...           ...
440221  work_995812   17429  647364.31  6859018.95          work
440222  work_997555   87857  649467.87  6863718.43          work
440223  work_997914  571839  651728.87   6862703.2          work
440224  work_998482  216858  662185.05  6868229.08          work
440225  work_999534  508818  647903.65  6859406.96          work

[440226 rows x 5 columns]


In [3]:
# Location of the gzip-compressed transit schedule inside base_dir
path_transit_schedule = f'{base_dir}idf_1pm_transit_schedule.xml.gz'

# Function to parse the transit schedule and create DataFrames
def parse_transit_schedule(file_path):
    """Parse a gzip-compressed transit schedule XML file into two DataFrames.

    Reads the ``<stopFacility>`` elements under the root's ``<transitStops>``
    section and the ``<relation>`` elements under its
    ``<minimalTransferTimes>`` section. A section that is absent from the
    file is treated as empty instead of raising.

    Parameters
    ----------
    file_path : str or path-like
        Path to the ``.xml.gz`` transit schedule file.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``stop_facility_df`` with columns ``id``, ``linkRefId``, ``x``, ``y``,
        ``name``, ``stopAreaId``, ``isBlocking`` and ``relation_df`` with
        columns ``fromStop``, ``toStop``, ``transferTime``. Values are the raw
        XML attribute strings (``None`` when an attribute is absent). Both
        frames always carry their columns, even when they have no rows.
    """
    # Column names double as the XML attribute names that are read.
    stop_columns = ['id', 'linkRefId', 'x', 'y', 'name', 'stopAreaId', 'isBlocking']
    relation_columns = ['fromStop', 'toStop', 'transferTime']

    # Decompress and parse the XML
    with gzip.open(file_path, 'rb') as f:
        root = ET.parse(f).getroot()

    # Either section may be missing; `find` then returns None.
    transit_stops = root.find('transitStops')
    minimal_transfer_times = root.find('minimalTransferTimes')

    stop_elements = [] if transit_stops is None else transit_stops.findall('stopFacility')
    relation_elements = (
        [] if minimal_transfer_times is None
        else minimal_transfer_times.findall('relation')
    )

    stop_facility_data = [
        {column: stop_facility.get(column) for column in stop_columns}
        for stop_facility in stop_elements
    ]
    relation_data = [
        {column: relation.get(column) for column in relation_columns}
        for relation in relation_elements
    ]

    # Convert to DataFrames with a fixed schema
    stop_facility_df = pd.DataFrame(stop_facility_data, columns=stop_columns)
    relation_df = pd.DataFrame(relation_data, columns=relation_columns)

    return stop_facility_df, relation_df

# Parse the transit schedule and get the DataFrames
stop_facility_df, relation_df = parse_transit_schedule(path_transit_schedule)

# # Show the DataFrames
# print("Stop Facility DataFrame:")
# print(stop_facility_df.head())

# print("\nRelation DataFrame:")
# print(relation_df.head())

# Create the output directory if needed; to_csv does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('intermediate_results', exist_ok=True)

# Persist both tables as CSV (without the DataFrame index)
stop_facility_df.to_csv('intermediate_results/transit_schedule_stop_facilities.csv', index=False)
relation_df.to_csv('intermediate_results/transit_schedule_relations.csv', index=False)

In [4]:
# Function to parse the population data and create a DataFrame
def parse_population(file_path):
    """Parse a gzip-compressed population XML file into a DataFrame.

    Every ``<person>`` element anywhere in the document becomes one row. The
    row holds the person's ``id`` attribute plus one column per
    ``<attribute>`` child of the person's ``<attributes>`` element, keyed by
    the attribute's ``name`` and filled with its text content (kept as a
    string). Persons lacking a given attribute get a missing value in that
    column.

    Parameters
    ----------
    file_path : str or path-like
        Path to the ``.xml.gz`` population file.

    Returns
    -------
    pandas.DataFrame
        One row per person.
    """
    with gzip.open(file_path, 'rb') as stream:
        root = ET.parse(stream).getroot()

    def person_record(person):
        # Start from the id, then overlay the name -> text attribute pairs.
        record = {'id': person.get('id')}
        attribute_block = person.find('attributes')
        if attribute_block is not None:
            record.update(
                (entry.get('name'), entry.text)
                for entry in attribute_block.findall('attribute')
            )
        return record

    records = [person_record(person) for person in root.findall('.//person')]
    return pd.DataFrame(records)

# Path to your .xml.gz file
path_population = base_dir + 'idf_1pm_population.xml.gz'

# Parse the population file and get the DataFrame
population_df = parse_population(path_population)

# Create the output directory if needed; to_csv does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
os.makedirs('intermediate_results', exist_ok=True)

# Save the DataFrame as a CSV file
population_df.to_csv('intermediate_results/population.csv', index=False)