## Synthetic Dataset Creation

In [1]:
import pandas as pd

# Read the input tables. Every column is loaded as text (dtype='unicode'),
# so all join keys below are compared as strings.
csv_stops = pd.read_csv("gtfs/stops.txt", sep=',', dtype='unicode')
csv_stop_times = pd.read_csv("gtfs/stop_times.txt", sep=',', dtype='unicode')
csv_trips = pd.read_csv("gtfs/trips.txt", sep=',', dtype='unicode')
csv_stopout = pd.read_csv("stops_11aprile.out.csv", sep=';', dtype='unicode')
csv_gtfs_stops = pd.read_csv("gtfs_pt_stops.add_filtered.csv", sep=';', dtype='unicode')

# File merging
# NOTE(review): csv_stop_times_trips is not used again in this cell; it is
# kept in case a later cell reads it.
csv_stop_times_trips = csv_stop_times.merge(csv_trips, on=['vehicle_id'])
csv_stoptimes_stops = csv_stop_times.merge(csv_stops, on=["stop_id"])
csv_stoptimes_stops2 = csv_stoptimes_stops[['vehicle_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_name', 'stop_sequence', 'stop_lat', 'stop_lon']]

# Rename the stop-output columns to the key names used by the merges below.
csv_stopout.rename(
    columns={'stopinfo_busStop': 'busStop_id', 'stopinfo_id': 'vehicle_id'},
    inplace=True,
)
csv_stopout2 = csv_stopout[['busStop_id', 'stopinfo_delay', 'stopinfo_ended', 'vehicle_id', 'stopinfo_initialPersons', 'stopinfo_loadedPersons', 'stopinfo_started', 'stopinfo_type', 'stopinfo_unloadedPersons']]
csv_stops_stopout2 = csv_gtfs_stops.merge(csv_stopout2, on=["busStop_id"])
csv_stopout3 = csv_stoptimes_stops2.merge(csv_stops_stopout2, on=["vehicle_id", "stop_name"])

# Same file and parse options as csv_trips, so reuse the loaded frame
# instead of reading gtfs/trips.txt a second time.
routes = csv_trips.copy()
csv_stopout4 = csv_stopout3.merge(routes, on=["vehicle_id"])

# Output
# sort_values returns a new frame; the result must be assigned, otherwise the
# file is written unsorted. vehicle_id is text, so the order is lexicographic.
csv_stopout4 = csv_stopout4.sort_values(["vehicle_id"], ascending=True)
csv_stopout4.to_csv(r'dataset_sintetico_11aprile_prova.csv', index = False, sep=';')

# Reload the merged synthetic dataset, reading every column as text
data = pd.read_csv("dataset_sintetico_11aprile_prova.csv", sep=';', dtype='unicode')

# Arrival and departure time correction: hour prefixes 24-28 are rewritten to
# 00-04 so that the values can be parsed with %H further down.
hour_prefix_fixes = {
    '^24:': '00:',
    '^25:': '01:',
    '^26:': '02:',
    '^27:': '03:',
    '^28:': '04:',
}
for column in ('arrival_time', 'departure_time'):
    corrected = data[column]
    for pattern, replacement in hour_prefix_fixes.items():
        corrected = corrected.str.replace(pattern, replacement, regex=True)
    data[column] = corrected

# A leading "1:00:" on the stop start time is collapsed to "00:"
data['stopinfo_started'] = data['stopinfo_started'].str.replace('^1:00:', '00:', regex=True)

# Add date: drop anything after the first '.', prepend the fixed day and parse.
# Values that do not match the format become NaT (errors='coerce').
time_columns = ['arrival_time', 'departure_time', 'stopinfo_started']
for column in time_columns:
    whole_seconds = data[column].str.split('.').str[0]
    data[column] = pd.to_datetime(
        '11/04/2022 ' + whole_seconds,
        format='%d/%m/%Y %H:%M:%S',
        errors='coerce',
    )

# Check for null values after conversion
conversion_failed = data[time_columns].isnull().any().any()
if not conversion_failed:
    print("Conversion successfully completed")
else:
    print("Error: invalid time format")

# Rename columns
data = data.rename(columns={'busStop_id': 'stop_id_SUMO'})

# Calculate SUMO delay: seconds between the recorded stop start and the
# scheduled arrival time
start_minus_arrival = data['stopinfo_started'] - data['arrival_time']
data['sumo_delay'] = start_minus_arrival.dt.total_seconds()

# Save the final synthetic dataset
data.to_csv(r'Dataset_definitivo_11aprile_new.csv', index=False, sep=';')

Conversion successfully completed
