In [33]:
import duckdb
import pandas as pd

# Connect to the database
connection = duckdb.connect("transport_data.db", read_only=False)

In [34]:
date='2024-09-30' #user can enter
weekday=pd.to_datetime(date).weekday()+1

In [35]:
#graph start and end times
starttime=600
endtime=780

In [36]:
#Filter same day of 6 months according to selected date
query = f""" 
SELECT *
FROM services 
WHERE PRODUCT_ID='Zug' 
AND ARRIVAL_PREDICTION_STATUS='REAL' 
AND DEPARTURE_PREDICTION_STATUS='REAL'
AND strftime('%w', OPERATING_DAY) = '{weekday}'"""


df_Zug = connection.execute(query).df()

In [37]:
#Add duplicate_number to make it unique for Stop Id duplication cases in some trips
df_Zug = df_Zug.sort_values('ARRIVAL_PREDICTION')
df_Zug['duplicate_number'] = df_Zug.groupby(['OPERATING_DAY', 'TRIP_IDENTIFIER', 'BPUIC']).cumcount() + 1

In [38]:
#Add  real times of same day of 6 months to as joining selected date data
def merge_predictions(df_Zug, base_date):
    # Tüm unique tarihleri al 
    unique_dates = sorted(df_Zug['OPERATING_DAY'].unique(), reverse=True)
    
    # Base date'i unique_dates'den çıkar
    join_dates = [date for date in unique_dates if date != base_date]
    
    # Base date ile başlangıç dataframe'ini oluştur
    result_df = df_Zug[df_Zug['OPERATING_DAY'] == base_date].copy()
    
    # Base date için kolon isimlerini güncelle
    base_date_str = pd.to_datetime(base_date).strftime('%Y-%m-%d')
    result_df = result_df.rename(columns={
        'DEPARTURE_PREDICTION': f'DEPARTURE_PREDICTION_{base_date_str}',
        'ARRIVAL_PREDICTION': f'ARRIVAL_PREDICTION_{base_date_str}'
    })
    
    # Her tarih için join işlemi
    for date in join_dates:  # base date hariç diğer tarihler için join
        # Geçici dataframe oluştur
        temp_df = df_Zug[df_Zug['OPERATING_DAY'] == date][
            ['TRIP_IDENTIFIER', 'BPUIC', 'DEPARTURE_PREDICTION', 'ARRIVAL_PREDICTION', 'duplicate_number']
        ]
        
        # Sütun isimlerini tarihe göre güncelle
        date_str = pd.to_datetime(date).strftime('%Y-%m-%d')
        temp_df = temp_df.rename(columns={
            'DEPARTURE_PREDICTION': f'DEPARTURE_PREDICTION_{date_str}',
            'ARRIVAL_PREDICTION': f'ARRIVAL_PREDICTION_{date_str}'
        })
        
        # Join işlemi
        result_df = result_df.merge(
            temp_df,
            on=['TRIP_IDENTIFIER', 'BPUIC', 'duplicate_number'],
            how='left'
        )
    
    return result_df

# Kullanım örneği:
base_date = date  # veya istediğiniz tarih
result_df = merge_predictions(df_Zug, base_date)

print(result_df.shape)

(122402, 24)


In [39]:
#Change planned times to minutes
result_df['departure_time']=pd.to_datetime(result_df['DEPARTURE_TIME']).dt.time.apply(lambda x: x.hour * 60 + x.minute)
result_df['arrival_time']=pd.to_datetime(result_df['ARRIVAL_TIME']).dt.time.apply(lambda x: x.hour * 60 + x.minute)

  result_df['departure_time']=pd.to_datetime(result_df['DEPARTURE_TIME']).dt.time.apply(lambda x: x.hour * 60 + x.minute)
  result_df['arrival_time']=pd.to_datetime(result_df['ARRIVAL_TIME']).dt.time.apply(lambda x: x.hour * 60 + x.minute)


In [40]:
#Filter time interval
mask = (result_df['arrival_time'] >= starttime) & (result_df['arrival_time'] <=endtime)
df_filtered = result_df[mask]

In [41]:
#Time to minute function
def convert_time_columns(df):
    # Prediction kolonlarını bul
    prediction_columns = [col for col in df.columns if 'PREDICTION' in col and '_202' in col]
    
    # Her bir prediction kolonu için time dönüşümü uygula
    for col in prediction_columns:
        # Saat ve dakikayı toplam dakikaya çevir
        df[col] = df[col].dt.hour * 60 + df[col].dt.minute
    
    return df

In [42]:
#Run for all real time columns
df_filtered = convert_time_columns(df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].dt.hour * 60 + df[col].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].dt.hour * 60 + df[col].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].dt.hour * 60 + df[col].dt.minute
A value is trying to be set on a copy of a slice fro

In [43]:
#Graph creater function
from collections import defaultdict
import json
import re
import numpy as np

class TupleEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, tuple):
            return {'__tuple': True, 'items': obj}
        if isinstance(obj, (np.int64, np.int32)):
            return int(obj)
        return super(TupleEncoder, self).default(obj)

def get_prediction_columns(df):

    prediction_columns = []
    for col in df.columns:
        if 'PREDICTION' in col and '_202' in col:
            prediction_columns.append(col)
    
    paired_columns = []
    for i in range(0, len(prediction_columns), 2):
        if i + 1 < len(prediction_columns):
            dep_col = prediction_columns[i]
            arr_col = prediction_columns[i + 1]
            paired_columns.append((dep_col, arr_col))
    
    return paired_columns

def generate_dynamic_route_graph(data, start_stop, start_time, target_stop):
    graph = defaultdict(list)
    
    prediction_pairs = get_prediction_columns(data)
    
    def get_predictions_for_transition(from_stop_data, to_stop_data):
        predictions = []
        for dep_col, arr_col in prediction_pairs:
            departure = from_stop_data[dep_col]
            arrival = to_stop_data[arr_col]
            predictions.append((
                int(departure) if pd.notna(departure) else "NaN",
                int(arrival) if pd.notna(arrival) else "NaN"
            ))
        return predictions
    
    def build_line_path(stop, arrival_time, trip_identifier, prev_departure=None):
        next_stops = data[
            (data['TRIP_IDENTIFIER'] == trip_identifier) &
            (data['PLANNED_ARRIVAL'] > arrival_time)
        ].sort_values('PLANNED_ARRIVAL')
        
        transfers = data[
            (data['STOP_ID'] == stop) &
            (data['PLANNED_ARRIVAL'] > arrival_time) &
            (data['TRIP_IDENTIFIER'] != trip_identifier)
        ].sort_values('PLANNED_ARRIVAL')
        
        if not next_stops.empty:
            next_stop_data = next_stops.iloc[0]
            next_stop = next_stop_data['STOP_ID']
            next_arrival = int(next_stop_data['PLANNED_ARRIVAL'])
            
            current_stop_data = data[
                (data['STOP_ID'] == stop) & 
                (data['TRIP_IDENTIFIER'] == trip_identifier) &
                (data['PLANNED_ARRIVAL'] == arrival_time)
            ].iloc[0]
            
            current_departure = int(current_stop_data['PLANNED_DEPARTURE'])
            
            predictions = get_predictions_for_transition(current_stop_data, next_stop_data)
            
            transition = {
                "from": stop,
                "planned_departure": current_departure,
                "to": next_stop,
                "planned_arrival": next_arrival,
                "trip_id": trip_identifier,
                "actual_times": predictions
            }
            
            if transition not in graph[stop]:
                graph[stop].append(transition)
                build_line_path(next_stop, next_arrival, trip_identifier, current_departure)
        
        for _, transfer in transfers.iterrows():
            new_trip_identifier = transfer['TRIP_IDENTIFIER']
            new_arrival = int(transfer['PLANNED_ARRIVAL'])
            new_departure = int(transfer['PLANNED_DEPARTURE'])
            
            predictions = get_predictions_for_transition(transfer, transfer)
            
            transition = {
                "from": stop,
                "planned_departure": new_departure,
                "to": stop,
                "planned_arrival": new_arrival,
                "trip_id": new_trip_identifier,
                "actual_times": predictions
            }
            
            if transition not in graph[stop]:
                graph[stop].append(transition)
                build_line_path(stop, new_arrival, new_trip_identifier)
    
    initial_routes = data[
        (data['STOP_ID'] == start_stop) & 
        (data['PLANNED_ARRIVAL'] >= start_time)
    ].sort_values('PLANNED_ARRIVAL')
    
    if not initial_routes.empty:
        initial_route = initial_routes.iloc[0]
        initial_trip_identifier = initial_route['TRIP_IDENTIFIER']
        initial_arrival = int(initial_route['PLANNED_ARRIVAL'])
        
        build_line_path(start_stop, initial_arrival, initial_trip_identifier)
    
    for stop in graph:
        graph[stop] = [transition for transition in graph[stop] 
                      if transition['to'] != stop]
    
    formatted_graph = "Graph:\n"
    for stop in sorted(graph.keys()):
        if graph[stop]:
            output = []
            for transition in graph[stop]:
                transition_copy = transition.copy()
                transition_copy['actual_times'] = [
                    f"({dep}, {arr})" for dep, arr in transition['actual_times']
                ]
                output.append(transition_copy)
            
            json_str = json.dumps(output, indent=2, cls=TupleEncoder)
            json_str = json_str.replace('"(', '(').replace(')"', ')')
            formatted_graph += f"'{stop}': {json_str}\n"

    return formatted_graph, graph

In [44]:
#Zurih HB to Bern
data = df_filtered.rename(columns={
    'BPUIC': 'STOP_ID',
    'arrival_time': 'PLANNED_ARRIVAL',
    'departure_time': 'PLANNED_DEPARTURE'
}).sort_values(['TRIP_IDENTIFIER','PLANNED_ARRIVAL'])

# Test
start_stop = 8503000
start_time = 610
target_stop = 8501008
formatted_graph, graph = generate_dynamic_route_graph(data, start_stop, start_time, target_stop)


In [53]:
data

Unnamed: 0,OPERATING_DAY,TRIP_IDENTIFIER,OPERATOR_ID,OPERATOR_ABK,OPERATOR_NAME,PRODUCT_ID,LINE_ID,LINE_TEXT,CYCLE_ID,TRANSPORT_MODE_TEXT,...,ARRIVAL_PREDICTION_STATUS,DEPARTURE_TIME,DEPARTURE_PREDICTION_2024-09-30_x,DEPARTURE_PREDICTION_STATUS,THROUGH_TF,duplicate_number,DEPARTURE_PREDICTION_2024-09-30_y,ARRIVAL_PREDICTION_2024-09-30_y,PLANNED_DEPARTURE,PLANNED_ARRIVAL
32920,2024-09-30,5219-850065-8506290-100500,85:850065,THURBO,THURBO Deutschland,Zug,1,RE1,,Zug,...,REAL,30.09.2024 10:15,616,REAL,False,1,616,614,615,614
36513,2024-09-30,5219-850065-8506290-100500,85:850065,THURBO,THURBO Deutschland,Zug,1,RE1,,Zug,...,REAL,30.09.2024 10:47,648,REAL,False,1,648,648,647,647
39279,2024-09-30,5221-850065-8506290-110500,85:850065,THURBO,THURBO Deutschland,Zug,1,RE1,,Zug,...,REAL,30.09.2024 11:15,676,REAL,False,1,676,674,675,674
42761,2024-09-30,5221-850065-8506290-110500,85:850065,THURBO,THURBO Deutschland,Zug,1,RE1,,Zug,...,REAL,30.09.2024 11:47,707,REAL,False,1,707,706,707,707
46024,2024-09-30,5223-850065-8506290-120500,85:850065,THURBO,THURBO Deutschland,Zug,1,RE1,,Zug,...,REAL,30.09.2024 12:15,737,REAL,False,1,737,735,735,734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32272,2024-09-30,ch:1:sjyid:100064:3082-002,85:86,ZB,Zentralbahn,Zug,3082,IR,,IR,...,REAL,30.09.2024 10:08,609,REAL,False,1,609,608,608,608
32747,2024-09-30,ch:1:sjyid:100064:3082-002,85:86,ZB,Zentralbahn,Zug,3082,IR,,IR,...,REAL,30.09.2024 10:13,612,REAL,False,1,612,612,613,613
44458,2024-09-30,ch:1:sjyid:100064:3085-001,85:86,ZB,Zentralbahn,Zug,3085,IR,,IR,...,REAL,30.09.2024 12:03,723,REAL,False,1,723,721,723,720
47445,2024-09-30,ch:1:sjyid:100064:3085-001,85:86,ZB,Zentralbahn,Zug,3085,IR,,IR,...,REAL,30.09.2024 12:30,750,REAL,False,1,750,747,750,749


In [51]:
data.shape

(19760, 26)

In [47]:
print(len(graph))

598


In [45]:
#output
graph_lines = formatted_graph.split('\n')
#for i, line in enumerate(graph_lines[:500]):  # İlk 10 satırı göster
    #print(line)

In [54]:
print(graph)

defaultdict(<class 'list'>, {8503000: [{'from': 8503000, 'planned_departure': 611, 'to': 8503003, 'planned_arrival': 614, 'trip_id': 'ch:1:sjyid:100001:18737-001', 'actual_times': [(611, 615), (612, 614)]}, {'from': 8503000, 'planned_departure': 709, 'to': 8503020, 'planned_arrival': 711, 'trip_id': 'ch:1:sjyid:100001:18742-001', 'actual_times': [(708, 713), (710, 712)]}, {'from': 8503000, 'planned_departure': 757, 'to': 8503011, 'planned_arrival': 759, 'trip_id': 'ch:1:sjyid:100001:18847-001', 'actual_times': [(756, 761), (758, 760)]}, {'from': 8503000, 'planned_departure': 775, 'to': 8503006, 'planned_arrival': 779, 'trip_id': 'ch:1:sjyid:100001:18846-001', 'actual_times': [(772, 780), (775, 779)]}, {'from': 8503000, 'planned_departure': 759, 'to': 8503003, 'planned_arrival': 761, 'trip_id': 'ch:1:sjyid:100001:19547-001', 'actual_times': [(761, 765), (762, 764)]}, {'from': 8503000, 'planned_departure': 769, 'to': 8503020, 'planned_arrival': 771, 'trip_id': 'ch:1:sjyid:100001:18746-00