In [5]:
import duckdb
import pandas as pd

# Connect to the database
connection = duckdb.connect("transport_data.db", read_only=False)

In [7]:
date='2024-10-01' #user can enter
weekday=pd.to_datetime(date).weekday()+1

In [None]:
#graph start and end times
starttime=600
endtime=780

In [8]:
#Filter same day of 6 months according to selected date
query = f""" 
SELECT *
FROM services 
WHERE PRODUCT_ID='Zug' 
AND ARRIVAL_PREDICTION_STATUS='REAL' 
AND DEPARTURE_PREDICTION_STATUS='REAL'
AND strftime('%w', OPERATING_DAY) = '{weekday}'"""


df_Zug = connection.execute(query).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [9]:
#Add duplicate_number to make it unique for Stop Id duplication cases in some trips
df_Zug = df_Zug.sort_values('ARRIVAL_PREDICTION')
df_Zug['duplicate_number'] = df_Zug.groupby(['OPERATING_DAY', 'TRIP_IDENTIFIER', 'BPUIC']).cumcount() + 1

In [10]:
#Add  real times of same day of 6 months to as joining selected date data
def merge_predictions(df_Zug, base_date):
    # Tüm unique tarihleri al 
    unique_dates = sorted(df_Zug['OPERATING_DAY'].unique(), reverse=True)
    
    # Base date'i unique_dates'den çıkar
    join_dates = [date for date in unique_dates if date != base_date]
    
    # Base date ile başlangıç dataframe'ini oluştur
    result_df = df_Zug[df_Zug['OPERATING_DAY'] == base_date].copy()
    
    # Base date için kolon isimlerini güncelle
    base_date_str = pd.to_datetime(base_date).strftime('%Y-%m-%d')
    result_df = result_df.rename(columns={
        'DEPARTURE_PREDICTION': f'DEPARTURE_PREDICTION_{base_date_str}',
        'ARRIVAL_PREDICTION': f'ARRIVAL_PREDICTION_{base_date_str}'
    })
    
    # Her tarih için join işlemi
    for date in join_dates:  # base date hariç diğer tarihler için join
        # Geçici dataframe oluştur
        temp_df = df_Zug[df_Zug['OPERATING_DAY'] == date][
            ['TRIP_IDENTIFIER', 'BPUIC', 'DEPARTURE_PREDICTION', 'ARRIVAL_PREDICTION', 'duplicate_number']
        ]
        
        # Sütun isimlerini tarihe göre güncelle
        date_str = pd.to_datetime(date).strftime('%Y-%m-%d')
        temp_df = temp_df.rename(columns={
            'DEPARTURE_PREDICTION': f'DEPARTURE_PREDICTION_{date_str}',
            'ARRIVAL_PREDICTION': f'ARRIVAL_PREDICTION_{date_str}'
        })
        
        # Join işlemi
        result_df = result_df.merge(
            temp_df,
            on=['TRIP_IDENTIFIER', 'BPUIC', 'duplicate_number'],
            how='left'
        )
    
    return result_df

# Kullanım örneği:
base_date = date  # veya istediğiniz tarih
result_df = merge_predictions(df_Zug, base_date)

print(result_df.shape)

(123062, 74)


In [11]:
#Change planned times to minutes
result_df['departure_time']=pd.to_datetime(result_df['DEPARTURE_TIME']).dt.time.apply(lambda x: x.hour * 60 + x.minute)
result_df['arrival_time']=pd.to_datetime(result_df['ARRIVAL_TIME']).dt.time.apply(lambda x: x.hour * 60 + x.minute)

In [14]:
#Filter time interval
mask = (result_df['arrival_time'] >= starttime) & (result_df['arrival_time'] <=endtime)
df_filtered = result_df[mask]

In [15]:
#Time to minute function
def convert_time_columns(df):
    # Prediction kolonlarını bul
    prediction_columns = [col for col in df.columns if 'PREDICTION' in col and '_202' in col]
    
    # Her bir prediction kolonu için time dönüşümü uygula
    for col in prediction_columns:
        # Saat ve dakikayı toplam dakikaya çevir
        df[col] = df[col].dt.hour * 60 + df[col].dt.minute
    
    return df

In [16]:
#Run for all real time columns
df_filtered = convert_time_columns(df_filtered)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].dt.hour * 60 + df[col].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].dt.hour * 60 + df[col].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].dt.hour * 60 + df[col].dt.minute
A value is trying to be set on a copy of a slice fro

In [17]:
#Graph creater function
from collections import defaultdict
import json
import re
import numpy as np

class TupleEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, tuple):
            return {'__tuple': True, 'items': obj}
        if isinstance(obj, (np.int64, np.int32)):
            return int(obj)
        return super(TupleEncoder, self).default(obj)

def get_prediction_columns(df):

    prediction_columns = []
    for col in df.columns:
        if 'PREDICTION' in col and '_202' in col:
            prediction_columns.append(col)
    
    paired_columns = []
    for i in range(0, len(prediction_columns), 2):
        if i + 1 < len(prediction_columns):
            dep_col = prediction_columns[i]
            arr_col = prediction_columns[i + 1]
            paired_columns.append((dep_col, arr_col))
    
    return paired_columns

def generate_dynamic_route_graph(data, start_stop, start_time, target_stop):
    graph = defaultdict(list)
    
    prediction_pairs = get_prediction_columns(data)
    
    def get_predictions_for_transition(from_stop_data, to_stop_data):
        predictions = []
        for dep_col, arr_col in prediction_pairs:
            departure = from_stop_data[dep_col]
            arrival = to_stop_data[arr_col]
            predictions.append((
                int(departure) if pd.notna(departure) else "NaN",
                int(arrival) if pd.notna(arrival) else "NaN"
            ))
        return predictions
    
    def build_line_path(stop, arrival_time, trip_identifier, prev_departure=None):
        next_stops = data[
            (data['TRIP_IDENTIFIER'] == trip_identifier) &
            (data['PLANNED_ARRIVAL'] > arrival_time)
        ].sort_values('PLANNED_ARRIVAL')
        
        transfers = data[
            (data['STOP_ID'] == stop) &
            (data['PLANNED_ARRIVAL'] > arrival_time) &
            (data['TRIP_IDENTIFIER'] != trip_identifier)
        ].sort_values('PLANNED_ARRIVAL')
        
        if not next_stops.empty:
            next_stop_data = next_stops.iloc[0]
            next_stop = next_stop_data['STOP_ID']
            next_arrival = int(next_stop_data['PLANNED_ARRIVAL'])
            
            current_stop_data = data[
                (data['STOP_ID'] == stop) & 
                (data['TRIP_IDENTIFIER'] == trip_identifier) &
                (data['PLANNED_ARRIVAL'] == arrival_time)
            ].iloc[0]
            
            current_departure = int(current_stop_data['PLANNED_DEPARTURE'])
            
            predictions = get_predictions_for_transition(current_stop_data, next_stop_data)
            
            transition = {
                "from": stop,
                "planned_departure": current_departure,
                "to": next_stop,
                "planned_arrival": next_arrival,
                "trip_id": trip_identifier,
                "actual_times": predictions
            }
            
            if transition not in graph[stop]:
                graph[stop].append(transition)
                build_line_path(next_stop, next_arrival, trip_identifier, current_departure)
        
        for _, transfer in transfers.iterrows():
            new_trip_identifier = transfer['TRIP_IDENTIFIER']
            new_arrival = int(transfer['PLANNED_ARRIVAL'])
            new_departure = int(transfer['PLANNED_DEPARTURE'])
            
            predictions = get_predictions_for_transition(transfer, transfer)
            
            transition = {
                "from": stop,
                "planned_departure": new_departure,
                "to": stop,
                "planned_arrival": new_arrival,
                "trip_id": new_trip_identifier,
                "actual_times": predictions
            }
            
            if transition not in graph[stop]:
                graph[stop].append(transition)
                build_line_path(stop, new_arrival, new_trip_identifier)
    
    initial_routes = data[
        (data['STOP_ID'] == start_stop) & 
        (data['PLANNED_ARRIVAL'] >= start_time)
    ].sort_values('PLANNED_ARRIVAL')
    
    if not initial_routes.empty:
        initial_route = initial_routes.iloc[0]
        initial_trip_identifier = initial_route['TRIP_IDENTIFIER']
        initial_arrival = int(initial_route['PLANNED_ARRIVAL'])
        
        build_line_path(start_stop, initial_arrival, initial_trip_identifier)
    
    for stop in graph:
        graph[stop] = [transition for transition in graph[stop] 
                      if transition['to'] != stop]
    
    formatted_graph = "Graph:\n"
    for stop in sorted(graph.keys()):
        if graph[stop]:
            output = []
            for transition in graph[stop]:
                transition_copy = transition.copy()
                transition_copy['actual_times'] = [
                    f"({dep}, {arr})" for dep, arr in transition['actual_times']
                ]
                output.append(transition_copy)
            
            json_str = json.dumps(output, indent=2, cls=TupleEncoder)
            json_str = json_str.replace('"(', '(').replace(')"', ')')
            formatted_graph += f"'{stop}': {json_str}\n"
    
    return formatted_graph

In [22]:
#Zurih HB to Bern
data = df_filtered.rename(columns={
    'BPUIC': 'STOP_ID',
    'arrival_time': 'PLANNED_ARRIVAL',
    'departure_time': 'PLANNED_DEPARTURE'
}).sort_values(['TRIP_IDENTIFIER','PLANNED_ARRIVAL'])

# Test
start_stop = 8503000
start_time = 610
target_stop = 8501008
graph = generate_dynamic_route_graph(data, start_stop, start_time, target_stop)


In [23]:
#output
graph_lines = graph.split('\n')
for i, line in enumerate(graph_lines[:500]):  # İlk 10 satırı göster
    print(line)

Graph:
'8500010': [
  {
    "from": 8500010,
    "planned_departure": 757,
    "to": 8500136,
    "planned_arrival": 760,
    "trip_id": "ch:1:sjyid:100001:17342-001",
    "actual_times": [
      (747, 760),
      (758, 761),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (NaN, 760),
      (NaN, 760),
      (NaN, 760),
      (NaN, 760),
      (NaN, 760),
      (757, 759),
      (757, 760),
      (758, 761),
      (761, 764),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760),
      (757, 760)
    ]
  },
  {
    "from": 8500010,
    "planned_departure": 751,
    "to": 8500020,
    "planned_arrival": 757,
    "trip_id": "ch:1:sjyid:100001:17343-001",
    "actual_times": [
      (746, 758),
      (751, 758),
      (751, 758),
      (754, 763),
      (751, 758),
      (751, 757),
      (751, 757),
      (751, 757),
