In [1]:
import duckdb
import pandas as pd

# Open the on-disk DuckDB database in read-write mode so the table below
# can be created on the first run.
connection = duckdb.connect("../data/transport_data.db", read_only=False)

# Load every CSV matching ../data/delay_data/*.csv into one `services` table.
# IF NOT EXISTS means a re-run leaves an already existing table untouched.
query = """ CREATE TABLE IF NOT EXISTS services AS SELECT * FROM '../data/delay_data/*.csv'"""
connection.execute(query)

# Show the table schema as the cell output; previously the DESCRIBE relation
# was only bound to `r` and never displayed.
r = connection.sql("DESCRIBE services")
r.df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [2]:
# Count the rows of `services` whose PRODUKT_ID is 'Zug'
query = """
SELECT count(*)
FROM services
WHERE PRODUKT_ID='Zug'
"""
train_rows = connection.sql(query)
result = train_rows.df()

# Emit the label and the one-row frame, each on its own line
print("count:", result, sep="\n")



count:
   count_star()
0       4772714


In [13]:
# Rank SBB train lines by their mean arrival delay and keep the top `top_n`.
# The delay is AN_PROGNOSE minus ANKUNFTSZEIT (parsed from 'dd.mm.YYYY HH:MM'
# text; presumably the scheduled arrival), converted from seconds to hours.
# Only rows whose AN_PROGNOSE_STATUS is 'REAL' are considered.
# `top_n` drives both the SQL LIMIT and the printed label so the two cannot
# drift apart (the label said 20 while the query used LIMIT 40).
top_n = 20
query = f"""
SELECT LINIEN_ID, AVG(epoch((AN_PROGNOSE - try_strptime(ANKUNFTSZEIT, '%d.%m.%Y %H:%M'))))/3600 as avg_delay, count(*) as count
FROM services
WHERE PRODUKT_ID='Zug'
AND BETREIBER_NAME='Schweizerische Bundesbahnen SBB'
AND AN_PROGNOSE_STATUS='REAL'
GROUP BY LINIEN_ID
ORDER BY 2 DESC
LIMIT {top_n}
"""
result = connection.sql(query).df()

# Render every column of the frame instead of eliding the middle ones
pd.set_option('display.max_columns', None)
print(f"{top_n} most delayed trains:")
result

20 most delayed trains:


Unnamed: 0,LINIEN_ID,avg_delay,count
0,17790,1.846713,6
1,17786,1.4425,4
2,466,1.420501,66
3,13491,1.254611,5
4,17788,1.085722,5
5,31216,1.030556,1
6,17784,0.968056,2
7,31438,0.906815,15
8,70196,0.848472,2
9,73290,0.830625,8


In [5]:
# Fetch every 'Zug' row whose operator name is 'Schweizerische Bundesbahnen SBB'
query = """
SELECT *
FROM services
WHERE PRODUKT_ID='Zug'
AND BETREIBER_NAME='Schweizerische Bundesbahnen SBB'
"""
sbb_rows = connection.sql(query)
result = sbb_rows.df()
result

Unnamed: 0,BETRIEBSTAG,FAHRT_BEZEICHNER,BETREIBER_ID,BETREIBER_ABK,BETREIBER_NAME,PRODUKT_ID,LINIEN_ID,LINIEN_TEXT,UMLAUF_ID,VERKEHRSMITTEL_TEXT,ZUSATZFAHRT_TF,FAELLT_AUS_TF,BPUIC,HALTESTELLEN_NAME,ANKUNFTSZEIT,AN_PROGNOSE,AN_PROGNOSE_STATUS,ABFAHRTSZEIT,AB_PROGNOSE,AB_PROGNOSE_STATUS,DURCHFAHRT_TF
0,2024-09-01,ch:1:sjyid:100001:1007-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,1007,IC,,IC,False,False,8503424,Schaffhausen,,NaT,,01.09.2024 05:47,2024-09-01 05:47:34,REAL,False
1,2024-09-01,ch:1:sjyid:100001:1007-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,1007,IC,,IC,False,False,8503000,Zürich HB,01.09.2024 06:26,2024-09-01 06:26:05,REAL,,NaT,,False
2,2024-09-01,ch:1:sjyid:100001:1009-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,1009,IC,,IC,False,False,8503424,Schaffhausen,,NaT,,01.09.2024 06:47,2024-09-01 06:47:18,REAL,False
3,2024-09-01,ch:1:sjyid:100001:1009-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,1009,IC,,IC,False,False,8503000,Zürich HB,01.09.2024 07:26,2024-09-01 07:26:36,REAL,,NaT,,False
4,2024-09-01,ch:1:sjyid:100001:1011-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,1011,IC,,IC,False,False,8503424,Schaffhausen,,NaT,,01.09.2024 07:47,2024-09-01 07:47:42,REAL,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895692,2024-09-30,ch:1:sjyid:100001:992-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,992,IC61,,IC,False,False,8507492,Interlaken Ost,,NaT,,30.09.2024 23:00,2024-09-30 23:00:50,REAL,False
1895693,2024-09-30,ch:1:sjyid:100001:992-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,992,IC61,,IC,False,False,8507493,Interlaken West,30.09.2024 23:04,2024-09-30 23:04:20,REAL,30.09.2024 23:05,2024-09-30 23:06:01,REAL,False
1895694,2024-09-30,ch:1:sjyid:100001:992-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,992,IC61,,IC,False,False,8507483,Spiez,30.09.2024 23:22,2024-09-30 23:21:32,REAL,30.09.2024 23:22,2024-09-30 23:22:54,REAL,False
1895695,2024-09-30,ch:1:sjyid:100001:992-001,85:11,SBB,Schweizerische Bundesbahnen SBB,Zug,992,IC61,,IC,False,False,8507100,Thun,30.09.2024 23:32,2024-09-30 23:31:30,REAL,30.09.2024 23:33,2024-09-30 23:33:53,REAL,False


In [18]:
# Render every column when a frame is displayed
pd.set_option('display.max_columns', None)

# List line id, stop name and BPUIC for rows of line 17790 (at most 40 rows)
query = """
SELECT LINIEN_ID, HALTESTELLEN_NAME, BPUIC
FROM services
WHERE LINIEN_ID IN ['17790']
--AND AN_PROGNOSE_STATUS='REAL'
LIMIT 40
"""
line_rows = connection.sql(query)
result = line_rows.df()
result

Unnamed: 0,LINIEN_ID,HALTESTELLEN_NAME,BPUIC
0,17790,St. Margrethen SG,8506314
1,17790,Winterthur,8506000
2,17790,Zürich HB,8503000
3,17790,St. Margrethen SG,8506314
4,17790,Winterthur,8506000
5,17790,Zürich HB,8503000


In [24]:
# Render every column when a frame is displayed
pd.set_option('display.max_columns', None)

# BPUIC codes to filter on; the list's Python repr is interpolated verbatim
# into the SQL text below.
stops = [8503000]
query = f"""
SELECT BPUIC, HALTESTELLEN_NAME, LINIEN_ID
FROM services
WHERE BPUIC IN {stops}
AND HALTESTELLEN_NAME NOT NULL
AND PRODUKT_ID='Zug'
LIMIT 40
"""
stop_rows = connection.sql(query)
result = stop_rows.df()
result

Unnamed: 0,BPUIC,HALTESTELLEN_NAME,LINIEN_ID
0,8503000,Zürich HB,1007
1,8503000,Zürich HB,1009
2,8503000,Zürich HB,1011
3,8503000,Zürich HB,10318
4,8503000,Zürich HB,10320
5,8503000,Zürich HB,10322
6,8503000,Zürich HB,10328
7,8503000,Zürich HB,10334
8,8503000,Zürich HB,10340
9,8503000,Zürich HB,10368


In [29]:

# One row per (LINIEN_ID, BPUIC) group for 'Zug' rows of line 1007, with a
# stop name picked from each group via FIRST (at most 40 rows)
query = """
SELECT BPUIC, FIRST(HALTESTELLEN_NAME), LINIEN_ID
FROM services
WHERE LINIEN_ID='1007'
AND HALTESTELLEN_NAME NOT NULL
AND PRODUKT_ID='Zug'
GROUP BY 3, 1
LIMIT 40
"""
grouped_stops = connection.sql(query)
result = grouped_stops.df()

result

Unnamed: 0,BPUIC,"""first""(HALTESTELLEN_NAME)",LINIEN_ID
0,8503400,Bülach,1007
1,8503404,Rafz,1007
2,8503000,Zürich HB,1007
3,8503421,Jestetten,1007
4,8503424,Schaffhausen,1007


In [None]:
# Close the DuckDB connection to ../data/transport_data.db once all queries are done
connection.close()