# Notebook for getting connections dataframe


In [7]:
import pyspark.sql.functions as F
import math
import pandas as pd

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Start Spark

In [2]:
%%configure
{"conf": {
    "spark.app.name": "datavirus_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7326,application_1589299642358_1822,pyspark,idle,Link,Link,
7398,application_1589299642358_1894,pyspark,idle,Link,Link,
7433,application_1589299642358_1929,pyspark,idle,Link,Link,
7443,application_1589299642358_1939,pyspark,idle,Link,Link,
7444,application_1589299642358_1940,pyspark,busy,Link,Link,
7446,application_1589299642358_1942,pyspark,idle,Link,Link,
7447,application_1589299642358_1943,pyspark,idle,Link,Link,
7449,application_1589299642358_1945,pyspark,idle,Link,Link,
7450,application_1589299642358_1946,pyspark,idle,Link,Link,
7452,application_1589299642358_1948,pyspark,idle,Link,Link,


In [3]:
# Initialization: evaluating `spark` as the cell's only expression displays the
# session object; the captured output of this cell shows the Spark application
# being started at this point.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7466,application_1589299642358_1962,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f905196e710>

In [3]:
%%info

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6663,application_1589299642358_1152,pyspark,idle,Link,Link,
6668,application_1589299642358_1157,pyspark,idle,Link,Link,
6670,application_1589299642358_1159,pyspark,idle,Link,Link,
6671,application_1589299642358_1160,pyspark,idle,Link,Link,
6672,application_1589299642358_1161,pyspark,idle,Link,Link,
6673,application_1589299642358_1162,pyspark,idle,Link,Link,✔


In [4]:
# Drop everything cached in this Spark session so the cells below start from a
# clean cache.
spark.catalog.clearCache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Compute probabilities and average delays using all possible data

In [5]:
# Read the full SBB "istdaten" table and the list of station ids lying within
# 15 km of Zurich, then keep only the SBB rows whose station (BPUIC) is in
# that list. The CSV is read without a header, so its single column is `_c0`;
# it is dropped again right after the join.
# (To experiment on one day only, filter on BETRIEBSTAG, e.g. '15.05.2019'.)
sbb = spark.read.orc('/data/sbb/orc/istdaten')
ids = spark.read.csv('../data/zurich_stations_ids.csv')

station_in_zurich_area = sbb['BPUIC'] == ids['_c0']
sbb_zurich = sbb.join(ids, station_in_zurich_area).drop("_c0")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Reduce the SBB data to one row per distinct stop event with:
#   - scheduled arrival/departure strings (empty string -> null),
#   - arrival/departure delay = actual minus scheduled unix timestamp,
#   - the identifying columns, renamed to English.
def _blank_to_null(column_name):
    """Return the column with empty strings replaced by null."""
    return F.when(F.col(column_name) == '', None).otherwise(F.col(column_name))


def _delay(scheduled_column, actual_column):
    """Return actual minus scheduled time, both parsed to unix timestamps."""
    scheduled = F.unix_timestamp(F.col(scheduled_column), "dd.MM.yyyy HH:mm").cast('long')
    actual = F.unix_timestamp(F.col(actual_column), "dd.MM.yyyy HH:mm:ss").cast("long")
    return actual - scheduled


delays_df = (
    sbb_zurich
    .select(
        _blank_to_null('ankunftszeit').alias('Arrival_Time'),
        _blank_to_null('abfahrtszeit').alias('Departure_Time'),
        _delay('ankunftszeit', 'an_prognose').alias('Arrival_Delay'),
        _delay('abfahrtszeit', 'ab_prognose').alias('Departure_Delay'),
        F.col('BETRIEBSTAG').alias('Day'),
        F.col('FAHRT_BEZEICHNER').alias('Trip_ID'),
        F.col('LINIEN_ID').alias('Line_ID'),
        F.col('PRODUKT_ID').alias('Type'),
        F.col('BPUIC').alias('Station_ID'),
        F.col('HALTESTELLEN_NAME').alias('Station_Name'),
    )
    .dropDuplicates()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Remove rows corresponding to trips that appear only once in the dataframe.
# Such trips actually come from / go to stations outside Zurich, so they
# cannot form a connection between two stations of this dataset.
#
# The helper frame gets its own name on purpose: `ids` is the station-id frame
# (column `_c0`) loaded with the SBB data, and the stop-times cell further down
# joins on ids['_c0'] again. Overwriting `ids` here would break that cell when
# the notebook is run top to bottom.
multi_stop_trip_ids = (
    delays_df
    .groupBy('Trip_ID', 'Day')
    .count()
    .where(F.col('count') > 1)
    .select('Trip_ID')
    .distinct()
)

# NOTE(review): stops are counted per (Trip_ID, Day) but the filter is applied
# per Trip_ID only, so a trip is kept on every day as soon as it has more than
# one stop on at least one day -- confirm this is intended.
df = delays_df.join(multi_stop_trip_ids, "Trip_ID").orderBy('Trip_ID','Arrival_Time','Departure_Time')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Handle missing times:
#   1. drop rows where both "Arrival_Time" and "Departure_Time" are null
#      (thresh=1 keeps every row that has at least one of the two);
#   2. copy "Arrival_Time" into "Departure" when "Departure_Time" is null, and
#      vice versa, so every remaining row has both "Arrival" and "Departure".
df = (
    df
    .dropna(thresh=1, subset=('Arrival_Time', 'Departure_Time'))
    .withColumn("Departure", F.coalesce(df['Departure_Time'], df['Arrival_Time']))
    .withColumn("Arrival", F.coalesce(df['Arrival_Time'], df['Departure_Time']))
    .drop("Departure_Time", "Arrival_Time")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Delay statistics per connection, where a connection is a (trip, station)
# pair aggregated over all available days:
#   Probability = (# stops with a positive arrival delay) / (# recorded stops)
#   Mean_Delay  = mean arrival delay over the delayed stops only

# Compute total number of connections (trip and station)
df_total_trips = df.groupBy('Trip_ID','Station_Name')\
                   .count().toDF('Trip_ID','Station_Name','Total')

# Compute number of delayed connections (trip and station)
df_delayed_trips = df.where(df['Arrival_Delay']>0).groupBy('Trip_ID','Station_Name')\
                    .count().toDF('Trip_ID','Station_Name','Delayed')

# Compute average delay for every connection (trip and station)
df_mean_delays = df.where(df['Arrival_Delay']>0).groupBy('Trip_ID','Station_Name')\
                .agg(F.mean('Arrival_Delay').alias("Mean_Delay"))

# Join dataframes.
# Left joins are required here: a (trip, station) that was never delayed has no
# row in df_delayed_trips / df_mean_delays, and an inner join would silently
# remove that connection from the final dataset instead of giving it a delay
# probability of 0. Missing counts and means are therefore filled with 0.
df_prob = df_total_trips.join(df_delayed_trips, on = ['Trip_ID','Station_Name'], how = 'left')\
            .fillna(0, subset=['Delayed'])\
            .withColumn("Probability", F.col("Delayed")/F.col("Total")).drop("Delayed","Total")

df_prob_and_delays = df_prob.join(df_mean_delays, on = ['Trip_ID','Station_Name'], how = 'left')\
            .fillna(0.0, subset=['Mean_Delay'])

# Attach the statistics to every stop row; the per-row delays are no longer needed.
df_final = df.join(df_prob_and_delays,on =['Trip_ID','Station_Name']).drop('Departure_Delay','Arrival_Delay')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Create the final dataframe showing delay probabilities for each connection (on a normal working day)

In [12]:
# Keep only the stops of a single working day whose arrival hour is 5 or later
# and whose departure hour is 20 or earlier (i.e. between 5h and 21h).
def _hour_of(time_column):
    """Return the hour of day parsed from a "dd.MM.yyyy HH:mm" string column."""
    parsed = F.unix_timestamp(F.col(time_column), "dd.MM.yyyy HH:mm").cast('timestamp')
    return F.hour(parsed)


df_day = df_final.where(F.col('Day') == '15.05.2019')
df_min_hour = df_day.where(_hour_of('Arrival') >= 5)
df_max_hour = df_min_hour.where(_hour_of('Departure') <= 20)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Number the stops of every trip and day consecutively (1, 2, 3, ...) in
# travel order.
#
# row_number() is used instead of rank(): when two stops of a trip share the
# same departure minute, rank() gives both the same number and skips the next
# one (1, 1, 3), and the connection cell below -- which pairs stop k with stop
# k+1 -- would then lose those connections. Arrival is used as a tie-breaker
# for stops with equal departure times.
from pyspark.sql import Window
trip_window = Window.partitionBy('Trip_ID','Day').orderBy(F.asc('Departure'), F.asc('Arrival'))
trip_rank = F.row_number().over(trip_window).alias('stop')
# fillna(0) replaces any remaining nulls in numeric columns with 0.
begin = df_max_hour.select('*', trip_rank).alias('begin').orderBy('Trip_ID','Arrival','Departure').fillna(0)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
# Build one row per connection between two consecutive stops of a trip.
#
# `end` is the arrival side: its stop number is shifted down by one, so that in
# the join below the row of stop k+1 lines up with the departure-side row
# (`begin`) of stop k.
end = (
    begin
    .drop('Departure')
    .withColumn('stop', begin['stop'] - 1)
    .alias('end')
)

# Departure side: keep only the departure time; the arrival time and the delay
# statistics of a connection are taken from its end stop.
departure_side = begin.drop('Arrival', 'Mean_Delay', 'Probability')

# NOTE: the 'Propability' spelling is kept as-is because it becomes the header
# of the exported CSV -- presumably consumers rely on it; verify before renaming.
data = (
    departure_side
    .join(end, on=['stop','Day','Trip_ID','Type','Line_ID'])
    .orderBy('Trip_ID','Arrival','Departure')
    .drop('stop')
    .toDF('Day','Trip_ID','Type','Line_ID','Start_Station','Start_ID','Start_Time',
          'Stop_Station','Stop_ID','Stop_Time','Propability','Mean_Delay')
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Sanity check: print the column names and types of the connections dataframe.
data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- Day: string (nullable = true)
 |-- Trip_ID: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Line_ID: string (nullable = true)
 |-- Start_Station: string (nullable = true)
 |-- Start_ID: string (nullable = true)
 |-- Start_Time: string (nullable = true)
 |-- Stop_Station: string (nullable = true)
 |-- Stop_ID: string (nullable = true)
 |-- Stop_Time: string (nullable = true)
 |-- Propability: double (nullable = false)
 |-- Mean_Delay: double (nullable = false)

### Save final dataframe in CSV file

In [None]:
%%spark -o df_stations -n -1

In [18]:
%%local
# Runs in the local kernel, not on the cluster: `data` must already exist here
# as a pandas dataframe (exported from the Spark session with `%%spark -o`).
data.to_csv("../data/Zurich_TransportConnections_WORKINGDAY.csv", index=False)

### Use the stop-times table (TODO: document what this is used for)

In [None]:
# Load the timetable `stop_times` table and preview its first rows.
# The commented-out reads are the sibling timetable tables (trips, calendar,
# routes); they are not used by the cells visible in this notebook.
#trips_df = spark.read.orc("hdfs:///data/sbb/timetables/orc/trips")
#calendar_df = spark.read.orc("hdfs:///data/sbb/timetables/orc/calendar")
#routes_df = spark.read.orc("hdfs:///data/sbb/timetables/orc/routes")
stop_times_df = spark.read.orc("hdfs:///data/sbb/timetables/orc/stop_times")
stop_times_df.show(5)

In [None]:
# Timetable stop times restricted to the Zurich-area stations.
# `ids` must be the station-id frame read from zurich_stations_ids.csv, whose
# single column is `_c0`.
#
# Duplicates are removed first and the sort is applied last: dropDuplicates()
# redistributes the rows, so sorting before it does not guarantee the order
# in which show() displays them.
relevant_times = (
    stop_times_df
    .join(ids, stop_times_df['stop_id'] == ids['_c0'])
    .select("stop_sequence", "stop_id", "arrival_time", "departure_time")
    .dropDuplicates()
    .orderBy('stop_id', 'arrival_time', 'departure_time', 'stop_sequence')
)
relevant_times.show(5)