# To Begin With...

### Name your spark application as `GASPAR_final` or `GROUP_NAME_final`.

<div class='alert alert-info'><b>Any application without a proper name would be promptly killed.</b></div>

In [17]:
%%configure
{"conf": {
    "spark.app.name": "datavirus_final"
}}

A session has already been started. If you intend to recreate the session with new configurations, please include the -f argument.


### Start Spark

In [1]:
# Initialization
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4408,application_1587988164357_1036,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7fa39a90b190>

In [2]:
%%info

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4395,application_1587988164357_1023,pyspark,idle,Link,Link,
4405,application_1587988164357_1033,pyspark,idle,Link,Link,
4406,application_1587988164357_1034,pyspark,busy,Link,Link,
4407,application_1587988164357_1035,pyspark,idle,Link,Link,
4408,application_1587988164357_1036,pyspark,idle,Link,Link,✔


### Read the [SBB actual data](https://opentransportdata.swiss/en/dataset/istdaten) in ORC format

In [3]:
sbb = spark.read.orc('/data/sbb/orc/istdaten')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
sbb.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- betriebstag: string (nullable = true)
 |-- fahrt_bezeichner: string (nullable = true)
 |-- betreiber_id: string (nullable = true)
 |-- betreiber_abk: string (nullable = true)
 |-- betreiber_name: string (nullable = true)
 |-- produkt_id: string (nullable = true)
 |-- linien_id: string (nullable = true)
 |-- linien_text: string (nullable = true)
 |-- umlauf_id: string (nullable = true)
 |-- verkehrsmittel_text: string (nullable = true)
 |-- zusatzfahrt_tf: string (nullable = true)
 |-- faellt_aus_tf: string (nullable = true)
 |-- bpuic: string (nullable = true)
 |-- haltestellen_name: string (nullable = true)
 |-- ankunftszeit: string (nullable = true)
 |-- an_prognose: string (nullable = true)
 |-- an_prognose_status: string (nullable = true)
 |-- abfahrtszeit: string (nullable = true)
 |-- ab_prognose: string (nullable = true)
 |-- ab_prognose_status: string (nullable = true)
 |-- durchfahrt_tf: string (nullable = true)

In [5]:
sbb.head(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(betriebstag=u'03.09.2018', fahrt_bezeichner=u'80:06____:17010:000', betreiber_id=u'80:06____', betreiber_abk=u'DB', betreiber_name=u'DB Regio AG', produkt_id=u'Zug', linien_id=u'17010', linien_text=u'RE', umlauf_id=u'', verkehrsmittel_text=u'RE', zusatzfahrt_tf=u'false', faellt_aus_tf=u'false', bpuic=u'8500090', haltestellen_name=u'Basel Bad Bf', ankunftszeit=u'', an_prognose=u'', an_prognose_status=u'PROGNOSE', abfahrtszeit=u'03.09.2018 05:45', ab_prognose=u'', ab_prognose_status=u'UNBEKANNT', durchfahrt_tf=u'false'), Row(betriebstag=u'03.09.2018', fahrt_bezeichner=u'80:06____:17012:000', betreiber_id=u'80:06____', betreiber_abk=u'DB', betreiber_name=u'DB Regio AG', produkt_id=u'Zug', linien_id=u'17012', linien_text=u'RE', umlauf_id=u'', verkehrsmittel_text=u'RE', zusatzfahrt_tf=u'false', faellt_aus_tf=u'false', bpuic=u'8500090', haltestellen_name=u'Basel Bad Bf', ankunftszeit=u'', an_prognose=u'', an_prognose_status=u'PROGNOSE', abfahrtszeit=u'03.09.2018 06:34', ab_prognose=u'',

### Read the station list data [BFKOORD_GEO](https://opentransportdata.swiss/en/cookbook/hafas-rohdaten-format-hrdf/#Abgrenzung)

In [4]:
metadata = spark.read.csv('/data/sbb/stations/bfkoordgeo.csv', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
metadata.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- StationID: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Remark: string (nullable = true)

In [8]:
metadata.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+---------+------+----------------+
|StationID|Longitude| Latitude|Height|          Remark|
+---------+---------+---------+------+----------------+
|  0000002|26.074412|44.446770|     0|       Bucuresti|
|  0000003| 1.811446|50.901549|     0|          Calais|
|  0000004| 1.075329|51.284212|     0|      Canterbury|
|  0000005|-3.543547|50.729172|     0|          Exeter|
|  0000007| 9.733756|46.922368|   744|Fideris, Bahnhof|
+---------+---------+---------+------+----------------+
only showing top 5 rows

In [5]:
import pyspark.sql.functions as F
import math

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
df_stations = metadata.withColumn("dlon", F.radians(F.col("Longitude")) - math.radians(8.540192)) \
             .withColumn("dlat", F.radians(F.col("Latitude")) - math.radians(47.378177)) \
             .withColumn("Distance_from_Zurich", F.asin(F.sqrt( F.sin(F.col("dlat") / 2) ** 2 + math.cos(math.radians(47.378177))
                                               *F.cos(F.radians(F.col("Latitude"))) * F.sin(F.col("dlon") / 2) ** 2)) * 2 * 3963 * 5280) \
             .drop("dlon", "dlat") \
             .filter(F.col("Distance_from_Zurich")<15000)

zurich_stations = list(df_stations.select('Remark').toPandas()['Remark'])
print(len(zurich_stations))
df_all =sbb.where(sbb["HALTESTELLEN_NAME"].isin(zurich_stations))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [123]:
df_stations.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+---------+------+--------------------+--------------------+
|StationID|Longitude| Latitude|Height|              Remark|Distance_from_Zurich|
+---------+---------+---------+------+--------------------+--------------------+
|  0000176| 8.521961|47.351679|     0|Zimmerberg-Basist...|  10676.361524930166|
|  8502572| 8.513918|47.370293|   421|Zürich, Goldbrunn...|   7107.377290914878|
|  8503000| 8.540192|47.378177|   408|           Zürich HB|2.323101710999253...|
|  8503001| 8.488940|47.391481|   399|   Zürich Altstetten|  13572.481490959273|
|  8503003| 8.548466|47.366611|   411|  Zürich Stadelhofen|   4693.551820280705|
|  8503004| 8.561372|47.350124|   408|Zürich Tiefenbrunnen|   11506.98247693669|
|  8503006| 8.544115|47.411529|   442|     Zürich Oerlikon|  12218.830294000149|
|  8503007| 8.544636|47.418747|   442|      Zürich Seebach|  14856.992780652563|
|  8503009| 8.533588|47.347440|   409|  Zürich Wollishofen|  11343.522830011038|
|  8503010| 8.530805|47.3640

In [73]:
test = df_stations.select('StationID','Longitude','Latitude')
joinedDF = test.crossJoin(test).toDF('id1','lon1','lat1','id2','lon2','lat2')
joinedDF.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------+---------+-------+--------+---------+
|    id1|    lon1|     lat1|    id2|    lon2|     lat2|
+-------+--------+---------+-------+--------+---------+
|0000176|8.521961|47.351679|0000176|8.521961|47.351679|
|0000176|8.521961|47.351679|8502572|8.513918|47.370293|
|0000176|8.521961|47.351679|8503000|8.540192|47.378177|
|0000176|8.521961|47.351679|8503001|8.488940|47.391481|
|0000176|8.521961|47.351679|8503003|8.548466|47.366611|
+-------+--------+---------+-------+--------+---------+
only showing top 5 rows

In [90]:
from pyspark.sql.types import IntegerType
distance = (
    joinedDF
        .withColumn("dlon", F.radians(F.col("lon1")) - F.radians(F.col("lon2")))
        .withColumn("dlat", F.radians(F.col("lat1")) - F.radians(F.col("lat2"))) 
        .withColumn("Distance", F.asin(F.sqrt( F.sin(F.col("dlat") / 2) ** 2 + F.cos(F.radians("lat2"))
                                           *F.cos(F.radians(F.col("lat1"))) * F.sin(F.col("dlon") / 2) ** 2)) * 2 * 3963 * 5280) \
        .drop("dlon", "dlat") 
        .filter(F.col("Distance")<500)
        .withColumn("Walking_time",F.round(2+F.col("Distance")/50).cast(IntegerType()))
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [91]:
distance.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------+---------+-------+--------+---------+------------------+------------+
|    id1|    lon1|     lat1|    id2|    lon2|     lat2|          Distance|Walking_time|
+-------+--------+---------+-------+--------+---------+------------------+------------+
|0000176|8.521961|47.351679|0000176|8.521961|47.351679|               0.0|           2|
|8502572|8.513918|47.370293|8502572|8.513918|47.370293|               0.0|           2|
|8503000|8.540192|47.378177|8503000|8.540192|47.378177|               0.0|           2|
|8503000|8.540192|47.378177|8503088|8.539170|47.377431|371.62272536447483|           9|
|8503000|8.540192|47.378177|8503446|8.541715|47.378846|448.94025446812276|          11|
|8503000|8.540192|47.378177|8587348|8.539338|47.377241| 401.8110520444312|          10|
|8503000|8.540192|47.378177|8587349|8.541742|47.377560| 444.6416307076613|          11|
|8503001|8.488940|47.391481|8503001|8.488940|47.391481|               0.0|           2|
|8503001|8.488940|47.391481|8591

**BETRIEBSTAG**: date of the trip

**FAHRT_BEZEICHNER**: identifies the trip

**BETREIBER_ABK, BETREIBER_NAME**: operator (name will contain the full name, e.g. Schweizerische Bundesbahnen for SBB)

**PRODUKT_ID**: type of transport, e.g. train, bus

**LINIEN_ID**: for trains, this is the train number

**LINIEN_TEXT,VERKEHRSMITTEL_TEXT**: for trains, the service type (IC, IR, RE, etc.)

**ZUSATZFAHRT_TF**: boolean, true if this is an additional trip (not part of the regular schedule)

**FAELLT_AUS_TF**: boolean, true if this trip failed (cancelled or not completed)

**HALTESTELLEN_NAME**: name of the stop

**ANKUNFTSZEIT**: arrival time at the stop according to schedule

**AN_PROGNOSE**: actual arrival time (when AN_PROGNOSE_STATUS is GESCHAETZT)

**AN_PROGNOSE_STATUS**: look only at lines when this is GESCHAETZT. This indicates that AN_PROGNOSE is the measured time of arrival.

**ABFAHRTSZEIT**: departure time at the stop according to schedule

**AB_PROGNOSE**: actual departure time (when AN_PROGNOSE_STATUS is GESCHAETZT)

**AB_PROGNOSE_STATUS**: look only at lines when this is GESCHAETZT. This indicates that AB_PROGNOSE is the measured time of arrival.

**DURCHFAHRT_TF**: boolean, true if the transport does not stop there

In [58]:
spark.catalog.clearCache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [41]:
# Cache a simpler dataframe with only relevant columns and without duplicated rows
simple_df = (
    df_all
    .withColumn('arrival', F.when(F.col('ankunftszeit') == '', None).otherwise(F.col('ankunftszeit')))
    .withColumn('departure', F.when(F.col('abfahrtszeit') == '', None).otherwise(F.col('abfahrtszeit'))) 
    .select('arrival','departure','BETRIEBSTAG','FAHRT_BEZEICHNER','LINIEN_ID','PRODUKT_ID','BPUIC','HALTESTELLEN_NAME')
    .toDF('Arrival_Time','Departure_Time','Day','Trip_ID','Line_ID', 'Type','Station_ID', 'Station_Name')
    .dropDuplicates()
    .cache()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
# Remove rows correspoding to trips appearing only once on the dataframe
# These trips actually come from / go to stattions outside Zurich
ids = simple_df.groupBy('Trip_ID','Day').count()
ids = ids.where(ids['count']>1).select('Trip_ID').distinct()
df = simple_df.join(ids, "Trip_ID").orderBy('Trip_ID','Arrival_Time','Departure_Time')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
# Solve null values:
# Remove columns with more than two null-values
# Copy "Arrival_Time" value if "Departure_Time" is null and viceversa
df = df.dropna(thresh=1,subset=('Arrival_Time','Departure_Time')) \
        .withColumn("Departure",F.coalesce(df.Departure_Time,df.Arrival_Time))\
        .withColumn("Arrival", F.coalesce(df.Arrival_Time,df.Departure_Time))\
        .drop("Departure_Time","Arrival_Time")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [44]:
# Rank stops by departure_time for every trip and day 
from pyspark.sql import Window
trip_window = Window.partitionBy('Trip_ID','Day').orderBy(F.asc('Departure'))
trip_rank = F.rank().over(trip_window).alias('stop')
begin = df.select('*', trip_rank).alias('begin').orderBy('Trip_ID','Arrival','Departure').cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [45]:
begin.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+----------+-------+-----+----------+---------------+----------------+----------------+----+
|        Trip_ID|       Day|Line_ID| Type|Station_ID|   Station_Name|       Departure|         Arrival|stop|
+---------------+----------+-------+-----+----------+---------------+----------------+----------------+----+
|85:11:10040:001|01.12.2019|    Zug|10040|   8503006|Zürich Oerlikon|01.12.2019 22:15|01.12.2019 22:13|   1|
|85:11:10040:001|01.12.2019|    Zug|10040|   8503000|      Zürich HB|01.12.2019 22:21|01.12.2019 22:21|   2|
|85:11:10040:001|02.12.2019|    Zug|10040|   8503006|Zürich Oerlikon|02.12.2019 22:15|02.12.2019 22:13|   1|
|85:11:10040:001|02.12.2019|    Zug|10040|   8503000|      Zürich HB|02.12.2019 22:21|02.12.2019 22:21|   2|
|85:11:10040:001|03.12.2019|    Zug|10040|   8503006|Zürich Oerlikon|03.12.2019 22:15|03.12.2019 22:13|   1|
+---------------+----------+-------+-----+----------+---------------+----------------+----------------+----+
only showing top 5 

In [46]:
# Create dataframe for every connection 
end = begin.drop('Departure').withColumn('stop', begin.stop -1).alias('end')
data = begin.drop('Arrival').join(end, on=['stop','Day','Trip_ID','Type','Line_ID'])\
            .drop('stop').orderBy('Trip_ID','Arrival','Departure')\
            .toDF('Day','Trip_ID','Line_ID','Type','Start_ID','Start_Station','Start_Time','Stop_ID', 'Stop_Station','Stop_Time')\

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [47]:
data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------------+-------+----+--------+---------------+----------------+-------+------------+----------------+
|       Day|        Trip_ID|Line_ID|Type|Start_ID|  Start_Station|      Start_Time|Stop_ID|Stop_Station|       Stop_Time|
+----------+---------------+-------+----+--------+---------------+----------------+-------+------------+----------------+
|01.12.2019|85:11:10040:001|  10040| Zug| 8503006|Zürich Oerlikon|01.12.2019 22:15|8503000|   Zürich HB|01.12.2019 22:21|
|02.12.2019|85:11:10040:001|  10040| Zug| 8503006|Zürich Oerlikon|02.12.2019 22:15|8503000|   Zürich HB|02.12.2019 22:21|
|03.12.2019|85:11:10040:001|  10040| Zug| 8503006|Zürich Oerlikon|03.12.2019 22:15|8503000|   Zürich HB|03.12.2019 22:21|
|04.12.2019|85:11:10040:001|  10040| Zug| 8503006|Zürich Oerlikon|04.12.2019 22:15|8503000|   Zürich HB|04.12.2019 22:21|
|05.12.2019|85:11:10040:001|  10040| Zug| 8503006|Zürich Oerlikon|05.12.2019 22:15|8503000|   Zürich HB|05.12.2019 22:21|
+----------+------------

In [50]:
data.where(data['Day']=='02.12.2019').where(data['Type']=='Zug').show(10) #.where(data['Trip_ID']=='85:11:14082:006').show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------------+-------+----+--------+------------------+----------------+-------+---------------+----------------+
|       Day|        Trip_ID|Line_ID|Type|Start_ID|     Start_Station|      Start_Time|Stop_ID|   Stop_Station|       Stop_Time|
+----------+---------------+-------+----+--------+------------------+----------------+-------+---------------+----------------+
|02.12.2019|85:11:10040:001|  10040| Zug| 8503006|   Zürich Oerlikon|02.12.2019 22:15|8503000|      Zürich HB|02.12.2019 22:21|
|02.12.2019|85:11:10049:001|  10049| Zug| 8503000|         Zürich HB|02.12.2019 22:39|8503006|Zürich Oerlikon|02.12.2019 22:45|
|02.12.2019|85:11:14016:004|  14016| Zug| 8503009|Zürich Wollishofen|02.12.2019 05:02|8503010|    Zürich Enge|02.12.2019 05:05|
|02.12.2019|85:11:14016:004|  14016| Zug| 8503010|       Zürich Enge|02.12.2019 05:06|8503011|Zürich Wiedikon|02.12.2019 05:07|
|02.12.2019|85:11:14016:004|  14016| Zug| 8503011|   Zürich Wiedikon|02.12.2019 05:07|8503000|      Züri

In [51]:
data.where(data['Day']=='02.12.2019').where(data['Type']=='Bus').drop('Trip_ID').show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+----------+----+--------+--------------------+----------------+-------+--------------------+----------------+
|       Day|   Line_ID|Type|Start_ID|       Start_Station|      Start_Time|Stop_ID|        Stop_Station|       Stop_Time|
+----------+----------+----+--------+--------------------+----------------+-------+--------------------+----------------+
|02.12.2019|85:773:787| Bus| 8591347|Zürich, Schürgist...|02.12.2019 20:48|8591047|    Zürich, Aubrücke|02.12.2019 20:49|
|02.12.2019|85:773:787| Bus| 8591047|    Zürich, Aubrücke|02.12.2019 20:49|8591225|Zürich, Genossens...|02.12.2019 20:52|
|02.12.2019|85:773:787| Bus| 8591225|Zürich, Genossens...|02.12.2019 20:52|8591318|    Zürich, Riedbach|02.12.2019 20:53|
|02.12.2019|85:773:787| Bus| 8591318|    Zürich, Riedbach|02.12.2019 20:53|8591172|   Zürich, Hagenholz|02.12.2019 20:54|
|02.12.2019|85:773:787| Bus| 8591172|   Zürich, Hagenholz|02.12.2019 20:54|8591256|Zürich, Leutschen...|02.12.2019 20:55|
|02.12.2019|85:773:787| 

In [52]:
data.where(data['Day']=='02.12.2019').where(data['Type']=='Tram').drop('Trip_ID').show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-----------+----+--------+--------------------+----------------+-------+--------------------+----------------+
|       Day|    Line_ID|Type|Start_ID|       Start_Station|      Start_Time|Stop_ID|        Stop_Station|       Stop_Time|
+----------+-----------+----+--------+--------------------+----------------+-------+--------------------+----------------+
|02.12.2019|85:3849:015|Tram| 8591101|Zürich, Bucheggplatz|02.12.2019 20:07|8591246|    Zürich, Laubiweg|02.12.2019 20:08|
|02.12.2019|85:3849:015|Tram| 8591246|    Zürich, Laubiweg|02.12.2019 20:09|8591335|Zürich, Schaffhau...|02.12.2019 20:10|
|02.12.2019|85:3849:015|Tram| 8591335|Zürich, Schaffhau...|02.12.2019 20:10|8591324|Zürich, Röslistrasse|02.12.2019 20:11|
|02.12.2019|85:3849:015|Tram| 8591324|Zürich, Röslistrasse|02.12.2019 20:11|8591298|Zürich, Ottikerst...|02.12.2019 20:12|
|02.12.2019|85:3849:015|Tram| 8591298|Zürich, Ottikerst...|02.12.2019 20:12|8591373|Zürich, Sonneggst...|02.12.2019 20:13|
|02.12.2019|85:3