# To Begin With...

### Name your spark application as `GASPAR_final` or `GROUP_NAME_final`.

<div class='alert alert-info'><b>Any application without a proper name would be promptly killed.</b></div>

In [118]:
%%configure
{"conf": {
    "spark.app.name": "datavirus_final"
}}

A session has already been started. If you intend to recreate the session with new configurations, please include the -f argument.


### Start Spark

In [1]:
# Initialization
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4227,application_1587988164357_0855,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f5ce35a1150>

### Read the [SBB actual data](https://opentransportdata.swiss/en/dataset/istdaten) in ORC format

In [2]:
sbb = spark.read.orc('/data/sbb/orc/istdaten')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
sbb.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- betriebstag: string (nullable = true)
 |-- fahrt_bezeichner: string (nullable = true)
 |-- betreiber_id: string (nullable = true)
 |-- betreiber_abk: string (nullable = true)
 |-- betreiber_name: string (nullable = true)
 |-- produkt_id: string (nullable = true)
 |-- linien_id: string (nullable = true)
 |-- linien_text: string (nullable = true)
 |-- umlauf_id: string (nullable = true)
 |-- verkehrsmittel_text: string (nullable = true)
 |-- zusatzfahrt_tf: string (nullable = true)
 |-- faellt_aus_tf: string (nullable = true)
 |-- bpuic: string (nullable = true)
 |-- haltestellen_name: string (nullable = true)
 |-- ankunftszeit: string (nullable = true)
 |-- an_prognose: string (nullable = true)
 |-- an_prognose_status: string (nullable = true)
 |-- abfahrtszeit: string (nullable = true)
 |-- ab_prognose: string (nullable = true)
 |-- ab_prognose_status: string (nullable = true)
 |-- durchfahrt_tf: string (nullable = true)

In [5]:
sbb.head(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(betriebstag=u'03.09.2018', fahrt_bezeichner=u'80:06____:17010:000', betreiber_id=u'80:06____', betreiber_abk=u'DB', betreiber_name=u'DB Regio AG', produkt_id=u'Zug', linien_id=u'17010', linien_text=u'RE', umlauf_id=u'', verkehrsmittel_text=u'RE', zusatzfahrt_tf=u'false', faellt_aus_tf=u'false', bpuic=u'8500090', haltestellen_name=u'Basel Bad Bf', ankunftszeit=u'', an_prognose=u'', an_prognose_status=u'PROGNOSE', abfahrtszeit=u'03.09.2018 05:45', ab_prognose=u'', ab_prognose_status=u'UNBEKANNT', durchfahrt_tf=u'false'), Row(betriebstag=u'03.09.2018', fahrt_bezeichner=u'80:06____:17012:000', betreiber_id=u'80:06____', betreiber_abk=u'DB', betreiber_name=u'DB Regio AG', produkt_id=u'Zug', linien_id=u'17012', linien_text=u'RE', umlauf_id=u'', verkehrsmittel_text=u'RE', zusatzfahrt_tf=u'false', faellt_aus_tf=u'false', bpuic=u'8500090', haltestellen_name=u'Basel Bad Bf', ankunftszeit=u'', an_prognose=u'', an_prognose_status=u'PROGNOSE', abfahrtszeit=u'03.09.2018 06:34', ab_prognose=u'',

### Read the station list data [BFKOORD_GEO](https://opentransportdata.swiss/en/cookbook/hafas-rohdaten-format-hrdf/#Abgrenzung)

In [6]:
metadata = spark.read.csv('/data/sbb/stations/bfkoordgeo.csv', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
metadata.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- StationID: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Remark: string (nullable = true)

In [8]:
metadata.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+---------+------+----------------+
|StationID|Longitude| Latitude|Height|          Remark|
+---------+---------+---------+------+----------------+
|  0000002|26.074412|44.446770|     0|       Bucuresti|
|  0000003| 1.811446|50.901549|     0|          Calais|
|  0000004| 1.075329|51.284212|     0|      Canterbury|
|  0000005|-3.543547|50.729172|     0|          Exeter|
|  0000007| 9.733756|46.922368|   744|Fideris, Bahnhof|
+---------+---------+---------+------+----------------+
only showing top 5 rows

In [9]:
from pyspark.sql.functions import col, radians, asin, sin, sqrt, cos
import math

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
df_stations = metadata.withColumn("dlon", radians(col("Longitude")) - math.radians(8.540192)) \
             .withColumn("dlat", radians(col("Latitude")) - math.radians(47.378177)) \
             .withColumn("Distance_from_Zurich", asin(sqrt( sin(col("dlat") / 2) ** 2 + math.cos(math.radians(47.378177))
                                               *cos(radians(col("Latitude"))) * sin(col("dlon") / 2) ** 2)) * 2 * 3963 * 5280) \
             .drop("dlon", "dlat") \
             .filter(col("Distance_from_Zurich")<15000)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
df_stations.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+---------+------+--------------------+--------------------+
|StationID|Longitude| Latitude|Height|              Remark|Distance_from_Zurich|
+---------+---------+---------+------+--------------------+--------------------+
|  0000176| 8.521961|47.351679|     0|Zimmerberg-Basist...|  10676.361524930166|
|  8502572| 8.513918|47.370293|   421|Zürich, Goldbrunn...|   7107.377290914878|
|  8503000| 8.540192|47.378177|   408|           Zürich HB|2.323101710999253...|
|  8503001| 8.488940|47.391481|   399|   Zürich Altstetten|  13572.481490959273|
|  8503003| 8.548466|47.366611|   411|  Zürich Stadelhofen|   4693.551820280705|
|  8503004| 8.561372|47.350124|   408|Zürich Tiefenbrunnen|   11506.98247693669|
|  8503006| 8.544115|47.411529|   442|     Zürich Oerlikon|  12218.830294000149|
|  8503007| 8.544636|47.418747|   442|      Zürich Seebach|  14856.992780652563|
|  8503009| 8.533588|47.347440|   409|  Zürich Wollishofen|  11343.522830011038|
|  8503010| 8.530805|47.3640

In [13]:
zurich_stations = list(df_stations.select('Remark').toPandas()['Remark'])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [32]:
df =sbb.where(sbb["HALTESTELLEN_NAME"].isin(zurich_stations))
df_trains=df.where(df['PRODUKT_ID']=='Zug')
#zurich_trips_day = zurich_trips.where(zurich_trips['BETRIEBSTAG']=='13.05.2019')  #typical data 13-17 May 2019

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

**BETRIEBSTAG**: date of the trip

**FAHRT_BEZEICHNER**: identifies the trip

**BETREIBER_ABK, BETREIBER_NAME**: operator (name will contain the full name, e.g. Schweizerische Bundesbahnen for SBB)

**PRODUKT_ID**: type of transport, e.g. train, bus

**LINIEN_ID**: for trains, this is the train number

**LINIEN_TEXT,VERKEHRSMITTEL_TEXT**: for trains, the service type (IC, IR, RE, etc.)

**ZUSATZFAHRT_TF**: boolean, true if this is an additional trip (not part of the regular schedule)

**FAELLT_AUS_TF**: boolean, true if this trip failed (cancelled or not completed)

**HALTESTELLEN_NAME**: name of the stop

**ANKUNFTSZEIT**: arrival time at the stop according to schedule

**AN_PROGNOSE**: actual arrival time (when AN_PROGNOSE_STATUS is GESCHAETZT)

**AN_PROGNOSE_STATUS**: look only at lines when this is GESCHAETZT. This indicates that AN_PROGNOSE is the measured time of arrival.

**ABFAHRTSZEIT**: departure time at the stop according to schedule

**AB_PROGNOSE**: actual departure time (when AN_PROGNOSE_STATUS is GESCHAETZT)

**AB_PROGNOSE_STATUS**: look only at lines when this is GESCHAETZT. This indicates that AB_PROGNOSE is the measured time of arrival.

**DURCHFAHRT_TF**: boolean, true if the transport does not stop there

In [120]:
simple_df = (
    df_trains
    .select('ABFAHRTSZEIT','ANKUNFTSZEIT','BETRIEBSTAG','FAHRT_BEZEICHNER','LINIEN_ID','BPUIC','HALTESTELLEN_NAME')
    .toDF('Departure_time','Arrival_time','Day','Trip_ID','Train_number', 'Station_ID', 'Station_Name')
    .dropDuplicates()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [134]:
ids = simple_df.groupBy('Trip_ID','Day').count()
ids = ids.where(ids['count']>1).select('Trip_ID').distinct()
df = simple_df.join(ids, "Trip_ID").orderBy('Trip_ID','Arrival_Time','Departure_Time')
df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+----------------+----------------+----------+------------+----------+---------------+
|        Trip_ID|  Departure_time|    Arrival_time|       Day|Train_number|Station_ID|   Station_Name|
+---------------+----------------+----------------+----------+------------+----------+---------------+
|85:11:10040:001|01.12.2019 22:15|01.12.2019 22:13|01.12.2019|       10040|   8503006|Zürich Oerlikon|
|85:11:10040:001|                |01.12.2019 22:21|01.12.2019|       10040|   8503000|      Zürich HB|
|85:11:10040:001|02.12.2019 22:15|02.12.2019 22:13|02.12.2019|       10040|   8503006|Zürich Oerlikon|
|85:11:10040:001|                |02.12.2019 22:21|02.12.2019|       10040|   8503000|      Zürich HB|
|85:11:10040:001|03.12.2019 22:15|03.12.2019 22:13|03.12.2019|       10040|   8503006|Zürich Oerlikon|
|85:11:10040:001|                |03.12.2019 22:21|03.12.2019|       10040|   8503000|      Zürich HB|
|85:11:10040:001|04.12.2019 22:15|04.12.2019 22:13|04.12.2019|       1004

In [135]:
train = df.where(df['Trip_ID']=='85:11:10040:001')
e = train.where(train['Day']=='02.12.2019')#.sort('Departure_Time',ascendent=True)
e.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+----------------+----------------+----------+------------+----------+---------------+
|        Trip_ID|  Departure_time|    Arrival_time|       Day|Train_number|Station_ID|   Station_Name|
+---------------+----------------+----------------+----------+------------+----------+---------------+
|85:11:10040:001|02.12.2019 22:15|02.12.2019 22:13|02.12.2019|       10040|   8503006|Zürich Oerlikon|
|85:11:10040:001|                |02.12.2019 22:21|02.12.2019|       10040|   8503000|      Zürich HB|
+---------------+----------------+----------------+----------+------------+----------+---------------+