# Spark Kernel for getting stations and walking connections



### Start Spark

In [1]:
%%configure
{"conf": {
    "spark.app.name": "datavirus_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6849,application_1589299642358_1346,pyspark,idle,Link,Link,
6858,application_1589299642358_1352,pyspark,idle,Link,Link,
6866,application_1589299642358_1360,pyspark,idle,Link,Link,
6867,application_1589299642358_1361,pyspark,idle,Link,Link,
6869,application_1589299642358_1363,pyspark,idle,Link,Link,
6871,application_1589299642358_1365,pyspark,busy,Link,Link,
6872,application_1589299642358_1366,pyspark,idle,Link,Link,
6875,application_1589299642358_1369,pyspark,idle,Link,Link,
6876,application_1589299642358_1370,pyspark,busy,Link,Link,
6877,application_1589299642358_1371,pyspark,idle,Link,Link,


In [2]:
# Initialization
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6880,application_1589299642358_1374,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f866993e850>

In [3]:
import pyspark.sql.functions as F
import math

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Read the station list data from [BFKOORD_GEO](https://opentransportdata.swiss/en/cookbook/hafas-rohdaten-format-hrdf/#Abgrenzung)

In [4]:
metadata = spark.read.csv('/data/sbb/stations/bfkoordgeo.csv', header=True)
metadata.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+---------+------+----------------+
|StationID|Longitude| Latitude|Height|          Remark|
+---------+---------+---------+------+----------------+
|  0000002|26.074412|44.446770|     0|       Bucuresti|
|  0000003| 1.811446|50.901549|     0|          Calais|
|  0000004| 1.075329|51.284212|     0|      Canterbury|
|  0000005|-3.543547|50.729172|     0|          Exeter|
|  0000007| 9.733756|46.922368|   744|Fideris, Bahnhof|
+---------+---------+---------+------+----------------+
only showing top 5 rows

In [5]:
df_stations = metadata.withColumn("dlon", F.radians(F.col("Longitude")) - math.radians(8.540192)) \
             .withColumn("dlat", F.radians(F.col("Latitude")) - math.radians(47.378177)) \
             .withColumn("Distance_from_Zurich", F.asin(F.sqrt( F.sin(F.col("dlat") / 2) ** 2 + math.cos(math.radians(47.378177))
                                               *F.cos(F.radians(F.col("Latitude"))) * F.sin(F.col("dlon") / 2) ** 2)) * 2 * 3963 * 5280) \
             .drop("dlon", "dlat") \
             .filter(F.col("Distance_from_Zurich")<15000)

#zurich_stations = list(df_stations.select('Remark').toPandas()['Remark'])
df_stations.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+---------+------+--------------------+--------------------+
|StationID|Longitude| Latitude|Height|              Remark|Distance_from_Zurich|
+---------+---------+---------+------+--------------------+--------------------+
|  0000176| 8.521961|47.351679|     0|Zimmerberg-Basist...|  10676.361524930166|
|  8502572| 8.513918|47.370293|   421|Zürich, Goldbrunn...|   7107.377290914878|
|  8503000| 8.540192|47.378177|   408|           Zürich HB|2.323101710999253...|
|  8503001| 8.488940|47.391481|   399|   Zürich Altstetten|  13572.481490959273|
|  8503003| 8.548466|47.366611|   411|  Zürich Stadelhofen|   4693.551820280705|
+---------+---------+---------+------+--------------------+--------------------+
only showing top 5 rows

### Read the station list data from the time-table stops

In [6]:
stop_metadata = spark.read.orc("hdfs:///data/sbb/timetables/orc/stops")
stop_metadata.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------------------+----------------+----------------+-------------+--------------+
|stop_id|           stop_name|        stop_lat|        stop_lon|location_type|parent_station|
+-------+--------------------+----------------+----------------+-------------+--------------+
|1322000|            Altoggio|46.1672513851495|  8.345807131427|             |              |
|1322001|        Antronapiana| 46.060121674738|8.11361957990831|             |              |
|1322002|              Anzola|45.9898698225697|8.34571729989858|             |              |
|1322003|              Baceno|46.2614983591677|8.31925293162473|             |              |
|1322004|Beura Cardezza, c...|46.0790618438814|8.29927439970313|             |              |
+-------+--------------------+----------------+----------------+-------------+--------------+
only showing top 5 rows

In [7]:
df_stops = stop_metadata.withColumn("dlon", F.radians(F.col("stop_lon")) - math.radians(8.540192)) \
                 .withColumn("dlat", F.radians(F.col("stop_lat")) - math.radians(47.378177)) \
                 .withColumn("Distance_from_Zurich", F.asin(F.sqrt( F.sin(F.col("dlat") / 2) ** 2 + math.cos(math.radians(47.378177))
                                               *F.cos(F.radians(F.col("stop_lat"))) * F.sin(F.col("dlon") / 2) ** 2)) * 2 * 3963 * 5280) \
                 .drop("dlon", "dlat") \
                 .filter(F.col("Distance_from_Zurich")<15000)
df_stops.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+--------------------+----------------+----------------+-------------+--------------+--------------------+
|     stop_id|           stop_name|        stop_lat|        stop_lon|location_type|parent_station|Distance_from_Zurich|
+------------+--------------------+----------------+----------------+-------------+--------------+--------------------+
|     8502495|Zürich Wollishofe...|47.3476976601166|8.53331248070737|             |              |  11260.511488118353|
|     8502572|Zürich, Goldbrunn...|47.3702920484894|8.51391785372053|             |              |   7107.551200745138|
|     8503000|           Zürich HB|47.3781762039461|8.54019357578468|             |      8503000P|  0.4861881116144193|
|8503000:0:10|           Zürich HB|47.3794536181612|8.54019357578468|             |      8503000P|  466.22604663935687|
|8503000:0:11|           Zürich HB|47.3795144466376|8.54019357578468|             |      8503000P|   488.4408337792945|
+------------+--------------------+-----

In [8]:
#proof that there are many rows for the same station (for example, Zurich HB)
df_stops.where(F.col('stop_name')=='Zürich HB').show(5) 

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+---------+----------------+----------------+-------------+--------------+--------------------+
|     stop_id|stop_name|        stop_lat|        stop_lon|location_type|parent_station|Distance_from_Zurich|
+------------+---------+----------------+----------------+-------------+--------------+--------------------+
|     8503000|Zürich HB|47.3781762039461|8.54019357578468|             |      8503000P|  0.4861881116144193|
|8503000:0:10|Zürich HB|47.3794536181612|8.54019357578468|             |      8503000P|  466.22604663935687|
|8503000:0:11|Zürich HB|47.3795144466376|8.54019357578468|             |      8503000P|   488.4408337792945|
|8503000:0:12|Zürich HB|47.3786020121232|8.54019357578468|             |      8503000P|  155.21655831042875|
|8503000:0:13|Zürich HB|47.3785411825942|8.54019357578468|             |      8503000P|   133.0014610545808|
+------------+---------+----------------+----------------+-------------+--------------+--------------------+
only showing top 5 

### Compare both dataframes to see which stations are missing

In [9]:
only_names_stations = df_stations.select('StationID').distinct()
only_names_stops =  df_stops.where(F.length('stop_id')==7).select('stop_id')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
missing_ids = only_names_stations.subtract(only_names_stops)
missing_stations = df_stations.join(missing_ids,on =['StationID'])
missing_stations.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+---------+---------+------+--------------------+--------------------+
|StationID|Longitude| Latitude|Height|              Remark|Distance_from_Zurich|
+---------+---------+---------+------+--------------------+--------------------+
|  0000176| 8.521961|47.351679|     0|Zimmerberg-Basist...|  10676.361524930166|
|  8503001| 8.488940|47.391481|   399|   Zürich Altstetten|  13572.481490959273|
|  8503006| 8.544115|47.411529|   442|     Zürich Oerlikon|  12218.830294000149|
|  8503007| 8.544636|47.418747|   442|      Zürich Seebach|  14856.992780652563|
|  8503015| 8.529359|47.393032|   425|    Zürich Wipkingen|  6050.3533490488735|
|  8503020| 8.517106|47.385195|   415|   Zürich Hardbrücke|   6257.735615033529|
|  8503069| 8.583204|47.351016|   524|       Zürich Rehalp|   14546.26629164279|
|  8503088| 8.539170|47.377431|   396|       Zürich HB SZU|    371.622725366178|
|  8530471| 8.546402|47.376858|   446|         Hochschulen|  1609.5264027953037|
|  8530472| 8.548920|47.3858

### Create CSV file for storing the stations 

In [11]:
%%spark -o df_stations -n -1

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
%%local
df_stations.to_csv("../data/Zurich_Stations.csv", index=False)

### Get the walking transfer times between different stations

In [13]:
station_pairs = df_stations.select('StationID','Longitude','Latitude','Height')
joinedDF = station_pairs.crossJoin(station_pairs).toDF('id1','lon1','lat1','h1','id2','lon2','lat2','h2')
joinedDF.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------+---------+---+-------+--------+---------+---+
|    id1|    lon1|     lat1| h1|    id2|    lon2|     lat2| h2|
+-------+--------+---------+---+-------+--------+---------+---+
|0000176|8.521961|47.351679|  0|0000176|8.521961|47.351679|  0|
|0000176|8.521961|47.351679|  0|8502572|8.513918|47.370293|421|
|0000176|8.521961|47.351679|  0|8503000|8.540192|47.378177|408|
|0000176|8.521961|47.351679|  0|8503001|8.488940|47.391481|399|
|0000176|8.521961|47.351679|  0|8503003|8.548466|47.366611|411|
+-------+--------+---------+---+-------+--------+---------+---+
only showing top 5 rows

In [41]:
from pyspark.sql.types import FloatType

distance = (
    joinedDF
        .withColumn("dlon", F.radians(F.col("lon1")) - F.radians(F.col("lon2")))
        .withColumn("dlat", F.radians(F.col("lat1")) - F.radians(F.col("lat2"))) 
        .withColumn("Distance", F.asin(F.sqrt( F.sin(F.col("dlat") / 2) ** 2 + F.cos(F.radians("lat2"))
                                           *F.cos(F.radians(F.col("lat1"))) * F.sin(F.col("dlon") / 2) ** 2)) * 2 * 3963 * 5280)  
        .filter(F.col("Distance")<500)
        .withColumn("dh", F.col("h2")-F.col("h1"))
        .withColumn("Distance(m)", F.sqrt(F.pow(F.col("Distance"),2)+F.pow(F.col("dh"),2)))
        .withColumn("Walking_time",F.round(60*(2+F.col("Distance")/50)).cast(FloatType()))
        .withColumn("speed",50-10*F.col("dh")/200)
        .withColumn("Transfer_time (s)", F.round(60*(2+(F.col("Distance(m)")/F.col("speed")).cast(FloatType()))))
        .drop("dlon", "dlat","dh","speed","Distance","Walking_time")
)

distance.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------+---------+---+-------+--------+---------+---+------------------+-----------------+
|    id1|    lon1|     lat1| h1|    id2|    lon2|     lat2| h2|       Distance(m)|Transfer_time (s)|
+-------+--------+---------+---+-------+--------+---------+---+------------------+-----------------+
|0000176|8.521961|47.351679|  0|0000176|8.521961|47.351679|  0|               0.0|            120.0|
|8502572|8.513918|47.370293|421|8502572|8.513918|47.370293|421|               0.0|            120.0|
|8503000|8.540192|47.378177|408|8503000|8.540192|47.378177|408|               0.0|            120.0|
|8503000|8.540192|47.378177|408|8503088|8.539170|47.377431|396|371.81641976561485|            561.0|
|8503000|8.540192|47.378177|408|8503446|8.541715|47.378846|480|454.67719547158157|            708.0|
|8503000|8.540192|47.378177|408|8587348|8.539338|47.377241|408| 401.8110520444312|            602.0|
|8503000|8.540192|47.378177|408|8587349|8.541742|47.377560|408| 444.6416307076613|         

In [42]:
%%spark -o distance -n -1

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
%%local
distance.to_csv("../data/Zurich_WalkingConnections.csv", index=False)

In [44]:
distance.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------+---------+---+-------+--------+---------+---+------------------+-----------------+
|    id1|    lon1|     lat1| h1|    id2|    lon2|     lat2| h2|       Distance(m)|Transfer_time (s)|
+-------+--------+---------+---+-------+--------+---------+---+------------------+-----------------+
|0000176|8.521961|47.351679|  0|0000176|8.521961|47.351679|  0|               0.0|            120.0|
|8502572|8.513918|47.370293|421|8502572|8.513918|47.370293|421|               0.0|            120.0|
|8503000|8.540192|47.378177|408|8503000|8.540192|47.378177|408|               0.0|            120.0|
|8503000|8.540192|47.378177|408|8503088|8.539170|47.377431|396|371.81641976561485|            561.0|
|8503000|8.540192|47.378177|408|8503446|8.541715|47.378846|480|454.67719547158157|            708.0|
+-------+--------+---------+---+-------+--------+---------+---+------------------+-----------------+
only showing top 5 rows