# Data Generation

This notebook contains the code that generates some of the data files used by our project website.

In [1]:
import os
import json
# Initial Spark
import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import explode
import pyspark.sql.types as T
from pyspark.sql.functions import lit, col, to_date

from pyspark.sql.functions import avg
from pyspark.sql.functions import stddev
from pyspark.sql.functions import count, countDistinct, concat, sum
from pyspark.sql.functions import percentile_approx
import pyspark.sql.functions as F

In [2]:
# Root folder of the raw Trenitalia data.
# It can be overridden with the TRENITALIA_DATA_FOLDER environment variable so the
# notebook also runs on machines where the data lives elsewhere; the default is
# the original local path.
DATA_FOLDER = os.environ.get(
    "TRENITALIA_DATA_FOLDER",
    "/Users/giacomoorsi/MEGAsync Downloads/Trenitalia-GenMar2023",
)
# Name of the parquet file (inside DATA_FOLDER/parquet) holding the whole dataset.
file = "all.parquet"

In [3]:

# Create (or reuse) a local Spark session running on 4 threads.
#
# The driver memory (4 GB) is requested through the builder: writing it to
# `sparkContext._conf` after `getOrCreate()` has no effect, because the driver
# is already running by then.
# NOTE(review): if a session already exists in this kernel, `getOrCreate()`
# reuses it and the memory setting is not applied -- restart the kernel first.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Trenitalia")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Handle to the underlying SparkContext
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/18 18:31:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [97]:
# Read the complete train dataset from its parquet file and report its size.
parquet_path = os.path.join(DATA_FOLDER, "parquet", file)
df = spark.read.parquet(parquet_path)
print("Number of rows: {}".format(df.count()))

                                                                                

Number of rows: 8316723


In [98]:
# Load the stops dataset from CSV (first row is the header, types are inferred).
stops_csv_path = os.path.join(DATA_FOLDER, "stops.csv")
stops = spark.read.csv(stops_csv_path, header=True, inferSchema=True)

# Raw column name -> `stop_*` column name used in the rest of the notebook.
stops_column_renamer = dict(
    name="stop_name",
    lat="stop_lat",
    lon="stop_lon",
    station_id="stop_id",
    name_short="stop_name_short",
    id_region="stop_id_region",
)

# Apply the renames one at a time (a rename of a missing column is a no-op in Spark).
for old_name, new_name in stops_column_renamer.items():
    stops = stops.withColumnRenamed(old_name, new_name)

In [None]:
# Overview of the dataset: covered dates, then distinct trains, stops and classes.
distinct_dates = df.select("date").distinct()

# number of days
print("Number of days: ", distinct_dates.count())

# first date
earliest_row = distinct_dates.orderBy("date").first()
print("First date: ", earliest_row.asDict()["date"])

# last date
latest_row = distinct_dates.orderBy("date", ascending=False).first()
print("Last date: ", latest_row.asDict()["date"])

# number of distinct trains, stops and train classes (same order as before)
distinct_count_labels = [
    ("Number of trains: ", "train_number"),
    ("Number of stops: ", "stop_name"),
    ("Number of train classes: ", "train_class"),
]
for label, column_name in distinct_count_labels:
    print(label, df.select(column_name).distinct().count())


Number of days:  90
First date:  2023-01-01
Last date:  2023-03-31


                                                                                

Number of trains:  11496


                                                                                

Number of stops:  2291




Number of train classes:  11


                                                                                

In [None]:
# Display the column names of the train dataset.
df.columns

['train_arrival_stop_name',
 'train_class',
 'train_cn',
 'train_dl',
 'train_number',
 'train_arrival_time',
 'oae',
 'train_oaz',
 'train_od',
 'train_oo',
 'train_departure_time',
 'train_ope',
 'train_opz',
 'train_departure_stop_name',
 'train_pr',
 'train_arrival_delay',
 'train_departure_delay',
 'sea',
 'train_sep',
 'train_sub',
 'day',
 'month',
 'year',
 'date',
 'stop_name',
 'stop_arrival_time',
 'stop_departure_time',
 'stop_arrival_delay',
 'stop_departure_delay']

# 0. Preprocessing
As a preprocessing step, we remove all rows whose arrival delay at the stop is anomalous, i.e. outside the range [-100, 300] minutes. Rows whose delay is missing or not numeric are kept at this stage.

In [None]:
# Drop the rows whose arrival delay is numeric but outside [MIN_DELAY, MAX_DELAY].
# Rows whose delay cannot be cast to a number (the cast yields null) are kept.
MIN_DELAY = -100
MAX_DELAY = 300

arrival_delay = col("stop_arrival_delay").cast("double")
delay_is_missing = arrival_delay.isNull()
delay_in_range = (arrival_delay >= MIN_DELAY) & (arrival_delay <= MAX_DELAY)

df = df.filter(delay_is_missing | delay_in_range)


# 1. Statistics for each station

For each distinct station, we want to obtain: 
1. Station name
2. Latitude, longitude
3. Average arrival delay
4. Median arrival delay
5. % of trains with delay > 3 minutes
6. % of trains with delay > 5 minutes
7. % of trains with delay > 10 minutes
8. Number of distinct train numbers that stopped 

In [None]:
# Build the per-stop working dataset used by the station statistics:
#   * keep only rows whose arrival delay is numeric, and store it as a double;
#   * add boolean flags for delays above 3, 5 and 10;
#   * add the abbreviated day of week ("E" pattern, e.g. "Mon") of the date;
#   * add `train_id`, the concatenation of train class and train number.
#
# The stop coordinates in `stops` are not joined here: a previous
# `df.join(stops, ...)` result was overwritten right away and never used, and the
# aggregations in this section only read columns of `df`.
data_stop = (
    df
    .filter(col("stop_arrival_delay").cast("double").isNotNull())
    # cast into a temporary column, then swap it in place of the original one
    .withColumn("stop_arrival_delay_double", col("stop_arrival_delay").cast("double"))
    .drop("stop_arrival_delay")
    .withColumnRenamed("stop_arrival_delay_double", "stop_arrival_delay")
    .withColumn("3m_delay", col("stop_arrival_delay") > 3)
    .withColumn("5m_delay", col("stop_arrival_delay") > 5)
    .withColumn("10m_delay", col("stop_arrival_delay") > 10)
    .withColumn("day_of_week", F.date_format(F.col("date"), "E"))
    .withColumn("train_id", concat(col("train_class"), col("train_number")))
)

In [None]:
# Delay statistics per station: mean and approximate median arrival delay,
# number of distinct trains, number of recorded stops, and how many stops
# exceeded each delay threshold (boolean flags are cast to long and summed).
station_metrics = [
    F.avg("stop_arrival_delay").alias("avg_arrival_delay"),
    F.percentile_approx("stop_arrival_delay", 0.5).alias("median_arrival_delay"),
    F.countDistinct("train_id").alias("count_trains"),
    F.count("train_id").alias("count_stops"),
    F.sum(F.col("3m_delay").cast("long")).alias("count_3m_delay"),
    F.sum(F.col("5m_delay").cast("long")).alias("count_5m_delay"),
    F.sum(F.col("10m_delay").cast("long")).alias("count_10m_delay"),
]
data_stop_stat = data_stop.groupBy("stop_name").agg(*station_metrics)

In [None]:
# Number of stations that have at least one row in the aggregated statistics.
n_stops_with_stats = data_stop_stat.count()
print("Number of stops: ", n_stops_with_stats)

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:708)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:752)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:684)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:650)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:626)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:583)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:540)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)


Number of stops:  2202


                                                                                

In [None]:
# Number of rows in the stops (coordinates) dataset, for comparison.
n_stops_with_coordinates = stops.count()
print("Number of stops in (lat,long) dataset: ", n_stops_with_coordinates)



Number of stops in (lat,long) dataset:  2962


                                                                                

In [None]:
# Number of stations in the aggregated table that is exported below.
n_stops_final = data_stop_stat.count()
print("Number of stops in final dataset: ", n_stops_final)



Number of stops in final dataset:  2202


                                                                                

In [None]:
# Collect the aggregated per-station table on the driver as a pandas DataFrame.
data_stop_pandas = data_stop_stat.toPandas()

# Preview the first 100 stations.
data_stop_pandas.head(n=100)

                                                                                

Unnamed: 0,stop_name,avg_arrival_delay,median_arrival_delay,count_trains,count_stops,count_3m_delay,count_5m_delay,count_10m_delay
0,ABBASANTA,1.251249,0.0,29,1401,205,131,79
1,ABBIATEGRASSO,2.978809,2.0,57,3728,1105,596,209
2,ACQUAVIVA,3.013732,2.0,30,2039,721,327,61
3,ACQUAVIVA DELLE FONTI,1.041216,1.0,36,2402,158,74,35
4,ACQUEDOLCI-S.FRATELLO,0.272120,0.0,18,1198,93,57,22
...,...,...,...,...,...,...,...,...
95,CAMPODARSEGO,6.250000,5.0,8,8,6,2,1
96,CAMPOLIGURE MASONE,4.531998,3.0,24,1797,848,471,147
97,CAMUCIA CORTONA,3.785942,2.0,54,3770,1459,826,269
98,CANDELA,4.684211,3.0,8,532,255,175,43


In [None]:
# Save the per-station statistics as CSV (without the pandas index).
# The output directory is created first: `to_csv` fails when the parent
# directory does not exist.
os.makedirs("dataset_generated/data_stop", exist_ok=True)
data_stop_pandas.to_csv("dataset_generated/data_stop/data_stop.csv", index=False)

### 1b. Statistics for each day of week

In [None]:
# Same delay statistics as the per-station table, split by day of week.
stops_by_day = data_stop.groupBy("stop_name", "day_of_week")

avg_delay = F.avg("stop_arrival_delay").alias("avg_arrival_delay")
median_delay = F.percentile_approx("stop_arrival_delay", 0.5).alias("median_arrival_delay")
n_trains = F.countDistinct("train_id").alias("count_trains")
n_stops = F.count("train_id").alias("count_stops")
# boolean flags are cast to long so that summing them counts the delayed stops
n_over_3m = F.sum(F.col("3m_delay").cast("long")).alias("count_3m_delay")
n_over_5m = F.sum(F.col("5m_delay").cast("long")).alias("count_5m_delay")
n_over_10m = F.sum(F.col("10m_delay").cast("long")).alias("count_10m_delay")

data_stop_stat = stops_by_day.agg(
    avg_delay, median_delay, n_trains, n_stops, n_over_3m, n_over_5m, n_over_10m
)

In [None]:
# Print a preview of the per-station, per-day-of-week statistics.
data_stop_stat.show()

# Collect the aggregated table on the driver as a pandas DataFrame.
data_stop_stat_pandas = data_stop_stat.toPandas()

                                                                                

+--------------------+-----------+--------------------+--------------------+------------+-----------+--------------+--------------+---------------+
|           stop_name|day_of_week|   avg_arrival_delay|median_arrival_delay|count_trains|count_stops|count_3m_delay|count_5m_delay|count_10m_delay|
+--------------------+-----------+--------------------+--------------------+------------+-----------+--------------+--------------+---------------+
|         ABANO TERME|        Mon|  2.1349862258953167|                 2.0|          28|        363|            76|            51|             18|
|         ABANO TERME|        Tue|  1.3961218836565097|                 2.0|          28|        361|            75|            44|              6|
|         ABANO TERME|        Wed|  1.6565096952908587|                 2.0|          28|        361|            82|            52|              8|
|     ABBADIA LARIANA|        Tue|              1.0875|                 1.0|          25|        320|           

                                                                                

In [None]:
# Write one CSV per day of week with the per-station statistics of that day.
# The output directory is created first: `to_csv` fails when the parent
# directory does not exist.
os.makedirs("dataset_generated/data_stop", exist_ok=True)
for day in data_stop_stat_pandas["day_of_week"].unique():
    day_rows = data_stop_stat_pandas[data_stop_stat_pandas["day_of_week"] == day]
    day_rows.to_csv("dataset_generated/data_stop/data_stop_{}.csv".format(day), index=False)
    

### 1c. Statistics for each train type

In [None]:
# Same delay statistics as the per-station table, split by train class.
class_metrics = (
    F.avg("stop_arrival_delay").alias("avg_arrival_delay"),
    F.percentile_approx("stop_arrival_delay", 0.5).alias("median_arrival_delay"),
    F.countDistinct("train_id").alias("count_trains"),
    F.count("train_id").alias("count_stops"),
    # boolean flags are cast to long so that summing them counts the delayed stops
    F.sum(F.col("3m_delay").cast("long")).alias("count_3m_delay"),
    F.sum(F.col("5m_delay").cast("long")).alias("count_5m_delay"),
    F.sum(F.col("10m_delay").cast("long")).alias("count_10m_delay"),
)
data_stop_stat = data_stop.groupBy("stop_name", "train_class").agg(*class_metrics)

In [None]:
# Print a preview of the per-station, per-train-class statistics.
data_stop_stat.show()

# Collect the aggregated table on the driver as a pandas DataFrame.
data_stop_stat_pandas = data_stop_stat.toPandas()

                                                                                

+--------------------+-----------+--------------------+--------------------+------------+-----------+--------------+--------------+---------------+
|           stop_name|train_class|   avg_arrival_delay|median_arrival_delay|count_trains|count_stops|count_3m_delay|count_5m_delay|count_10m_delay|
+--------------------+-----------+--------------------+--------------------+------------+-----------+--------------+--------------+---------------+
|           ABBASANTA|        REG|   1.251249107780157|                 0.0|          29|       1401|           205|           131|             79|
|    ABBIATE GUAZZONE|        REG|0.001497902935889...|                 0.0|          61|       3338|             0|             0|              0|
|ACQUAVIVA DELLE F...|        REG|  1.0412156536219817|                 1.0|          36|       2402|           158|            74|             35|
|         ACQUI TERME|        REG|   3.531089978054133|                 3.0|          41|       2734|           

                                                                                

In [None]:
# Write one CSV per train class with the per-station statistics of that class.
# The output directory is created first: `to_csv` fails when the parent
# directory does not exist.
os.makedirs("dataset_generated/data_stop", exist_ok=True)
for train_class in data_stop_stat_pandas["train_class"].unique():
    class_rows = data_stop_stat_pandas[data_stop_stat_pandas["train_class"] == train_class]
    class_rows.to_csv("dataset_generated/data_stop/data_stop_class_{}.csv".format(train_class), index=False)
    

### 1d. Statistics for each week day and train type


In [None]:
# Same delay statistics, split by both train class and day of week.
grouping_columns = ("stop_name", "train_class", "day_of_week")
stops_by_class_and_day = data_stop.groupBy(*grouping_columns)

data_stop_stat = stops_by_class_and_day.agg(
    # central tendency of the arrival delay (the median is approximate)
    F.avg("stop_arrival_delay").alias("avg_arrival_delay"),
    F.percentile_approx("stop_arrival_delay", 0.5).alias("median_arrival_delay"),
    # distinct trains, and total recorded stops with a train id
    F.countDistinct("train_id").alias("count_trains"),
    F.count("train_id").alias("count_stops"),
    # stops above each delay threshold (boolean flag -> long, then summed)
    F.sum(F.col("3m_delay").cast("long")).alias("count_3m_delay"),
    F.sum(F.col("5m_delay").cast("long")).alias("count_5m_delay"),
    F.sum(F.col("10m_delay").cast("long")).alias("count_10m_delay"),
)

In [None]:
# Collect the per-station, per-class, per-day statistics as a pandas DataFrame.
data_stop_stat_pandas = data_stop_stat.toPandas()

                                                                                

In [None]:
# For each combination of day of week and train class, write a CSV with the
# per-station statistics. A file is written for every combination, including
# those with no rows (header-only file), exactly as before.
# The output directory is created first: `to_csv` fails when the parent
# directory does not exist.
os.makedirs("dataset_generated/data_stop", exist_ok=True)

# The set of train classes does not depend on the day: compute it once.
all_train_classes = data_stop_stat_pandas["train_class"].unique()

for day in data_stop_stat_pandas["day_of_week"].unique():
    # the day mask is shared by all the classes of this day
    is_day = data_stop_stat_pandas["day_of_week"] == day
    for train_class in all_train_classes:
        is_class = data_stop_stat_pandas["train_class"] == train_class
        data_stop_stat_pandas[is_day & is_class].to_csv(
            "dataset_generated/data_stop/data_stop_mix_{}_{}.csv".format(day, train_class),
            index=False,
        )


---