# Data Generation

This notebook contains the code to generate some of the files used by our project website

In [16]:
import os
import json
# Initial Spark
import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import explode
import pyspark.sql.types as T
from pyspark.sql.functions import lit, col, to_date

from pyspark.sql.functions import avg
from pyspark.sql.functions import stddev
from pyspark.sql.functions import count, countDistinct, concat, sum
from pyspark.sql.functions import percentile_approx
import pyspark.sql.functions as F

In [2]:
DATA_FOLDER = "/Users/giacomoorsi/MEGAsync Downloads/Trenitalia-GenMar2023"
file = "all.parquet"

In [3]:

spark = SparkSession.builder \
    .master("local[4]") \
    .appName("Trenitalia") \
    .getOrCreate()

# set driver memory to 4GB
spark.sparkContext._conf.setAll([('spark.driver.memory', '4g')])

# get sc 
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/22 20:25:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
df = spark.read.parquet(os.path.join(DATA_FOLDER, "parquet", file))
print("Number of rows: {}".format(df.count()))

[Stage 2:>                                                          (0 + 4) / 4]

Number of rows: 8316723


                                                                                

In [7]:
# load the dataset of the stops
stops = spark.read.csv(os.path.join(DATA_FOLDER, "stops.csv"), header=True, inferSchema=True)

stops_column_renamer = {
    "name": "stop_name",
    "lat": "stop_lat",
    "lon": "stop_lon",
    "station_id": "stop_id", 
    "name_short": "stop_name_short",
    "id_region": "stop_id_region",
}

for k, v in stops_column_renamer.items():
    stops = stops.withColumnRenamed(k, v)

In [8]:
# number of days
print("Number of days: ", df.select("date").distinct().count())

# first date
print("First date: ", df.select("date").distinct().orderBy("date").first().asDict()["date"])

# last date
print("Last date: ", df.select("date").distinct().orderBy("date", ascending=False).first().asDict()["date"])

# number of trains
print("Number of trains: ", df.select("train_number").distinct().count())

# number of stops
print("Number of stops: ", df.select("stop_name").distinct().count())

# number of train classes
print("Number of train classes: ", df.select("train_class").distinct().count())


                                                                                

Number of days:  90


                                                                                

First date:  2023-01-01


                                                                                

Last date:  2023-03-31


                                                                                

Number of trains:  11496


                                                                                

Number of stops:  2291
Number of train classes:  11


In [10]:
df.columns

['train_arrival_stop_name',
 'train_class',
 'train_cn',
 'train_dl',
 'train_number',
 'train_arrival_time',
 'oae',
 'train_oaz',
 'train_od',
 'train_oo',
 'train_departure_time',
 'train_ope',
 'train_opz',
 'train_departure_stop_name',
 'train_pr',
 'train_arrival_delay',
 'train_departure_delay',
 'sea',
 'train_sep',
 'train_sub',
 'day',
 'month',
 'year',
 'date',
 'stop_name',
 'stop_arrival_time',
 'stop_departure_time',
 'stop_arrival_delay',
 'stop_departure_delay']

# 1. Statistics for each station

For each distinct station, we want to obtain: 
1. Station name
2. Latitude, longitude
3. Average arrival delay
4. Median arrival delay
5. % of trains with delay > 3
6. % of trains with delay > 5
7. % of trains with delay > 10
8. Number of distinct train numbers that stopped 

In [33]:
# add column True if train had > 3 minutes of delay
data_stop = df \
    .withColumn("3m_delay", col("stop_arrival_delay") > 3)\
    .withColumn("5m_delay", col("stop_arrival_delay") > 5)\
    .withColumn("10m_delay", col("stop_arrival_delay") > 10)\
    .withColumn("train_id", concat(col("train_class"), col("train_number")))

In [34]:
data_stop = data_stop.groupBy("stop_name") \
    .agg(
        F.avg("stop_arrival_delay").alias("avg_arrival_delay"),
        F.percentile_approx("stop_arrival_delay", 0.5).alias("median_arrival_delay"),
        F.countDistinct("train_id").alias("count_trains"),
        F.count("train_id").alias("count_stops"),
        F.sum(F.col("3m_delay").cast("long")).alias("count_3m_delay"),
        F.sum(F.col("5m_delay").cast("long")).alias("count_5m_delay"),
        F.sum(F.col("10m_delay").cast("long")).alias("count_10m_delay"),
    )

In [35]:
print("Number of stops: ", data_stop.count())

[Stage 83:>                                                         (0 + 4) / 4]

Number of stops:  2291


                                                                                

In [36]:
print("Number of stops in (lat,long) dataset: ", stops.count())

Number of stops in (lat,long) dataset:  2962


In [37]:
data_stop_merged = data_stop.join(stops, on="stop_name", how="inner")
print("Number of stops in final dataset: ", data_stop_merged.count())

                                                                                

Number of stops in final dataset:  2109


In [38]:
data_stop_pandas = data_stop_merged.toPandas()
data_stop_pandas.head(100)

                                                                                

Unnamed: 0,stop_name,avg_arrival_delay,median_arrival_delay,count_trains,count_stops,count_3m_delay,count_5m_delay,count_10m_delay,stop_id,stop_name_short,stop_lat,stop_lon,stop_id_region
0,ABBASANTA,1.251249,0.0,29,1403,205.0,131.0,79.0,S12873,Abbasanta,40.128801,8.817733,20.0
1,ABBIATEGRASSO,2.978809,2.0,57,3736,1105.0,596.0,209.0,S01062,Abbiategrasso,45.400631,8.921305,1.0
2,ACQUAVIVA,3.013732,2.0,30,2052,721.0,327.0,61.0,S12209,Acquaviva,37.570258,13.674927,14.0
3,ACQUAVIVA DELLE FONTI,1.041216,1.0,36,2402,158.0,74.0,35.0,S11504,Acquaviva,40.892806,16.839826,16.0
4,ACQUEDOLCI-S.FRATELLO,0.272120,0.0,18,1203,93.0,57.0,22.0,S12026,Acquedolci S.F.,38.058459,14.587597,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,CANISTRO,4.800000,0.0,14,830,2.0,2.0,2.0,S08812,Canistro,,,
96,CANNETO SULL`OGLIO,6.860747,5.0,24,1987,1365.0,939.0,288.0,S01848,Canneto s/Oglio,45.150211,10.371470,1.0
97,CAPACI,9.750000,11.0,105,5461,7.0,6.0,5.0,S12145,Capaci,38.171794,13.232398,14.0
98,CARAVAGGIO,4.278503,2.0,43,2940,1087.0,709.0,262.0,S01602,Caravaggio,45.491158,9.637723,1.0


In [40]:
# save as csv
data_stop_pandas.to_csv(("data_stop.csv"), index=False)