In [177]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, countDistinct
from pyspark.sql import SparkSession

In [178]:
# Create (or reuse) the notebook's Spark session: local master, 8g of driver
# memory, application name "HistoricalRoadTracker".
# NOTE(review): with master "local", "spark.executor.cores" presumably does not
# change the number of worker threads — confirm whether "local[4]" was intended.
spark = (
    SparkSession.builder
    .master("local")
    .appName("HistoricalRoadTracker")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

In [179]:
# Read the road observations from CSV; the first line holds the column names
# and column types are inferred from the file contents.
df = spark.read.options(header=True, inferSchema=True).csv("all_roads.csv")

In [180]:
# historical 1: number of distinct roads each plate has been seen on.
# The aggregate is aliased directly instead of renaming Spark's auto-generated
# column afterwards: the generated name depends on the Spark version, and
# withColumnRenamed silently does nothing when the name does not match, which
# would leave "road_count" undefined for the orderBy below.
dfRoadCount = df.groupBy("plate").agg(countDistinct("road").alias("road_count"))

# keep the 100 plates with the highest number of distinct roads
dfRoadCount = dfRoadCount.orderBy(col("road_count").desc()).limit(100)
# show() prints only the first 20 of those rows by default
dfRoadCount.show()

+-----+----------+
|plate|road_count|
+-----+----------+
|VW258|         1|
|BW901|         1|
|FB448|         1|
|EK120|         1|
|TQ094|         1|
|IR383|         1|
|EI774|         1|
|VG478|         1|
|UX652|         1|
|JY143|         1|
|QG649|         1|
|VO509|         1|
|BO386|         1|
|KR572|         1|
|QX479|         1|
|SY844|         1|
|FM889|         1|
|GC683|         1|
|LW069|         1|
|FB553|         1|
+-----+----------+
only showing top 20 rows



In [181]:
# SPEED AND ACCELERATION FROM EACH CAR'S MOST RECENT READINGS

# Per-plate window with the newest reading first.
windowDept = Window.partitionBy("plate").orderBy(F.col("time").desc())

dfCalcs = (
    df
    # keep only the 3 most recent readings of every plate
    .withColumn("row", F.row_number().over(windowDept))
    .where(F.col("row") <= 3)
    # speed: absolute change in x relative to the preceding row of the window
    .withColumn("speed", F.abs(F.col("x") - F.lag("x", 1).over(windowDept)))
    # acceleration: speed of the preceding window row minus this row's speed
    .withColumn("acc", F.lag("speed", 1).over(windowDept) - F.col("speed"))
    # rows without a preceding row have null speed/acc; drop every row that
    # contains a null, then remove the helper column
    .na.drop()
    .drop("row")
)

dfCalcs.show()

+-----+----------+---------+---+---+-----+--------------------+---------+-----+---+
| road|road_speed|road_size|  x|  y|plate|                time|direction|speed|acc|
+-----+----------+---------+---+---+-----+--------------------+---------+-----+---+
|road0|       120|     1000|211|  1|AB574| 1.686406213437755E9|        1|    0|  0|
|road1|       120|     1000|240|  3|AC921|1.6864062615287898E9|        1|   38|-38|
|road2|       120|     1000|870|  5|AD953|1.6864062396293578E9|       -1|   66| -5|
|road1|       120|     1000|548|  9|AG129|1.6864062242716305E9|       -1|   93| -2|
|road4|       120|     1000|225|  1|AG604| 1.686406237883799E9|        1|    0|  0|
|road4|       120|     1000|487|  3|AQ481|1.6864062349618049E9|        1|   86|  1|
|road4|       120|     1000|232|  0|AQ887|1.6864062389233131E9|        1|   73| -3|
|road2|       120|     1000|608|  6|AS492|1.6864062400063767E9|       -1|    0|  0|
|road4|       120|     1000|858|  8|AS757|  1.68640622600074E9|       -1|   

In [182]:
# historical 2: per-road average speed, average time to cross and collision count

# Mean speed and mean road size per road, computed over the rows of dfCalcs.
# The aggregates are aliased directly rather than renaming generated columns.
dfStats = dfCalcs.groupBy("road").agg(
    F.avg("speed").alias("avg_speed"),
    F.avg("road_size").alias("road_size"),
)

# average time to cross = road size / average speed; road_size is only needed
# for this division and is dropped afterwards
dfStats = dfStats.withColumn(
    "avg_time_to_cross", F.col("road_size") / F.col("avg_speed")
).drop("road_size")

# a row with speed = 0 and acc = 0 is counted as a collision
dfCollisions = dfCalcs.filter((F.col("speed") == 0) & (F.col("acc") == 0))

# number of collision rows per road
dfCollisions = dfCollisions.groupBy("road").count().withColumnRenamed("count", "total_collisions")

# The left join keeps roads that have no collision rows, but gives them a null
# count; report those roads as 0 collisions instead of null.
dfStats = dfStats.join(dfCollisions, "road", "left").fillna(0, subset=["total_collisions"])

dfStats.collect()

[Row(road='road2', avg_speed=36.55263157894737, avg_time_to_cross=27.35781137508999, total_collisions=20),
 Row(road='road4', avg_speed=46.08108108108108, avg_time_to_cross=21.700879765395893, total_collisions=16),
 Row(road='road1', avg_speed=51.13157894736842, avg_time_to_cross=19.557385486361298, total_collisions=14),
 Row(road='road0', avg_speed=21.11764705882353, avg_time_to_cross=47.353760445682454, total_collisions=24),
 Row(road='road3', avg_speed=45.62162162162162, avg_time_to_cross=21.919431279620852, total_collisions=16)]

In [183]:
# historical 3: flag the start of every speeding episode of each car

# Per-plate window with the newest reading first.
windowDept = Window.partitionBy("plate").orderBy(col("time").desc())

# speed: absolute change in x relative to the preceding row of the window
# (the window is time-descending, so that row is the next reading in time)
dfSpeeds = df.withColumn("speed", F.abs(F.col("x") - F.lag("x", 1).over(windowDept)))

# Flag rows where the speed is above 120 while the chronologically previous
# speed was not, i.e. a new infraction begins.
# On a time-descending window the previous reading in time is the FOLLOWING
# row, so lead() is required here; lag() would look at the later reading and
# mark the end of an infraction instead of its start.
# A car's oldest reading has no previous speed; it is treated as 0 so that an
# infraction already in progress at the first reading is still flagged.
# NOTE(review): the data carries a per-road "road_speed" column (120 in the rows
# shown) — confirm whether the limit should be read from it instead of the literal.
dfSpeeds = dfSpeeds.withColumn(
    "change_in_speed",
    F.when(
        (F.col("speed") > 120)
        & (F.coalesce(F.lead("speed", 1).over(windowDept), F.lit(0)) <= 120),
        1,
    ).otherwise(0),
)

# drop rows containing nulls (the newest reading of each plate has no speed)
dfSpeeds = dfSpeeds.na.drop()

dfSpeeds.show(200)

+-----+----------+---------+---+---+-----+--------------------+---------+-----+---------------+
| road|road_speed|road_size|  x|  y|plate|                time|direction|speed|change_in_speed|
+-----+----------+---------+---+---+-----+--------------------+---------+-----+---------------+
|road0|       120|     1000|211|  1|AB574|1.6864062285327337E9|        1|    0|              0|
|road0|       120|     1000|211|  1|AB574| 1.686406213437755E9|        1|    0|              0|
|road0|       120|     1000|211|  1|AB574| 1.686406197492601E9|        1|    0|              0|
|road0|       120|     1000|211|  1|AB574|1.6864061967385561E9|        1|    0|              0|
|road0|       120|     1000|211|  1|AB574| 1.686406187751596E9|        1|    0|              0|
|road0|       120|     1000|138|  1|AB574|1.6864061809904993E9|        1|   73|              0|
|road0|       120|     1000| 69|  1|AB574| 1.686406176030237E9|        1|   69|              0|
|road1|       120|     1000|278|  3|AC92