In [631]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, countDistinct
from pyspark.sql import SparkSession


In [632]:
# Build (or reuse, if one already exists) the Spark session for this notebook.
# NOTE(review): master "local" runs Spark with a single worker thread, while
# spark.executor.cores is set to 4 below -- confirm whether "local[4]" was meant.
spark = (
    SparkSession.builder
    .master("local")
    .appName("HistoricalRoadTracker")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

In [633]:
# Load the road observations: the first CSV line supplies the column names and
# the column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv("all_roads.csv")

In [634]:
# historical 1: number of distinct roads each plate has been seen on.
# The aggregate is aliased directly instead of being renamed afterwards: the
# auto-generated name of a countDistinct column can differ between Spark
# releases, and withColumnRenamed silently does nothing when the name it is
# given is not in the schema, which would make the orderBy below fail.
dfRoadCount = df.groupBy("plate").agg(countDistinct("road").alias("road_count"))

# keep only the 100 plates with the highest distinct-road count
dfRoadCount = dfRoadCount.orderBy(col("road_count").desc()).limit(100)
dfRoadCount.show()

+-----+----------+
|plate|road_count|
+-----+----------+
|SF428|         1|
|TW315|         1|
|JL736|         1|
|EP171|         1|
|PB094|         1|
|MD954|         1|
|ZZ212|         1|
|ZP226|         1|
|UN926|         1|
|GF380|         1|
|DT049|         1|
|LI295|         1|
|GB081|         1|
|NP659|         1|
|BK572|         1|
|HI030|         1|
|VF070|         1|
|OG955|         1|
|UX361|         1|
|ZP642|         1|
+-----+----------+
only showing top 20 rows



In [635]:
# CALCULATE SPEED AND ACCELERATION

# One window per vehicle, newest sample first. With this ordering the row that
# follows the current one (lead by 1) is the sample recorded just before it.
windowDept = Window.partitionBy("plate").orderBy(col("time").desc())

# speed: absolute change in x between a sample and the previous sample of the
# same plate. This is a per-sample displacement; it is not divided by the
# elapsed time between the two samples.
previous_x = F.lead("x", 1).over(windowDept)
dfCalcs = df.withColumn("speed", F.abs(F.col("x") - previous_x))

# acc: change in speed between a sample and the previous sample of the same plate
previous_speed = F.lead("speed", 1).over(windowDept)
dfCalcs = dfCalcs.withColumn("acc", F.col("speed") - previous_speed)

# Drop every row that has a null in any column. The oldest sample of each plate
# has no speed and the two oldest have no acc, so those rows are removed here;
# rows with a null in any of the original columns are removed as well.
dfCalcs = dfCalcs.na.drop()

dfCalcs.show()

+-----+----------+---------+---+---+-----+--------------------+---------+-----+---+
| road|road_speed|road_size|  x|  y|plate|                time|direction|speed|acc|
+-----+----------+---------+---+---+-----+--------------------+---------+-----+---+
|road3|       120|     1000|220|  2|AC272| 1.686422274283066E9|        1|    0|  0|
|road3|       120|     1000|220|  2|AC272|1.6864222536890728E9|        1|    0|  0|
|road3|       120|     1000|220|  2|AC272| 1.686422231388323E9|        1|    0|  0|
|road3|       120|     1000|220|  2|AC272|1.6864222095374138E9|        1|    0|  0|
|road3|       120|     1000|220|  2|AC272|  1.68642220938139E9|        1|    0|  0|
|road3|       120|     1000|220|  2|AC272|1.6864222092032158E9|        1|    0|-35|
|road3|       120|     1000|220|  2|AC272|1.6864222087273128E9|        1|   35|-26|
|road3|       120|     1000|185|  2|AC272|1.6864221860634181E9|        1|   61| -1|
|road3|       120|     1000|331|  9|AU218| 1.686422286230809E9|       -1|   

In [646]:
# historical 2: per-road average speed, average time to cross, collision count

# per-road mean of speed and of road_size
dfStats = dfCalcs.groupBy("road").agg(
    F.avg("speed").alias("avg_speed"),
    F.avg("road_size").alias("road_size"),
)

# average time to cross = road size / average speed; road_size is only needed
# for this ratio and is dropped afterwards
dfStats = dfStats.withColumn(
    "avg_time_to_cross", F.col("road_size") / F.col("avg_speed")
).drop("road_size")

# rows where speed = 0 and acc = 0 are treated as collisions; every such sample
# row is counted, so this is a count of rows rather than of distinct incidents
dfCollisions = (
    dfCalcs.filter((F.col("speed") == 0) & (F.col("acc") == 0))
    .groupBy("road")
    .agg(F.count("*").alias("total_collisions"))
)

# Left join keeps roads that have no collision rows; for those the joined count
# is null, so it is replaced with 0 to keep total_collisions a usable number.
dfStats = dfStats.join(dfCollisions, "road", "left").na.fill(
    0, subset=["total_collisions"]
)

dfStats.collect()

[Row(road='road2', avg_speed=35.96995708154506, avg_time_to_cross=27.8009784035318, total_collisions=96),
 Row(road='road4', avg_speed=53.53777777777778, avg_time_to_cross=18.678399468703304, total_collisions=61),
 Row(road='road1', avg_speed=42.77439024390244, avg_time_to_cross=23.378474697077692, total_collisions=59),
 Row(road='road0', avg_speed=32.415929203539825, avg_time_to_cross=30.849030849030846, total_collisions=87),
 Row(road='road3', avg_speed=60.224, avg_time_to_cross=16.604675876726887, total_collisions=30)]

In [648]:
# historical 3: plates with repeated speeding infractions in the last t seconds

# Two per-plate windows over time, newest-first and oldest-first, so each row
# can be numbered from both ends of a plate's history.
windowDept = Window.partitionBy("plate").orderBy(col("time").desc())
windowDept2 = Window.partitionBy("plate").orderBy(col("time").asc())

# row: 1 is the most recent sample of the plate in dfCalcs
# row2: 1 is the oldest sample of the plate in dfCalcs
dfCalcs = dfCalcs.withColumn("row", row_number().over(windowDept)).withColumn(
    "row2", row_number().over(windowDept2)
)

# A row starts a new infraction when its speed exceeds the road's limit and
# either the previous sample of the same plate was within its limit, or the row
# is the plate's oldest sample (the vehicle is already speeding when first seen).
is_speeding = F.col("speed") > F.col("road_speed")
previous_within_limit = (
    F.lead("speed", 1).over(windowDept) <= F.lead("road_speed", 1).over(windowDept)
)
is_oldest_sample = F.col("row2") == 1

dfSpeeds = dfCalcs.withColumn(
    "change_in_speed",
    F.when(is_speeding & (previous_within_limit | is_oldest_sample), 1).otherwise(0),
)

# chosen T (change it after testing)
t = 25000

# Keep only rows younger than t: the current Unix time in seconds minus the
# row's time value must be below t. Despite its name, past_time holds "now".
current_seconds = F.unix_timestamp(F.current_timestamp()).cast("double")
dfSpeeds = (
    dfSpeeds.withColumn("past_time", current_seconds)
    .withColumn("diff_time", F.col("past_time") - F.col("time"))
    .filter(F.col("diff_time") < t)
)

# total infractions per plate, keeping plates with 10 or more
dfInfractions = (
    dfSpeeds.groupBy("plate")
    .agg(F.sum("change_in_speed").alias("total_infractions"))
    .filter(F.col("total_infractions") >= 10)
)

dfInfractions.show()

+-----+-----------------+
|plate|total_infractions|
+-----+-----------------+
+-----+-----------------+



In [638]:
# alternative analysis
