In [400]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
from pyspark.sql import SparkSession

#### **Functions**

- `select()` - seleciona colunas
- `filter()` - filtra linhas

In [401]:
spark = SparkSession.builder \
    .master("local") \
    .appName("RoadTracker") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.cores", "4") \
    .getOrCreate()

In [402]:
# load the data
df = spark.read.csv("all_roads.csv", header=True, inferSchema=True)

In [403]:
# analise 1
n_roads = df.select("road").distinct().count()
print("Number of roads: {}".format(n_roads))

Number of roads: 5


In [404]:
# analise 2
n_cars = df.select("plate").distinct().count()
print("Number of cars: {}".format(n_cars))

Number of cars: 376


In [405]:
# CALCULATE SPEED AND ACCELERATION

windowDept = Window.partitionBy("plate").orderBy("time")

df = df.withColumn("row",row_number().over(windowDept)) \
        .filter(col("row") <= 3)

# calculo da velocidade
df = df.withColumn("speed", F.col("x") - F.lag("x", 1).over(windowDept))

# make all values positive
df = df.withColumn("speed", F.abs(F.col("speed")))

# calculo da aceleracao
df = df.withColumn("acc", F.col("speed") - F.lag("speed", 1).over(windowDept))

# drop null values
df = df.na.drop()

# drop row column
df = df.drop("row")

In [406]:
# analise 3

# add a column for the cars over the speed limit
df = df.withColumn("over_speed_limit", F.when(F.col("speed") > F.col("road_speed"), 1).otherwise(0))

cars_over_speed_limit = df.filter(F.col("over_speed_limit") == 1) \
    .select("plate").distinct().count()

print("Number of cars over the speed limit: {}".format(cars_over_speed_limit))

Number of cars over the speed limit: 25


In [407]:
# analise 4
windowDept = Window.partitionBy("road", "y").orderBy("x")

# calcula o risco de colisao fazendo posicao + (velocidade * direcao)

# essa versao funciona, mas adiciona o risco de colisao em carros diferentes dependendo de forward ou backward
#df = df.withColumn("collision_risk",
                #F.when((F.col("x") + (F.col("speed") * F.col("direction"))) > (F.lag("x", -1).over(windowDept) + (F.lag("speed", -1).over(windowDept) * F.lag("direction", -1).over(windowDept))), 1).otherwise(0))

# essa versao padroniza           
df = df.withColumn("collision_risk",
                   F.when(F.col("direction") == 1,
                          F.when((F.col("x") + F.col("speed") + F.col("acc")) > (F.lag("x", -1).over(windowDept) + F.lag("speed", -1).over(windowDept) + F.lag("acc").over(windowDept)), 1).otherwise(0)) \
                   .otherwise(F.when((F.col("x") - F.col("speed") - F.col("acc")) < (F.lag("x", 1).over(windowDept) - F.lag("speed", 1).over(windowDept) - F.lag("acc", 1).over(windowDept)), 1).otherwise(0)))

df.show()

+-----+----------+---+---+-----+--------------------+---------+-----+----+----------------+--------------+
| road|road_speed|  x|  y|plate|                time|direction|speed| acc|over_speed_limit|collision_risk|
+-----+----------+---+---+-----+--------------------+---------+-----+----+----------------+--------------+
|road0|       120|245|  0|PQ888|  1.68635273702827E9|        1|   81|  -1|               0|             0|
|road0|       120|304|  0|EZ328|1.6863529667917545E9|        1|   79|   2|               0|             0|
|road0|       120|329|  0|FR314| 1.686352775019318E9|        1|   97| -19|               0|             0|
|road0|       120|180|  1|IB083|1.6863527141156032E9|        1|    0| -90|               0|             0|
|road0|       120|180|  1|JI520| 1.686352690013728E9|        1|    0| -79|               0|             0|
|road0|       120|180|  1|JX514|1.6863529015628111E9|        1|   38| -33|               0|             1|
|road0|       120|180|  1|UC307|1.686

In [408]:
# analise 5
CollectionOverSpeedLimit = df.filter(F.col("over_speed_limit") == 1) \
                        .select("plate", "speed", "collision_risk") \
                        .collect()

CollectionOverSpeedLimit

[Row(plate='ET433', speed=184, collision_risk=0),
 Row(plate='HP247', speed=244, collision_risk=0),
 Row(plate='GZ591', speed=139, collision_risk=1),
 Row(plate='IU587', speed=129, collision_risk=0),
 Row(plate='ID323', speed=215, collision_risk=0),
 Row(plate='NM411', speed=122, collision_risk=0),
 Row(plate='LV494', speed=292, collision_risk=0),
 Row(plate='YC366', speed=123, collision_risk=1),
 Row(plate='TY900', speed=229, collision_risk=0),
 Row(plate='GM508', speed=222, collision_risk=0),
 Row(plate='NZ969', speed=163, collision_risk=1),
 Row(plate='UO112', speed=160, collision_risk=1),
 Row(plate='EA748', speed=186, collision_risk=0),
 Row(plate='CY559', speed=192, collision_risk=0),
 Row(plate='JZ803', speed=195, collision_risk=0),
 Row(plate='GR947', speed=149, collision_risk=1),
 Row(plate='JC218', speed=130, collision_risk=0),
 Row(plate='DR897', speed=194, collision_risk=0),
 Row(plate='YQ410', speed=121, collision_risk=0),
 Row(plate='BY889', speed=222, collision_risk=0),


In [409]:
# analise 6
CollectionCollisionRisk = df.filter(F.col("collision_risk") == 1) \
                .select("plate", "speed") \
                .collect()
                
CollectionCollisionRisk

[Row(plate='JX514', speed=38),
 Row(plate='AS798', speed=64),
 Row(plate='VE532', speed=80),
 Row(plate='ON434', speed=100),
 Row(plate='UO532', speed=62),
 Row(plate='XR890', speed=42),
 Row(plate='GZ591', speed=139),
 Row(plate='QD121', speed=87),
 Row(plate='DI502', speed=78),
 Row(plate='VD703', speed=104),
 Row(plate='RE570', speed=85),
 Row(plate='JS190', speed=7),
 Row(plate='JK057', speed=51),
 Row(plate='OM442', speed=117),
 Row(plate='YC366', speed=123),
 Row(plate='DL870', speed=0),
 Row(plate='ZU140', speed=51),
 Row(plate='BN401', speed=78),
 Row(plate='CH216', speed=0),
 Row(plate='MF916', speed=98),
 Row(plate='NZ969', speed=163),
 Row(plate='GR449', speed=108),
 Row(plate='UO112', speed=160),
 Row(plate='OC533', speed=51),
 Row(plate='CD052', speed=61),
 Row(plate='YS335', speed=96),
 Row(plate='KG211', speed=77),
 Row(plate='BW650', speed=71),
 Row(plate='HB781', speed=41),
 Row(plate='LQ019', speed=61),
 Row(plate='TL030', speed=93),
 Row(plate='MS485', speed=0),
 Row