In [18]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, countDistinct
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
import pyspark
import time
import findspark
findspark.init()

In [19]:
def init_spark():
    """Build the JDBC reader for the Redshift ``vasco`` table.

    Despite its name, this function does not return a SparkSession. It gets
    (or creates) the "RoadTracker" session and returns a configured
    ``DataFrameReader``; call ``.load()`` on the result to get the DataFrame.

    The JDBC URL (which carries the credentials) can be supplied through the
    ``ROADTRACKER_JDBC_URL`` environment variable. When the variable is not
    set, the URL embedded below is used, so existing callers keep working.

    Returns:
        A ``DataFrameReader`` with format, url, dbtable and tempdir set.
    """
    import os  # local import: only this function needs it

    # SECURITY: this fallback embeds a user name and password in source
    # control. Prefer setting ROADTRACKER_JDBC_URL; once every environment
    # does, this literal should be removed and the password rotated.
    default_url = "jdbc:redshift://roadtracker.cqgyzrqagvgs.us-east-1.redshift.amazonaws.com:5439/road-tracker?user=admin&password=roadTracker1"
    jdbc_url = os.environ.get("ROADTRACKER_JDBC_URL", default_url)

    session = SparkSession.builder.appName("RoadTracker").getOrCreate()

    return (
        session.read
        .format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", "vasco")
        .option("tempdir", "s3n://path/for/temp/data")
    )

def init_mongo():
    """Return the "RoadTracker" SparkSession configured for MongoDB writes.

    The session is configured to write to the ``collisionRisk`` collection of
    the ``roadtracker`` database on a MongoDB instance at 127.0.0.1, using the
    MongoDB Spark connector 10.1.1.

    NOTE(review): ``spark.jars.packages`` is normally only honoured when the
    Spark JVM is started; if a session already exists (for example one created
    by ``init_spark``) the connector jar may not be loaded - confirm the
    intended call order.

    Returns:
        The SparkSession obtained from ``SparkSession.builder.getOrCreate()``.
    """
    # use local
    mongo_conn = "mongodb://127.0.0.1"

    conf = SparkConf().set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1")
    # All write settings share the "spark.mongodb.write." prefix. The URI key
    # was previously "spark.write.connection.uri", which the connector does
    # not read, so the connection string was silently ignored.
    conf.set("spark.mongodb.write.connection.uri", mongo_conn)
    conf.set("spark.mongodb.write.database", "roadtracker")
    conf.set("spark.mongodb.write.collection", "collisionRisk")

    # builder.config(conf=...) hands every setting to the session; when a
    # session already exists the settings are applied to it instead of being
    # dropped.
    return (
        SparkSession.builder
        .config(conf=conf)
        .appName("RoadTracker")
        .getOrCreate()
    )

In [20]:
# Reference timestamp (starts at 0) and the look-back span subtracted from it
# when filtering recent records: 60, i.e. one minute back in time.
LastTimestamp, backInTime = 0, 60

# NOTE: despite the name, `spark` holds whatever init_spark() returns, which
# is a configured JDBC reader rather than a SparkSession.
spark = init_spark()

In [21]:
# `spark` is the JDBC reader returned by init_spark(); load() turns that
# reader into the DataFrame holding the full "vasco" table.
dfFull = spark.load()

DataFrame[road_id: string, speed_limit: int, road_size: int, x: int, y: int, plate: string, timestamp: string, direction: string]

In [22]:
# ############################################
# ------------ COLUMN PREPARATION ------------
# ############################################

# road_id is exposed as "road"; speed_limit is copied into "road_speed".
dfFull = dfFull.withColumnRenamed("road_id", "road")

# "time" is the numeric form of the string "timestamp" column. It is cast to
# double rather than float: a 32-bit float keeps only about 7 significant
# digits, so large (e.g. epoch-like) timestamps would collapse onto the same
# value and the newest-first ordering per plate below would become ambiguous.
dfFull = dfFull.withColumn("time", F.col("timestamp").cast("double"))
dfFull = dfFull.withColumn("x", F.col("x").cast("int"))
dfFull = dfFull.withColumn("y", F.col("y").cast("smallint"))
dfFull = dfFull.withColumn("road_speed", F.col("speed_limit").cast("int"))
dfFull = dfFull.withColumn("direction", F.col("direction").cast("smallint"))
dfFull = dfFull.withColumn("road_size", F.col("road_size").cast("int"))

# NOTE: Spark transformations are lazy. Every "Time to ..." figure printed
# below is measured before any action runs, so it covers building the query
# plan only; the actual work happens in CollisionRisk.show().
start_time = time.time()

# Lower bound of the analysed period: backInTime before LastTimestamp.
LimitTime = LastTimestamp - backInTime
print(f'LimitTime: {LimitTime}')

# Keep only the records newer than LimitTime. The comparison uses the numeric
# "time" column instead of the raw string "timestamp" column, so it does not
# depend on Spark's implicit string-to-number coercion.
dfNew = dfFull.filter(F.col("time") > LimitTime)

print(f'Time to filter: {time.time() - start_time}')

# get distinct plates
# plates = dfNew.select("plate").distinct()

# # get the last 3 records of each car in plates from dfFull
# dfNewRoad = dfFull.join(plates, "plate", "inner")

# Per-vehicle window, newest record first.
windowDept = Window.partitionBy("plate").orderBy(F.col("time").desc())

print(f'Time to window: {time.time() - start_time}')

# Keep the 3 most recent records of each car.
dfNew = (
    dfNew
    .withColumn("row", F.row_number().over(windowDept))
    .filter(F.col("row") <= 3)
)

print(f'Time to filter: {time.time() - start_time}')

# print(f'Size of batch: {dfNew.count()}')

# ############################################
# --------------- BASE ANALYSIS --------------
# ############################################

# -----------------------
# SPEED AND ACCELERATION

# The window is ordered newest-first, so lead(..., 1) is the record that
# immediately precedes the current one in time.
# Speed: absolute change in x between consecutive records of the same car.
df = dfNew.withColumn("speed", F.abs(F.col("x") - F.lead("x", 1).over(windowDept)))
# Acceleration: change in speed between consecutive records.
df = df.withColumn("acc", F.col("speed") - F.lead("speed", 1).over(windowDept))
# Drop rows with nulls. The oldest kept records of each car have no earlier
# record to diff against, so their speed and/or acc are null.
df = df.na.drop()

print(f'Time to speed and acc: {time.time() - start_time}')
# -----------------------

# -----------------------
# COLLISION RISK DATAFRAME
# Vehicles sharing a road and a y value, ordered by position x.
windowDept = Window.partitionBy("road", "y").orderBy("x")

# Projected position: x moved by (speed + acc), added when direction == 1 and
# subtracted otherwise.
projected_up = F.col("x") + F.col("speed") + F.col("acc")
projected_down = F.col("x") - F.col("speed") - F.col("acc")

# Projected position of the neighbour in the direction of travel: the next
# row (higher x) for direction == 1, the previous row (lower x) otherwise.
# These are null when there is no such neighbour, which yields risk 0.
neighbour_up = F.lead(projected_up, 1).over(windowDept)
neighbour_down = F.lag(projected_down, 1).over(windowDept)

# Risk is 1 when the projected position passes the neighbour's projection.
df = df.withColumn(
    "collision_risk",
    F.when(
        F.col("direction") == 1,
        F.when(projected_up > neighbour_up, 1).otherwise(0),
    ).otherwise(
        F.when(projected_down < neighbour_down, 1).otherwise(0)
    ),
)
print(f'Time to collision risk: {time.time() - start_time}')
# -----------------------

# ----------------------- PRIORITY!
# ANALYSIS 6: LIST OF VEHICLES WITH COLLISION RISK
# Plate and speed
CollisionRisk = df.filter(F.col("collision_risk") == 1) \
    .select("plate", "speed")
time_analysis6 = time.time() - start_time


print(f'Time to analysis 6: {time_analysis6}')
CollisionRisk.show()
# CollisionRisk.write.format("mongodb") \
#     .mode("overwrite") \
#     .option("database", "roadtracker") \
#     .option("collection", "analysis6") \
#     .save()

# -----------------------


LimitTime: -60
Time to filter: 0.009002685546875
Time to window: 0.02000284194946289
Time to filter: 0.03508496284484863
Time to speed and acc: 0.09600234031677246
Time to collision risk: 0.16900062561035156
Time to analysis 6: 0.19299912452697754


In [None]:
# Row count of the full frame. count() is a Spark action, so unlike the
# transformations in the earlier cell it actually executes the read.
dfFull.count()

1250604