Transformações a serem realizadas

- [ ] Remoção de arquivos com horário de ping diferente do horário de requsição.
- [ ]  Ajustar hora em sp, diminuindo -3.
- [ ]  Em Curitiba, quando o campo “codigolinha” estiver “REC”, o ônibus não está em operação, logo, será removido.
- [ ] Ausência de valor no campo “linha” em BSB indica que não está em operação, logo deverá ser removido.
- [ ] Atualizar campos de horas e datas para ISO 8601  2024-02-24T13:05Z.
- [ ] Padronizar o sentido de operação da linha em SP e CWB para integers 1 = ida 2=  volta.
- [ ] Padronizar os identificadores de ônibus CUR_idOnibus.
- [ ] Add nome dos arquivos para um campo algo como "query_timestamp"


In [5]:
from pyspark.sql import *
from delta import *

builder = SparkSession.builder.appName("topicos").config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

INPUT_PATH = "/home/felipe/code/topicos_dados/dados/"
BRONZE_PATH = "/home/felipe/code/deltalake/lake/bronze/"
SILVER_PATH = "/home/felipe/code/deltalake/lake/silver/"

print(spark.version)


24/03/11 20:28:35 WARN Utils: Your hostname, desktop resolves to a loopback address: 127.0.1.1; using 192.168.0.106 instead (on interface enp6s0)
24/03/11 20:28:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/felipe/.ivy2/cache
The jars for the packages stored in: /home/felipe/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8c4de1a1-1fe3-48e9-b257-680e33928625;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/felipe/.local/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 155ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-8c4de1a1-1fe3-48e9-b257-680e33928625
	confs: [default]
	0 artifacts copied, 3 already retrieved (0kB/3ms)
24/03/11 20:28:36

3.5.0


In [8]:
# São Paulo

from pyspark.sql.functions import udf, input_file_name, col
from pyspark.sql.types import StringType
from datetime import datetime, timedelta
from pyspark.sql import functions as F

# Commmon Functions
def changeBusIdSP(onibus_id):
    """
    Change the column "id_onibus" to the following pattern: CITY_id_onibus

    Example: SP_0881
    """
    return f"SPO_{onibus_id}"

def changeTimestamp(timestamp):
    """ 
    Change timestamp to ISO 8601 (2024-02-24T13:05Z) using GMT-3
    """
    datetime_object = datetime.fromisoformat(timestamp)
    return (datetime_object-timedelta(hours=3)).isoformat()

def addTimestampQueryTime(filenames):
    """ 
    Add the column "query_timestamp" indicating the timestamp
    
    file:///home/felipe/code/topicos_dados/dados/cb_micro/1706665101.9679544.parquet -> 1706665101.9679544
    """
    file_name = str(filenames)[54:-8]
    time = (datetime.fromtimestamp(float(file_name))).strftime("%Y-%m-%dT%H:%M:%SZ")
    return time

def checkInvalidTimeRows(df):
    
    df = df.withColumn("tempo_captura", F.to_timestamp("tempo_captura"))
    df = df.withColumn("file_timestamp", F.to_timestamp("file_timestamp"))

    timediff = F.abs(F.unix_timestamp("tempo_captura") - F.unix_timestamp("file_timestamp"))

    return df.filter(timediff >= 300)


# UDFS

udf_transformBusIdSpo = udf(changeBusIdSP,StringType())
udf_changeTimestamp = udf(changeTimestamp,StringType())
udf_addTimestampFile = udf(addTimestampQueryTime,StringType())

# Reading DF
sp = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/sp_micro").withColumn("inputFiles",input_file_name())

# Changing DF
sp = sp.withColumn("queried_at",udf_addTimestampFile(col("inputFiles")))
sp = sp.withColumn("bus_id",udf_transformBusIdSpo(col("id_onibus")))
sp = sp.withColumn("updated_at",udf_changeTimestamp(col("tempo_captura")))

# Droping columns
sp = sp.drop("inputFiles","id_onibus","tempo_captura")

sp.show()

 # TODO add checkinvalidtimerows


+-------------------+-------------------+-------+-------------------+--------------------+--------------------+---------+--------------------+
|           latitude|          longitude|      c|                lt0|                 lt1|            query_at|   bus_id|          updated_at|
+-------------------+-------------------+-------+-------------------+--------------------+--------------------+---------+--------------------+
|         -23.552871|-46.647738000000004|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|2024-01-30T22:39:15Z|SPO_71212|2024-01-30T22:39:...|
|        -23.6532175| -46.74036099999999|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|2024-01-30T22:39:15Z|SPO_71739|2024-01-30T22:38:...|
|        -23.6680755|         -46.748797|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|2024-01-30T22:39:15Z|SPO_71361|2024-01-30T22:38:...|
|         -23.534974|        -46.6443435|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|2024-01-30T22:39:15Z|SPO_71740|2024-01-30T22:38:...|

In [26]:
# Curitiba

from pyspark.sql.functions import udf, input_file_name, col
from pyspark.sql.types import StringType, IntegerType
from datetime import datetime, timedelta
from pyspark.sql import functions as F


def changeBusIdCWB(onibus_id):
    """
    Change the column "id_onibus" to the following pattern: CITY_id_onibus

    Example: SP_0881
    """
    return f"CWB_{onibus_id}"

def changeTimestampCwb(time,timestamp):
    """ 
    Change timestamp to ISO 8601 (2024-02-24T13:05Z) using GMT-3 for Curitiba

    This is the 'tempo_captura' field in Curitiba: "22:40"
    2024-01-30T
    """
    return f"{timestamp[:10]}{time}:00Z"

def addTimestampQueryTime(filenames):
    """ 
    Add the column "query_timestamp" indicating the timestamp
    
    file:///home/felipe/code/topicos_dados/dados/cb_micro/1706665101.9679544.parquet -> 1706665101.9679544
    """
    file_name = str(filenames)[54:-8]
    time = (datetime.fromtimestamp(float(file_name))).strftime("%Y-%m-%dT%H:%M:%SZ")
    return time

def checkInvalidTimeRows(df):
    
    df = df.withColumn("tempo_captura", F.to_timestamp("tempo_captura"))
    df = df.withColumn("file_timestamp", F.to_timestamp("file_timestamp"))

    timediff = F.abs(F.unix_timestamp("tempo_captura") - F.unix_timestamp("file_timestamp"))

    return df.filter(timediff >= 300)

def changeSentidoField(sentido):
    sentidoMap = {
        'IDA': 1,
        'VOLTA': 2
    }

    return sentidoMap[sentido] if sentido in list(sentidoMap.keys()) else 0

def removeInactiveBus(df):

    filtered_df = df.filter(df['linha']!="REC")
    return filtered_df



# UDFS

udf_transformBusIdCwb = udf(changeBusIdCWB,StringType())
udf_changeTimestamp = udf(changeTimestampCwb,StringType())
udf_addTimestampFile = udf(addTimestampQueryTime,StringType())
udf_changeSentido = udf(changeSentidoField,IntegerType())

# Reading DF

cwb = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/cb_micro").withColumn("inputFiles",input_file_name())

# Transformation

cwb = cwb.withColumn("queried_at",udf_addTimestampFile(col("inputFiles")))
cwb = cwb.withColumn("bus_id",udf_transformBusIdCwb(col("id_onibus")))
cwb = cwb.withColumn("updated_at",udf_changeTimestamp(col("tempo_captura"),col('queried_at')))
cwb = cwb.withColumn("bus_direction",udf_changeSentido(col("sentido")))

cwb = removeInactiveBus(cwb)

# Dropping

cwb = cwb.drop("tempo_captura","sentido","inputFiles","id_onibus")

cwb.show()

# TODO add check invalid 


+----------+----------+--------------+-----+----------+---------------+------+--------+--------------------+---------+-------------------+-------------+
|  latitude| longitude|  tipo_veiculo|linha|  situacao|     situacao_2|tabela|adaptado|          queried_at|   bus_id|         updated_at|bus_direction|
+----------+----------+--------------+-----+----------+---------------+------+--------+--------------------+---------+-------------------+-------------+
|-25.453483| -49.28948|MICRO ESPECIAL|  762|NO HORÁRIO|REALIZANDO ROTA|   2-2|       1|2024-01-30T22:42:22Z|CWB_JI859|2024-01-3022:42:00Z|            2|
|-25.481223|-49.196893|MICRO ESPECIAL|  463|NO HORÁRIO|REALIZANDO ROTA|     2|       1|2024-01-30T22:42:22Z|CWB_DN603|2024-01-3022:38:00Z|            1|
|-25.432205|  -49.2658|MICRO ESPECIAL|  463|NO HORÁRIO|REALIZANDO ROTA|     4|       1|2024-01-30T22:42:22Z|CWB_DN608|2024-01-3022:42:00Z|            2|
|-25.514096|-49.322986|    ARTICULADO|  040|NO HORÁRIO|REALIZANDO ROTA|     5|    

In [27]:
# Brasilia

from pyspark.sql.functions import udf, input_file_name, col
from pyspark.sql.types import StringType
from datetime import datetime, timedelta
from pyspark.sql import functions as F


def changeBusIdBSB(onibus_id):
    """
    Change the column "id_onibus" to the following pattern: CITY_id_onibus

    Example: SP_0881
    """
    return f"BSB_{onibus_id}"

def changeTimestampBsb(time,timestamp):
    """ 
    Change timestamp to ISO 8601 (2024-02-24T13:05Z) using GMT-3 for Curitiba

    This is the 'tempo_captura' field in Curitiba: "1701355514000"
    But the value isn't the same as the timestamp from querying.
    """
    return f"{timestamp[:10]}{time}:00Z"

def addTimestampQueryTime(filenames):
    """ 
    Add the column "query_timestamp" indicating the timestamp
    
    file:///home/felipe/code/topicos_dados/dados/cb_micro/1706665101.9679544.parquet -> 1706665101.9679544
    """
    file_name = str(filenames)[54:-8]
    time = (datetime.fromtimestamp(float(file_name))).strftime("%Y-%m-%dT%H:%M:%SZ")
    return time

def checkInvalidTimeRows(df):
    
    df = df.withColumn("tempo_captura", F.to_timestamp("tempo_captura"))
    df = df.withColumn("file_timestamp", F.to_timestamp("file_timestamp"))

    timediff = F.abs(F.unix_timestamp("tempo_captura") - F.unix_timestamp("file_timestamp"))

    return df.filter(timediff >= 300)

def changeSentidoField(sentido):
    sentidoMap = {
        'IDA': 1,
        'VOLTA': 2
    }

    return sentidoMap[sentido] if sentido in list(sentidoMap.keys()) else 0

def removeInactiveBus(df):

    filtered_df = df.filter(df['linha']!="")
    return filtered_df

# UDFS

udf_transformBusIdBSB = udf(changeBusIdBSB,StringType())
udf_changeTimestamp = udf(changeTimestampBsb,StringType())
udf_addTimestampFile = udf(addTimestampQueryTime,StringType())
udf_changeSentido = udf(changeSentidoField,IntegerType())

bsb = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/df_micro").withColumn("inputFiles",input_file_name())

bsb = bsb.withColumn("queried_at",udf_addTimestampFile(col("inputFiles")))
bsb = bsb.withColumn("updated_at",udf_changeTimestamp(col("tempo_captura"),col("queried_at")))
bsb = bsb.withColumn("bus_id",udf_transformBusIdBSB(col("id_onibus")))
bsb = bsb.withColumn("bus_direction",udf_changeSentido(col("sentido")))

bsb = removeInactiveBus(bsb)

# dropping 

bsb = bsb.drop("tempo_captura","sentido","inputFiles","id_onibus")

bsb.show()


+---------+---------+----------+----------+-----+--------------------+--------------------+----------+-------------+
|longitude| latitude|velocidade|   direcao|linha|          queried_at|          updated_at|    bus_id|bus_direction|
+---------+---------+----------+----------+-----+--------------------+--------------------+----------+-------------+
|-47.90188|-15.78581|     16.67|148.414433|0.882|2024-01-30T23:20:29Z|2024-01-301706665...|BSB_336921|            2|
|-47.95254| -15.8094|     18.61| 166.02643|0.813|2024-01-30T23:20:29Z|2024-01-301706667...|BSB_336408|            2|
|-47.96208|-15.86806|      6.67|200.556045|084.1|2024-01-30T23:20:29Z|2024-01-301706667...|BSB_336971|            1|
|-48.14928|-15.89348|     13.89|100.479616|0.373|2024-01-30T23:20:29Z|2024-01-301706667...|BSB_340065|            2|
|-47.89276| -15.8111|      6.94| 22.280791|0.373|2024-01-30T23:20:29Z|2024-01-301706667...|BSB_340103|            1|
|-48.04888|-15.84341|       7.5|208.338534|0.805|2024-01-30T23:2

In [32]:
# Rio de Janeiro

from pyspark.sql.functions import udf, input_file_name, col
from pyspark.sql.types import StringType
from datetime import datetime, timedelta
from pyspark.sql import functions as F


def changeBusIdRj(onibus_id):
    """
    Change the column "id_onibus" to the following pattern: CITY_id_onibus

    Example: SP_0881
    """
    return f"RJO_{onibus_id}"

def changeTimestampRj(timestamp):
    """ 
    Change timestamp to ISO 8601 (2024-02-24T13:05Z) using GMT-3 for Rio de janeiro

    This is the 'tempo_captura' field in Rio de Janeiro: "1701355514000"
    
    """
    time = (datetime.fromtimestamp(float(timestamp[:-3]))).strftime("%Y-%m-%dT%H:%M:%SZ")
    return time

def addTimestampQueryTime(filenames):
    """ 
    Add the column "query_timestamp" indicating the timestamp
    
    file:///home/felipe/code/topicos_dados/dados/cb_micro/1706665101.9679544.parquet -> 1706665101.9679544
    """
    file_name = str(filenames)[54:-8]
    time = (datetime.fromtimestamp(float(file_name))).strftime("%Y-%m-%dT%H:%M:%SZ")
    return time

def checkInvalidTimeRows(df):
    
    df = df.withColumn("tempo_captura", F.to_timestamp("tempo_captura"))
    df = df.withColumn("file_timestamp", F.to_timestamp("file_timestamp"))

    timediff = F.abs(F.unix_timestamp("tempo_captura") - F.unix_timestamp("file_timestamp"))

    return df.filter(timediff >= 300)


# UDFS

udf_transformBusIdRj = udf(changeBusIdRj,StringType())
udf_changeTimestamp = udf(changeTimestampRj,StringType())
udf_addTimestampFile = udf(addTimestampQueryTime,StringType())


rj = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/rj_micro").withColumn("inputFiles",input_file_name())

rj = rj.withColumn("queried_at",udf_addTimestampFile(col("inputFiles")))
rj = rj.withColumn("updated_at",udf_changeTimestamp(col("tempo_captura")))
rj = rj.withColumn("bus_id",udf_transformBusIdRj(col("id_onibus")))

# dropping

rj = rj.drop("tempo_captura","id_onibus","inputFiles")

rj.show()


+---------+---------+----------+-----+--------------------+--------------------+----------+
| latitude|longitude|velocidade|linha|          queried_at|          updated_at|    bus_id|
+---------+---------+----------+-----+--------------------+--------------------+----------+
|-22,91606|-43,23063|        37|  711|2024-01-31T22:43:00Z|2024-01-31T22:41:46Z|RJO_A29134|
|-22,97236|-43,18777|        35|  473|2024-01-31T22:43:00Z|2024-01-31T22:41:49Z|RJO_A29108|
|-22,89427|-43,19073|        38|  350|2024-01-31T22:43:00Z|2024-01-31T22:41:45Z|RJO_A29185|
|-22,84925|-43,33784|        29|  711|2024-01-31T22:43:00Z|2024-01-31T22:41:48Z|RJO_A29034|
|-22,89272|-43,21592|        37|  209|2024-01-31T22:43:00Z|2024-01-31T22:41:45Z|RJO_A29182|
|-22,91143|-43,21638|        25|  457|2024-01-31T22:43:00Z|2024-01-31T22:41:50Z|RJO_A29192|
|-22,89258|-43,23858|         9|  472|2024-01-31T22:43:00Z|2024-01-31T22:41:49Z|RJO_A29018|
|-22,84079|-43,30468|        27|  350|2024-01-31T22:43:00Z|2024-01-31T22:41:51Z|