In [13]:
from pyspark.sql import *
from delta import *
from pyspark.sql.functions import input_file_name

builder = SparkSession.builder.appName("topicos").config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

INPUT_PATH = "/home/felipe/code/topicos_dados/dados/"
BRONZE_PATH = "/home/felipe/code/deltalake/lake/bronze/"
SILVER_PATH = "/home/felipe/code/deltalake/lake/silver/"

def teste():
    return input_file_name()

sp = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/sp_micro").withColumn("inputFiles",input_file_name())
sp.show(50)
sp.columns


+-------------------+-------------------+--------------------+---------+-------+-------------------+--------------------+--------------------+
|           latitude|          longitude|       tempo_captura|id_onibus|      c|                lt0|                 lt1|          inputFiles|
+-------------------+-------------------+--------------------+---------+-------+-------------------+--------------------+--------------------+
|         -23.552871|-46.647738000000004|2024-01-31T01:39:09Z|    71212|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|file:///home/feli...|
|        -23.6532175| -46.74036099999999|2024-01-31T01:38:46Z|    71739|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|file:///home/feli...|
|        -23.6680755|         -46.748797|2024-01-31T01:38:37Z|    71361|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|file:///home/feli...|
|         -23.534974|        -46.6443435|2024-01-31T01:38:55Z|    71740|609F-10|TERM. PRINC. ISABEL|       CHÁC. SANTANA|file:///home/feli...|

['latitude',
 'longitude',
 'tempo_captura',
 'id_onibus',
 'c',
 'lt0',
 'lt1',
 'inputFiles']

In [12]:
sp = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/sp_micro").withColumn("inputFiles",input_file_name())
sp.select("inputFiles").show(5)



+--------------------+
|          inputFiles|
+--------------------+
|file:///home/feli...|
|file:///home/feli...|
|file:///home/feli...|
|file:///home/feli...|
|file:///home/feli...|
+--------------------+
only showing top 5 rows



In [6]:
# CWB 

from pyspark.sql import *
from delta import *
from pyspark.sql.functions import input_file_name

builder = SparkSession.builder.appName("topicos").config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

INPUT_PATH = "/home/felipe/code/topicos_dados/dados/"
BRONZE_PATH = "/home/felipe/code/deltalake/lake/bronze/"
SILVER_PATH = "/home/felipe/code/deltalake/lake/silver/"

def teste():
    return input_file_name()

sp = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"/home/felipe/code/topicos_dados/dados/df_micro/1706937873.114361.parquet").withColumn("inputFiles",input_file_name())
sp = sp.sort(sp["tempo_captura"],ascending=False)
sp.show(50)
sp.columns


+----------+----------+-------------+---------+----------+-------+----------+-----+--------------------+
| longitude|  latitude|tempo_captura|id_onibus|velocidade|sentido|   direcao|linha|          inputFiles|
+----------+----------+-------------+---------+----------+-------+----------+-----+--------------------+
| -47.64034| -15.61314|1706937874000|   117315|      NULL|    IDA|165.963756|     |file:///home/feli...|
|-48.031384|-16.022823|1706937862000|   228885|      NULL|   NULL|       0.0|     |file:///home/feli...|
|-48.032143|-16.023239|1706937860000|   232297|      NULL|   NULL|       0.0|     |file:///home/feli...|
|  -48.0327|-16.024405|1706937860000|   233030|      NULL|   NULL|       0.0|     |file:///home/feli...|
|-47.765854|-15.737126|1706937860000|   226947|      NULL|   NULL|       0.0|     |file:///home/feli...|
|-48.056816|-16.006245|1706937860000|   227897|      NULL|   NULL|       0.0|     |file:///home/feli...|
|-48.056725|-16.005779|1706937860000|   232271|      NU

['longitude',
 'latitude',
 'tempo_captura',
 'id_onibus',
 'velocidade',
 'sentido',
 'direcao',
 'linha',
 'inputFiles']

In [11]:
# CWB 

from pyspark.sql import *
from delta import *
from pyspark.sql.functions import input_file_name, udf, col
from pyspark.sql.types import StringType
from datetime import datetime, timedelta

builder = SparkSession.builder.appName("topicos").config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

INPUT_PATH = "/home/felipe/code/topicos_dados/dados/"
BRONZE_PATH = "/home/felipe/code/deltalake/lake/bronze/"
SILVER_PATH = "/home/felipe/code/deltalake/lake/silver/"


def addTimestampQueryTime(filenames):
    """ 
    Add the column "query_timestamp" indicating the timestamp
    
    file:///home/felipe/code/topicos_dados/dados/cb_micro/1706665101.9679544.parquet -> 1706665101.9679544
    """
    file_name = str(filenames)[54:-8]
    time = (datetime.fromtimestamp(float(file_name))).strftime("%Y-%m-%dT%H:%M:%SZ")
    return time

udf_teste = udf(addTimestampQueryTime,StringType())

sp = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/cb_micro").withColumn("inputFiles",input_file_name())
sp = sp.withColumn("teste",udf_teste(col("inputFiles")))
sp.show(20)
sp.schema



+---------+----------+----------+-------------+--------------+-----+----------+---------------+--------+------+--------+--------------------+--------------------+
|id_onibus|  latitude| longitude|tempo_captura|  tipo_veiculo|linha|  situacao|     situacao_2| sentido|tabela|adaptado|          inputFiles|               teste|
+---------+----------+----------+-------------+--------------+-----+----------+---------------+--------+------+--------+--------------------+--------------------+
|    JI859|-25.453483| -49.28948|        22:42|MICRO ESPECIAL|  762|NO HORÁRIO|REALIZANDO ROTA|   VOLTA|   2-2|       1|file:///home/feli...|2024-01-30T22:42:22Z|
|    DN603|-25.481223|-49.196893|        22:38|MICRO ESPECIAL|  463|NO HORÁRIO|REALIZANDO ROTA|     IDA|     2|       1|file:///home/feli...|2024-01-30T22:42:22Z|
|    DN608|-25.432205|  -49.2658|        22:42|MICRO ESPECIAL|  463|NO HORÁRIO|REALIZANDO ROTA|   VOLTA|     4|       1|file:///home/feli...|2024-01-30T22:42:22Z|
|    GB611|-25.514096|

StructType([StructField('id_onibus', StringType(), True), StructField('latitude', StringType(), True), StructField('longitude', StringType(), True), StructField('tempo_captura', StringType(), True), StructField('tipo_veiculo', StringType(), True), StructField('linha', StringType(), True), StructField('situacao', StringType(), True), StructField('situacao_2', StringType(), True), StructField('sentido', StringType(), True), StructField('tabela', StringType(), True), StructField('adaptado', StringType(), True), StructField('inputFiles', StringType(), False), StructField('teste', StringType(), True)])

In [3]:
# BSB

from pyspark.sql import *
from delta import *
from pyspark.sql.functions import input_file_name, udf, col
from pyspark.sql.types import StringType
from datetime import datetime, timedelta

builder = SparkSession.builder.appName("topicos").config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

INPUT_PATH = "/home/felipe/code/topicos_dados/dados/"
BRONZE_PATH = "/home/felipe/code/deltalake/lake/bronze/"
SILVER_PATH = "/home/felipe/code/deltalake/lake/silver/"


# def addTimestampQueryTime(filenames):
#     """ 
#     Add the column "query_timestamp" indicating the timestamp
    
#     file:///home/felipe/code/topicos_dados/dados/cb_micro/1706665101.9679544.parquet -> 1706665101.9679544
#     """
#     file_name = str(filenames)[54:-8]
#     time = (datetime.fromtimestamp(float(file_name))).strftime("%Y-%m-%dT%H:%M:%SZ")
#     return time

udf_teste = udf(addTimestampQueryTime,StringType())

sp = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"/home/felipe/coleta2/df/1707879540.8172524.parquet").withColumn("inputFiles",input_file_name())
# sp = sp.withColumn("teste",udf_teste(col("inputFiles")))
sp.show(20)
sp.schema



+---------+---------+-------------+---------+----------+-------+----------+-----+--------------------+
| latitude|longitude|tempo_captura|id_onibus|velocidade|sentido|   direcao|linha|          inputFiles|
+---------+---------+-------------+---------+----------+-------+----------+-----+--------------------+
|-47.90738|-15.73597|1707405026000|   340057|       0.0|  VOLTA|5.71059313|0.054|file:///home/feli...|
|-48.12336|-15.89946|1707535032000|   335681|       0.0|  VOLTA|104.036243|     |file:///home/feli...|
|-48.02328|-15.87568|1707605115000|   340014|       0.0|  VOLTA| 241.38954|     |file:///home/feli...|
|-48.10841|-15.92123|1707740529000|   340260|       0.0|  VOLTA|326.915147|     |file:///home/feli...|
| -48.1233| -15.8995|1707752939000|   336751|       0.0|    IDA|63.4349488|     |file:///home/feli...|
|-48.12349| -15.9002|1707803452000|   336726|       0.0|    IDA|318.814074|     |file:///home/feli...|
|-48.10838|-15.92085|1707831623000|   337625|       0.0|  VOLTA|57.449996

StructType([StructField('latitude', DoubleType(), True), StructField('longitude', DoubleType(), True), StructField('tempo_captura', LongType(), True), StructField('id_onibus', StringType(), True), StructField('velocidade', DoubleType(), True), StructField('sentido', StringType(), True), StructField('direcao', DoubleType(), True), StructField('linha', StringType(), True), StructField('inputFiles', StringType(), False)])

In [16]:
# Rio

from pyspark.sql import *
from delta import *
from pyspark.sql.functions import input_file_name, udf, col
from pyspark.sql.types import StringType
from datetime import datetime, timedelta

builder = SparkSession.builder.appName("topicos").config("spark.sql.extensions","io.delta.sql.DeltaSparkSessionExtension").config("spark.sql.catalog.spark_catalog","org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = configure_spark_with_delta_pip(builder).getOrCreate()

INPUT_PATH = "/home/felipe/code/topicos_dados/dados/"
BRONZE_PATH = "/home/felipe/code/deltalake/lake/bronze/"
SILVER_PATH = "/home/felipe/code/deltalake/lake/silver/"


def addTimestampQueryTime(filenames):
    """ 
    Add the column "query_timestamp" indicating the timestamp
    
    file:///home/felipe/code/topicos_dados/dados/cb_micro/1706665101.9679544.parquet -> 1706665101.9679544
    """
    file_name = str(filenames)[54:-8]
    time = (datetime.fromtimestamp(float(file_name))).strftime("%Y-%m-%dT%H:%M:%SZ")
    return time

udf_teste = udf(addTimestampQueryTime,StringType())

sp = spark.read.format("parquet").option("inferSchema","true").option("header","true").load(f"{INPUT_PATH}/rj_micro").withColumn("inputFiles",input_file_name())
sp = sp.withColumn("teste",udf_teste(col("inputFiles")))
sp.show(20)
sp.schema



+---------+---------+-------------+---------+----------+-----+--------------------+--------------------+
| latitude|longitude|tempo_captura|id_onibus|velocidade|linha|          inputFiles|               teste|
+---------+---------+-------------+---------+----------+-----+--------------------+--------------------+
|-22,91606|-43,23063|1706751706000|   A29134|        37|  711|file:///home/feli...|2024-01-31T22:43:00Z|
|-22,97236|-43,18777|1706751709000|   A29108|        35|  473|file:///home/feli...|2024-01-31T22:43:00Z|
|-22,89427|-43,19073|1706751705000|   A29185|        38|  350|file:///home/feli...|2024-01-31T22:43:00Z|
|-22,84925|-43,33784|1706751708000|   A29034|        29|  711|file:///home/feli...|2024-01-31T22:43:00Z|
|-22,89272|-43,21592|1706751705000|   A29182|        37|  209|file:///home/feli...|2024-01-31T22:43:00Z|
|-22,91143|-43,21638|1706751710000|   A29192|        25|  457|file:///home/feli...|2024-01-31T22:43:00Z|
|-22,89258|-43,23858|1706751709000|   A29018|         9

StructType([StructField('latitude', StringType(), True), StructField('longitude', StringType(), True), StructField('tempo_captura', StringType(), True), StructField('id_onibus', StringType(), True), StructField('velocidade', StringType(), True), StructField('linha', StringType(), True), StructField('inputFiles', StringType(), False), StructField('teste', StringType(), True)])