In [1]:
pip install "flask<2.3,>=2.2" python-dotenv pyspark boto3

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this pipeline. The hadoop-aws and
# AWS SDK bundle artifacts are requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("XP ETL Pipeline")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262")
    .getOrCreate()
)

from pyspark.sql.functions import col, from_unixtime, avg, count
from dotenv import load_dotenv

In [3]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region = os.getenv("S3_REGION")

# Fail fast when a setting is absent or empty. Otherwise a missing region would
# be formatted into the S3 endpoint as the literal text "None", and missing keys
# would only surface later as an opaque error from the Hadoop/S3A layer.
# Only the variable names are reported, never their values.
_required_env = {
    "AWS_ACCESS_KEY_ID": aws_access_key,
    "AWS_SECRET_ACCESS_KEY": aws_secret_key,
    "S3_REGION": aws_region,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variables: {', '.join(_missing_env)}"
    )

In [4]:
# S3A connector options, applied to the session's Hadoop configuration in the
# order listed: credentials, regional endpoint, SSL on, path-style access on.
s3a_settings = {
    "fs.s3a.access.key": aws_access_key,
    "fs.s3a.secret.key": aws_secret_key,
    "fs.s3a.endpoint": f"s3.{aws_region}.amazonaws.com",
    "fs.s3a.connection.ssl.enabled": "true",
    "fs.s3a.path.style.access": "true",
}

hadoop_conf = spark._jsc.hadoopConfiguration()
for option_name, option_value in s3a_settings.items():
    hadoop_conf.set(option_name, option_value)

In [5]:
bronze_path = "s3a://xp-etl-pipeline/raw/kafka/ipca/postgres-postg_ipca/partition=0/"

# Bronze layer: read the raw JSON records from S3, preview them, and persist
# an unmodified copy as Parquet.
try:
    df_bronze = spark.read.json(bronze_path)
    print("Bronze!")
    df_bronze.show()
    df_bronze.write.mode("overwrite").parquet("s3a://xp-etl-pipeline/processed-data/ipca/1 - bronze/")
except Exception as e:
    print(f"Error accessing S3: {e}")
    # Re-raise so the run stops here. Swallowing the error would let later
    # cells continue with df_bronze undefined or left over from a previous run.
    raise

Bronze!
+--------------------+--------------------+
|             payload|              schema|
+--------------------+--------------------+
|{6.22, 1181865600...|{[{CompraManha, N...|
|{6.28, 1181779200...|{[{CompraManha, N...|
|{6.25, 1181692800...|{[{CompraManha, N...|
|{6.15, 1181520000...|{[{CompraManha, N...|
|{6.19, 1181088000...|{[{CompraManha, N...|
|{6.45, 1184889600...|{[{CompraManha, N...|
|{6.38, 1184544000...|{[{CompraManha, N...|
|{6.38, 1184284800...|{[{CompraManha, N...|
|{6.45, 1184025600...|{[{CompraManha, N...|
|{6.45, 1183939200...|{[{CompraManha, N...|
|{6.51, 1183680000...|{[{CompraManha, N...|
|{6.35, 1183507200...|{[{CompraManha, N...|
|{6.32, 1183420800...|{[{CompraManha, N...|
|{6.32, 1183075200...|{[{CompraManha, N...|
|{6.33, 1182988800...|{[{CompraManha, N...|
|{6.41, 1182902400...|{[{CompraManha, N...|
|{6.47, 1182816000...|{[{CompraManha, N...|
|{6.28, 1182729600...|{[{CompraManha, N...|
|{6.19, 1182384000...|{[{CompraManha, N...|
|{6.13, 1182297600...|{[

In [6]:
# Silver layer: flatten the `payload` struct, drop duplicate rows, format the
# epoch columns as strings, zero-fill missing prices, then write to S3.

# The flattened frame gets its own name instead of rebinding df_bronze, so this
# cell can be re-run: df_bronze keeps its `payload` column between runs.
df_payload = df_bronze.select("payload.*")

df_silver = df_payload.dropDuplicates()

# The epoch columns are divided by 1000 before from_unixtime, which takes
# seconds; presumably the source values are milliseconds (TODO confirm).
# NOTE(review): from_unixtime formats in the Spark session time zone. Confirm
# that zone is the intended one, especially for the date-only columns.
df_silver = (
    df_silver
    .withColumn("Data_Vencimento", from_unixtime(col("Data_Vencimento") / 1000, "yyyy-MM-dd"))
    .withColumn("Data_Base", from_unixtime(col("Data_Base") / 1000, "yyyy-MM-dd"))
    .withColumn("dt_update", from_unixtime(col("dt_update") / 1000, "yyyy-MM-dd HH:mm:ss"))
)

# Nulls in the price columns are replaced with 0.
# NOTE(review): any average later computed over these columns will include
# those zeros. Confirm that is the intended treatment of missing prices.
df_silver = df_silver.fillna({
    "PUCompraManha": 0,
    "PUVendaManha": 0,
    "PUBaseManha": 0
})

print("Silver!")
df_silver.show(truncate=False)

silver_path = "s3a://xp-etl-pipeline/processed-data/ipca/2 - silver/"
df_silver.write.mode("overwrite").parquet(silver_path)

Silver!
+-----------+----------+---------------+-----------+-------------+------------+----+----------+-------------------+
|CompraManha|Data_Base |Data_Vencimento|PUBaseManha|PUCompraManha|PUVendaManha|Tipo|VendaManha|dt_update          |
+-----------+----------+---------------+-----------+-------------+------------+----+----------+-------------------+
|6.38       |2007-07-16|2015-05-15     |1015.27    |1020.07      |1015.59     |IPCA|6.44      |2025-05-22 14:25:52|
|6.13       |2007-06-20|2015-05-15     |1026.94    |1031.81      |1027.22     |IPCA|6.19      |2025-05-22 14:25:52|
|6.32       |2007-06-29|2015-05-15     |1015.33    |1020.35      |1015.84     |IPCA|6.38      |2025-05-22 14:25:52|
|6.35       |2007-07-04|2015-05-15     |1014.27    |1019.11      |1014.61     |IPCA|6.41      |2025-05-22 14:25:52|
|6.87       |2010-05-12|2015-05-15     |1371.09    |1375.57      |1371.72     |IPCA|6.93      |2025-05-22 14:25:52|
|6.64       |2010-06-21|2015-05-15     |1404.34    |1407.39     

In [7]:
# Gold layer: one summary row per `Tipo` with the mean buy and sell prices
# and the number of rows in the group.
gold_metrics = [
    avg("PUCompraManha").alias("Media_PUCompraManha"),
    avg("PUVendaManha").alias("Media_PUVendaManha"),
    count("*").alias("Total_Registros"),
]
df_gold = df_silver.groupBy("Tipo").agg(*gold_metrics)

print("Gold!")
df_gold.show(truncate=False)

# Persist the summary, replacing whatever a previous run wrote to this path.
gold_path = "s3a://xp-etl-pipeline/processed-data/ipca/3 - gold/"
gold_writer = df_gold.write.mode("overwrite")
gold_writer.parquet(gold_path)

Gold!
+----+-------------------+------------------+---------------+
|Tipo|Media_PUCompraManha|Media_PUVendaManha|Total_Registros|
+----+-------------------+------------------+---------------+
|IPCA|706.7272932330819  |701.4062218045111 |532            |
+----+-------------------+------------------+---------------+



In [8]:
# Shut down the Spark session now that all layers have been written.
spark.stop()