In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, when, regexp_extract, upper
from delta.tables import DeltaTable

In [2]:
# Configurar acesso ao MinIO
minio_config = {
    "fs.s3a.access.key": "admin",
    "fs.s3a.secret.key": "senhasegura",
    "fs.s3a.endpoint": "http://minio:9000",
    "fs.s3a.path.style.access": "true",
    "fs.s3a.connection.ssl.enabled": "false",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.impl.disable.cache": "true",
    "fs.spark.hadoop.fs.s3a.attempts.maximum":"5"
}

def create_spark_session() -> SparkSession:
    spark = (
        SparkSession.builder \
            .appName("GoldZone") \
            .config("spark.jars.packages", "io.delta:delta-core_2.12:1.0.0,org.apache.hadoop:hadoop-aws:3.3.1") 
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 
            .config("spark.hadoop.fs.s3minio.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            .getOrCreate()
    )
    spark.sparkContext.setLogLevel("WARN")
    return spark

spark = create_spark_session()

for key, value in minio_config.items():
    spark._jsc.hadoopConfiguration().set(key, value)

In [3]:
def transform_matches():
    print("Transformando tabela matches...")
    
    matches = spark.read.format("delta").options(**minio_config).load("s3a://bonze/soccer/match")
    
    silver_matches = matches.select(
        col("id").alias("match_id"),
        col("season"),
        to_date(col("date")).alias("match_date"),
        upper(col("home_team")).alias("home_team"),
        upper(col("away_team")).alias("away_team"),
        col("home_score").cast("integer"),
        col("away_score").cast("integer"),
        col("competition").alias("league"),
        regexp_extract(col("season"), r"(\d{4})", 1).cast("integer").alias("season_start_year"),
        when(col("neutral") == "TRUE", True).otherwise(False).alias("is_neutral"),
        col("attendance").cast("integer")
    ).dropDuplicates(["match_id"])
    
    silver_matches = silver_matches.fillna({
        "home_score": 0,
        "away_score": 0,
        "attendance": 0,
        "is_neutral": False
    })
    
    # Salvar como Delta particionado
    silver_matches.write.format("delta") \
        .options(**minio_config) \
        .partitionBy("season") \
        .mode("overwrite") \
        .save("s3a://silver/soccer/matches")
    
    print("Transformação de matches concluída!")

In [4]:
def transform_teams():
    print("Transformando tabela teams...")
    
    teams = spark.read.format("delta").options(**minio_config).load("s3a://bronze/soccer/player")
    
    silver_teams = teams.select(
        col("id").alias("team_id"),
        upper(col("name")).alias("team_name"),
        col("country").alias("team_country"),
        col("founded").cast("integer").alias("founded_year"),
        col("venue_name").alias("stadium"),
        col("venue_capacity").cast("integer")
    ).dropDuplicates(["team_id"])
    
    silver_teams.write.format("delta") \
        .options(**minio_config) \
        .mode("overwrite") \
        .save("s3a://silver/soccer/teams")
    
    print("Transformação de teams concluída!")


transform_matches()
transform_teams()


spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW silver_matches AS
SELECT * FROM delta.`s3a://silver/soccer/matches`
""")

spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW silver_teams AS
SELECT * FROM delta.`s3a://silver/soccer/teams`
""")

print("Tabelas na camada Silver:")
spark.sql("SHOW TABLES IN silver").show()


## Camada Silver
"""
Tabela: matches

| Coluna | Tipo | Descrição | Regras de Transformação |
|--------|------|-----------|-------------------------|
| match_id | BIGINT | ID único da partida | Renomeado de 'id' |
| season | STRING | Temporada no formato YYYY/YYYY | - |
| match_date | DATE | Data da partida | Convertido de string para date |
| home_team | STRING | Nome do time da casa | Convertido para maiúsculas |
| away_team | STRING | Nome do time visitante | Convertido para maiúsculas |
| home_score | INTEGER | Gols do time da casa | Convertido para inteiro, nulos = 0 |
| away_score | INTEGER | Gols do time visitante | Convertido para inteiro, nulos = 0 |
| league | STRING | Nome da competição | Renomeado de 'competition' |
| season_start_year | INTEGER | Ano de início da temporada | Extraído da coluna season |
| is_neutral | BOOLEAN | Indica se é campo neutro | Convertido de string para booleano |
| attendance | INTEGER | Público presente | Convertido para inteiro, nulos = 0 |

Tabela: teams

| Coluna | Tipo | Descrição | Regras de Transformação |
|--------|------|-----------|-------------------------|
| team_id | BIGINT | ID único do time | Renomeado de 'id' |
| team_name | STRING | Nome do time | Convertido para maiúsculas |
| team_country | STRING | País do time | - |
| founded_year | INTEGER | Ano de fundação | Convertido para inteiro |
| stadium | STRING | Nome do estádio | Renomeado de 'venue_name' |
| venue_capacity | INTEGER | Capacidade do estádio | Convertido para inteiro |
"""

Transformando tabela matches...


Py4JJavaError: An error occurred while calling o70.load.
: java.io.FileNotFoundException: Bucket bonze does not exist
	at org.apache.hadoop.fs.s3a.S3AFileSystem.verifyBucketExists(S3AFileSystem.java:393)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:322)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3469)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.sql.delta.DeltaTableUtils$.findDeltaTableRoot(DeltaTable.scala:186)
	at org.apache.spark.sql.delta.sources.DeltaDataSource$.parsePathIdentifier(DeltaDataSource.scala:316)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.x$1$lzycompute(DeltaTableV2.scala:70)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.x$1(DeltaTableV2.scala:65)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.timeTravelByPath$lzycompute(DeltaTableV2.scala:65)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.timeTravelByPath(DeltaTableV2.scala:65)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.$anonfun$timeTravelSpec$1(DeltaTableV2.scala:99)
	at scala.Option.orElse(Option.scala:447)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.timeTravelSpec$lzycompute(DeltaTableV2.scala:99)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.timeTravelSpec(DeltaTableV2.scala:95)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.snapshot$lzycompute(DeltaTableV2.scala:103)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.snapshot(DeltaTableV2.scala:102)
	at org.apache.spark.sql.delta.catalog.DeltaTableV2.toBaseRelation(DeltaTableV2.scala:168)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.$anonfun$createRelation$5(DeltaDataSource.scala:211)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:141)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:139)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.recordFrameProfile(DeltaDataSource.scala:51)
	at org.apache.spark.sql.delta.sources.DeltaDataSource.createRelation(DeltaDataSource.scala:169)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:350)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:274)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$1(DataFrameReader.scala:243)
	at scala.Option.map(Option.scala:230)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:188)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
