# Passo 1: Ingestão e Tratamento (Spark)

## Objetivo
Baixar os dados, processar com Spark e salvar em tabela (Hive/Delta).

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, lit
from pyspark.sql.types import DoubleType, IntegerType
import kagglehub
import os
import glob

# Initialize Spark
# getOrCreate() reuses an already-running session when there is one, so
# re-running this cell does not start a second session.
# "spark.sql.warehouse.dir" is pointed at the relative "spark-warehouse"
# directory (Spark's setting for where managed tables are stored).
spark = (
    SparkSession.builder
    .appName("RetailPriceOptimization")
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .getOrCreate()
)

print("Spark Session Created")

In [None]:
# 1. Download Data (Local)
# Kaggle dataset handle in "<owner>/<dataset>" form.
dataset_handle = "suddharshan/retail-price-optimization"

print("Baixando dataset do Kaggle...")
# `path` is whatever location kagglehub reports for the downloaded files; the
# loading step uses it as the root of its CSV search.
path = kagglehub.dataset_download(dataset_handle)
print("Path:", path)

In [None]:
# 2. Load into Spark
# Look for CSV files anywhere under the download location ("**" with
# recursive=True also matches files directly inside `path`). glob returns
# matches in arbitrary order, so sort them to make the selected file
# reproducible when more than one CSV is present.
csv_files = sorted(glob.glob(os.path.join(path, "**/*.csv"), recursive=True))
if not csv_files:
    # Fail with an explicit message instead of an opaque IndexError on [0].
    raise FileNotFoundError(f"No CSV file found under: {path}")

file_path = csv_files[0]
print(f"Reading: {file_path}")

# header=True takes column names from the first row; inferSchema=True asks
# Spark to infer column types from the data rather than reading all strings.
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Preview
df.show(5)

In [None]:
# 3. Transform / Clean
# Add a "date" column by parsing the "month_year" column as dd-MM-yyyy.
parsed_date = to_date(col("month_year"), "dd-MM-yyyy")
df = df.withColumn("date", parsed_date)

# Expose the revenue metric under the name "total_revenue"; the rename is
# skipped when the source column is not present.
if "total_price" in df.columns:
    df = df.withColumnRenamed("total_price", "total_revenue")

# Force the metric columns to explicit numeric types rather than relying only
# on what inferSchema picked at load time.
df = df.withColumn("unit_price", col("unit_price").cast(DoubleType()))
df = df.withColumn("qty", col("qty").cast(IntegerType()))

# NOTE: no ordering is applied here; sort by ("product_id", "date") at read
# time if a logical viewing order is needed.

print("Schema:")
df.printSchema()

In [None]:
# 4. Save as Table
table_name = "retail_price_clean"

# "overwrite" mode replaces any existing table of the same name, so this cell
# can be re-run safely. No explicit format is set, so the session's default
# data source is used — NOTE(review): confirm this matches the intended
# Hive/Delta target.
writer = df.write.mode("overwrite")
writer.saveAsTable(table_name)

print(f"Tabela '{table_name}' salva com sucesso.")