# Passo 6: Treinamento Final e Deploy

## Objetivo
Treinar o modelo vencedor (definido na etapa de experimentação) com **todos os dados** disponíveis e prepará-lo para deploy.

**Steps:**
1. Setup e Carga Total dos Dados
2. Treinamento com Hiperparâmetros Otimizados
3. Registro do Modelo Final no MLflow e Salvamento no DBFS

In [None]:
import mlflow
import mlflow.spark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline

# Build the Spark session for the deploy notebook (getOrCreate returns the
# already-active session when one exists).
spark = (
    SparkSession.builder
    .appName("RetailPriceFeatures_Deploy")
    .config("spark.sql.warehouse.dir", "spark-warehouse")
    .getOrCreate()
)

# Load FULL data: prefer the catalog table, and fall back to the parquet file
# at ../data/retail_price_clean.parquet when the table cannot be read.
try:
    df_full = spark.table("retail_price_clean")
except Exception as table_error:
    # `except Exception` (instead of a bare `except:`) keeps the best-effort
    # fallback but lets KeyboardInterrupt / SystemExit propagate.
    # pandas is imported lazily because it is only needed on this path.
    import pandas as pd

    print(f"Could not read table 'retail_price_clean' ({table_error!r}); using parquet fallback.")
    df_full = spark.createDataFrame(pd.read_parquet('../data/retail_price_clean.parquet'))

# count() triggers a Spark action, so this also validates that the data is readable.
print(f"Total Data for Training: {df_full.count()}")

In [None]:
# --- FEATURE ENGINEERING REPLICATION ---
# Every transformation below must stay identical to the modeling phase.

# Per-product window ordered by date; drives the one-step lag columns.
window_spec = Window.partitionBy("product_id").orderBy("date")

previous_qty = F.lag("qty", 1).over(window_spec)
previous_unit_price = F.lag("unit_price", 1).over(window_spec)
df_proc = df_full.withColumn("lag_qty_1", previous_qty)
df_proc = df_proc.withColumn("lag_price_1", previous_unit_price)

# Null handling happens after the lags are created and before the derived columns.
df_proc = df_proc.na.fill(0)

# Price gap against competitor 1, and the month extracted from the date column.
df_proc = df_proc.withColumn("price_diff_comp1", F.col("unit_price") - F.col("comp_1"))
df_proc = df_proc.withColumn("month", F.month("date"))

# Stages: index both categorical columns, one-hot encode only the category index.
indexer_prod = StringIndexer(
    inputCol="product_id",
    outputCol="product_id_idx",
    handleInvalid="keep",
)
indexer_cat = StringIndexer(
    inputCol="product_category_name",
    outputCol="product_category_idx",
    handleInvalid="keep",
)
encoder_cat = OneHotEncoder(
    inputCols=["product_category_idx"],
    outputCols=["product_category_vec"],
)

# Features list, assembled from groups; the resulting order is significant
# because it fixes the position of each feature in the assembled vector.
base_features = [
    'freight_price',
    'unit_price',
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'product_weight_g',
    'product_score',
    'customers',
    'weekday',
    'weekend',
    'holiday',
    'month',
    'year',
    'volume',
]
# Yields comp_1, ps1, fp1, comp_2, ps2, fp2, comp_3, ps3, fp3 (in that order).
competitor_features = [
    column
    for rank in (1, 2, 3)
    for column in (f"comp_{rank}", f"ps{rank}", f"fp{rank}")
]
lag_and_diff_features = ['lag_price', 'lag_qty_1', 'lag_price_1', 'price_diff_comp1']

num_features = base_features + competitor_features + lag_and_diff_features

# Numeric columns first, then the product index and the encoded category vector.
assembler = VectorAssembler(
    inputCols=[*num_features, "product_id_idx", "product_category_vec"],
    outputCol="features",
    handleInvalid="keep",
)

In [None]:
# --- FINAL TRAINING ---

# Set the best hyperparameters found in notebook 04 here.
# Example: GBT with maxDepth 5, maxIter 50.
# Kept in a single dict so the values used for training and the values
# logged to MLflow cannot drift apart.
final_model_params = {"maxDepth": 5, "maxIter": 50, "seed": 42}

final_model_estimator = GBTRegressor(
    featuresCol="features",
    labelCol="qty",
    **final_model_params
)

# Full pipeline: indexing, encoding, vector assembly, then the regressor.
deploy_pipeline = Pipeline(stages=[indexer_prod, indexer_cat, encoder_cat, assembler, final_model_estimator])

experiment_deploy = "/Shared/RetailPrice_Deploy"
mlflow.set_experiment(experiment_deploy)

with mlflow.start_run(run_name="Deploy_Full_Data") as run:
    # Record the hyperparameters on the run so the registered model can be
    # traced back to its configuration. Keys are prefixed to avoid clashing
    # with parameter names logged by other tooling on the same run.
    mlflow.log_params({f"gbt.{name}": value for name, value in final_model_params.items()})

    print("Training Final Model on Full Dataset...")
    final_pipeline_model = deploy_pipeline.fit(df_proc)
    print("Model Trained.")

    # Register in MLflow
    mlflow.spark.log_model(final_pipeline_model, "model", registered_model_name="RetailPrice_Predictor_Prod")

    # Save to FileSystem (DBFS/S3/Local) for usage in app.py or optimization jobs
    model_save_path = "../models/spark_model_production"
    final_pipeline_model.write().overwrite().save(model_save_path)
    print(f"Model saved locally to {model_save_path}")

    # Log the save location so the run points at the exported model
    mlflow.log_param("model_location", model_save_path)