In [0]:
# --- 0. IMPORTS E SETUP ---
%load_ext autoreload
%autoreload 2

import sys
import os
import pickle
import pandas as pd
import numpy as np
import mlflow
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pyspark.sql.functions as F

# Add the current working directory (expected to be the project root) to sys.path so the 'src' modules can be imported
sys.path.append(os.getcwd())

# Project module imports
from src.validation.config import Config
from src.validation.data import DataIngestion
from src.validation.pipeline import ProjectPipeline
from src.validation.trainer import ModelTrainer 
from src.deploy.wrapper import UnifiedForecaster

# Bibliotecas de Modelagem e MLflow
from darts.models import LightGBMModel
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient

# Spark tuning for Delta Lake writes: both session-level flags are set to "true".
_DELTA_WRITE_OPTIONS = {
    "spark.databricks.delta.optimizeWrite.enabled": "true",
    "spark.databricks.delta.autoCompact.enabled": "true",
}
for _conf_key, _conf_value in _DELTA_WRITE_OPTIONS.items():
    spark.conf.set(_conf_key, _conf_value)

In [0]:
# ==============================================================================
# 1. ENVIRONMENT AND DATE CONFIGURATION
# ==============================================================================
config = Config(spark)

# [IMPORTANT] Force the experiment name to the DEPLOY experiment.
config.EXPERIMENT_NAME = "/Workspace/Shared/data_science/projetos/cvc_curva_de_vendas_por_canal/experiments/Model_Deploy_CVC_Loja"

# Training cutoff: first day of the current month, moved back two months.
# Per the original author's note, stepping back two months keeps the 35-day
# forecast horizon inside the data that is currently available.
_first_of_month = datetime.today().date().replace(day=1)
data_final_deploy = pd.Timestamp(_first_of_month) - relativedelta(months=2)

# The validation (backtest) window opens two months before the training cutoff.
data_inicio_validacao = data_final_deploy - relativedelta(months=2)

# Ingestion upper bound: 90 days past today, so everything available is read
# (the summary below labels this as the covariates ingestion period).
data_limite_ingestao = pd.Timestamp(datetime.today()) + relativedelta(days=90)

# Config stores every date as a 'YYYY-MM-DD' string.
_date_format = "%Y-%m-%d"
config.TRAIN_END_DATE = data_final_deploy.strftime(_date_format)
config.VAL_START_DATE = data_inicio_validacao.strftime(_date_format)
config.INGESTION_END = data_limite_ingestao.strftime(_date_format)

# Human-readable summary of the effective configuration.
_summary_lines = [
    f"üìÇ EXPERIMENTO ALVO: {config.EXPERIMENT_NAME}",
    f"‚è±Ô∏è PER√çODO DE TREINO: {config.DATA_START} at√© {config.TRAIN_END_DATE}",
    f"üì• PER√çODO DE INGEST√ÉO (Covari√°veis): At√© {config.INGESTION_END}",
    f"üîç JANELA DE TESTE (BACKTEST): {config.VAL_START_DATE} at√© {config.TRAIN_END_DATE}",
]
print("\n".join(_summary_lines))

In [0]:
# Sanity check: echo the four date settings that drive the rest of the notebook.
for _date_attr in ("DATA_START", "VAL_START_DATE", "TRAIN_END_DATE", "INGESTION_END"):
    print(getattr(config, _date_attr))

In [0]:
# ==========================================================================
# 3. INGESTION AND PREPARATION (CLEAN VERSION)
# ==========================================================================
import pandas as pd
from datetime import datetime

# --- 1. Ingestion and filters ---
ingestion = DataIngestion(spark, config)
df_spark_raw = ingestion.create_training_set()

# Keep only the window of interest: history plus the future span needed for
# covariates (both bounds inclusive).
df_spark_filtered = df_spark_raw.filter(
    df_spark_raw["data"].between(config.DATA_START, config.INGESTION_END)
)
df_global_support = ingestion.get_global_support()[config.DATA_START : config.INGESTION_END]

print("‚è≥ Construindo objetos Darts...")
# Build the initial lists; covariates are assumed to arrive complete from ingestion.
full_target_list_long, full_covariates_list = ingestion.build_darts_objects(
    df_spark_filtered, df_global_support
)

# --- 2. Target handling (safe cut) ---
# Only the future part of the target (sales) is removed. Series that already end
# on or before today are kept untouched, which avoids calling drop_after with a
# point beyond the end of a store that stopped selling earlier.
# NOTE(review): Darts documents drop_after as dropping the split point itself as
# well, so trimmed series presumably end one step before today, while a series
# ending exactly today keeps today's value - confirm this asymmetry is intended.
cut_date = pd.Timestamp(datetime.today().date())
full_target_list = [
    series.drop_after(cut_date) if series.end_time() > cut_date else series
    for series in full_target_list_long
]

# Final debug output: end date of the first target / covariate series, if any.
print("‚úÖ Processamento Conclu√≠do.")
if full_target_list:
    _target_end = full_target_list[0].end_time()
    print(f"üîç [DEBUG] Target (amostra) termina em: {_target_end}")
if full_covariates_list:
    _covariates_end = full_covariates_list[0].end_time()
    print(f"üîç [DEBUG] Covariates (amostra) terminam em: {_covariates_end}")

In [0]:
# ==============================================================================
# MASTER RUN: scaling, backtest, quality gate, final training and deploy
# ==============================================================================
# Everything below executes inside a single parent MLflow run. Any exception
# raised inside the `with` block aborts the cell before the model is registered.
mlflow.set_experiment(config.EXPERIMENT_NAME)
with mlflow.start_run(run_name=f"Pipeline_Completo_{config.VERSION}") as parent_run:

    print(f"üîó Parent Run ID: {parent_run.info.run_id}")
    mlflow.log_param("pipeline_type", "auto_retrain_w_quality_gate")

    # ==========================================================================
    # 4. PIPELINE PREPARATION (SCALERS)
    # ==========================================================================
    pipeline = ProjectPipeline()
    val_cutoff_dt = pd.Timestamp(config.VAL_START_DATE) - pd.Timedelta(days=1)

    # Fit the scalers on pre-validation history only, to avoid data leakage.
    # NOTE(review): Darts documents drop_after as dropping the split point too,
    # so these subsets presumably end one step before val_cutoff_dt - confirm
    # that this extra step is intended.
    train_subset_for_fit = [s.drop_after(val_cutoff_dt) for s in full_target_list]
    cov_subset_for_fit = [c.drop_after(val_cutoff_dt) for c in full_covariates_list]

    print("‚öôÔ∏è Ajustando Scalers...")
    pipeline.fit(train_subset_for_fit, cov_subset_for_fit)

    # Transform the complete series with the scalers fitted above.
    scaled_series, scaled_covariates = pipeline.transform(full_target_list, full_covariates_list)

    train_series_static = [s.drop_after(val_cutoff_dt) for s in scaled_series]
    train_cov_static = [c.drop_after(val_cutoff_dt) for c in scaled_covariates]
    val_series_original = pipeline.inverse_transform(scaled_series, partial=True)

    # ==========================================================================
    # 5. AUTOMATIC VALIDATION (BACKTEST)
    # ==========================================================================
    # The same hyper-parameters are reused for the final model in section 7.
    model_params = {
        "lags": 12,
        "lags_future_covariates": [0,1,2,3],
        "output_chunk_length": 1,
        "random_state": 42
    }
    models_dict = {"model_deploy": LightGBMModel(**model_params)}

    print("\nüöÄ Executando Backtest (3 Meses)...")
    trainer = ModelTrainer(config, models_dict)
    trainer.train_evaluate_walkforward(
        train_series_static=train_series_static,
        train_covs_static=train_cov_static,
        full_series_scaled=scaled_series,
        full_covariates_scaled=scaled_covariates,
        val_series_original=val_series_original,
        target_pipeline=pipeline
    )

    # ==========================================================================
    # 6. GATEKEEPER (SAFETY LOCK)
    # ==========================================================================
    print("\nüëÆ Verificando Qualidade do Modelo (Gatekeeper)...")

    # A. Read the backtest results (per the original author's note, the
    #    validation step above writes them to this Delta table), restricted to
    #    the current pipeline version so stale rows are ignored.
    # NOTE(review): the table name is hard-coded to the ds_dev catalog while the
    # model name below uses config.CATALOG - confirm this is intended.
    df_results = (
        spark.table("ds_dev.cvc_val.resultado_metricas_treinamento_lojas")
        .filter(F.col("versao") == config.VERSION)
    )

    # B. Find the most recent validated month.
    try:
        last_month_str = df_results.select(F.max("metrica_mes")).collect()[0][0]
    except Exception as exc:
        # Catch only real errors (not KeyboardInterrupt/SystemExit) and keep the
        # original cause chained for debugging.
        raise Exception("‚ùå ERRO CR√çTICO: Tabela de resultados vazia. Valida√ß√£o falhou silenciosamente?") from exc

    if not last_month_str:
        # Fallback when no month is found (may happen if the backtest skipped
        # every month): use the month of the training end date (YYYY-MM).
        print("‚ö†Ô∏è Aviso: N√£o foi poss√≠vel identificar m√™s de valida√ß√£o. Tentando pegar o √∫ltimo dispon√≠vel.")
        last_month_str = config.TRAIN_END_DATE[:7] # YYYY-MM

    print(f"   üìÖ Analisando m√™s de corte: {last_month_str}")

    # C. RMSE between 'real' and 'previsao' for that month only.
    rmse_check = (
        df_results.filter(F.col("metrica_mes") == last_month_str)
        .select(F.sqrt(F.mean(F.pow(F.col("real") - F.col("previsao"), 2))).alias("rmse"))
        .collect()[0]["rmse"]
    )

    # A null RMSE means there were no validation rows for the month. The model
    # was therefore never validated and must not be deployed. (The previous
    # sentinel of 999999.0 was below LIMIT_RMSE, so it silently passed the gate.)
    if rmse_check is None:
        mlflow.set_tag("pipeline_status", "BLOCKED_BY_QUALITY")
        raise Exception(
            f"BLOQUEIO DE DEPLOY: nenhuma metrica de validacao encontrada para o mes "
            f"{last_month_str}. Pipeline Abortado."
        )

    print(f"   üìâ RMSE Calculado: {rmse_check:.4f}")

    # Log on the parent run for auditing.
    mlflow.log_metric("gatekeeper_last_month_rmse", rmse_check)

    # D. APPLY THE LOCK
    # NOTE(review): this threshold looks very permissive - confirm it matches
    # the scale of the target.
    LIMIT_RMSE = 1000000

    if rmse_check >= LIMIT_RMSE:
        error_msg = (f"‚õî BLOQUEIO DE DEPLOY: RMSE do √∫ltimo m√™s ({rmse_check:.2f}) "
                     f"excedeu o limite aceit√°vel ({LIMIT_RMSE}). Pipeline Abortado.")
        # Tag the parent run explicitly so the reason for the failure is visible.
        mlflow.set_tag("pipeline_status", "BLOCKED_BY_QUALITY")
        raise Exception(error_msg)

    print("‚úÖ Crit√©rio de Qualidade Aprovado! Prosseguindo para Deploy.")

    # ==========================================================================
    # 7. FINAL TRAINING (FULL DATASET)
    # ==========================================================================
    print("\nüèãÔ∏è Iniciando Treinamento Final (Full Dataset)...")
    final_model = LightGBMModel(**model_params)
    final_model.fit(scaled_series, future_covariates=scaled_covariates)

    # ==========================================================================
    # 8. REGISTRATION AND DEPLOY
    # ==========================================================================

    catalog_model_name = f"{config.CATALOG}.cvc_pred.cvc_lojas_forecast_production"

    print(f"üíæ Registrando modelo: {catalog_model_name}")

    # Traceability tags.
    mlflow.set_tag("parent_run_id", parent_run.info.run_id)
    mlflow.set_tag("quality_check_rmse", f"{rmse_check:.2f}")

    # Artifact preparation (pipeline, model, covariates, metadata).
    sample_ts = full_target_list[0]
    sample_cov = full_covariates_list[0]

    training_metadata = {
        "static_cols_order": [c for c in sample_ts.static_covariates.columns.tolist() if c != "codigo_loja"],
        "covariate_cols_order": sample_cov.components.tolist(),
        # NOTE(review): presumably lags (12) + largest future-covariate lag (3);
        # confirm against what UnifiedForecaster expects.
        "max_lag": 15
    }

    pipeline_path, model_path, cov_path, meta_path = "pipeline.pkl", "lgbm_model.pkl", "future_covariates.pkl", "model_metadata.pkl"
    local_artifact_files = [pipeline_path, model_path, cov_path, meta_path]

    try:
        with open(pipeline_path, "wb") as f: pickle.dump(pipeline, f)
        with open(model_path, "wb") as f: pickle.dump(final_model, f)
        with open(cov_path, "wb") as f: pickle.dump(scaled_covariates, f)
        with open(meta_path, "wb") as f: pickle.dump(training_metadata, f)

        artifacts = {"pipeline": pipeline_path, "darts_model": model_path, "future_covariates": cov_path, "metadata": meta_path}

        # Signature: inferred from a one-row example input and output.
        market_cols = list(df_global_support.columns)
        full_input_dict = {
            **{col: [0.0] for col in market_cols},
            "data": ["2025-01-01"], "codigo_loja": ["1"], "target_vendas": [1000.0],
            "n": [35], "is_feriado": [0.0], "cluster_loja": ["A"], "sigla_uf": ["SP"], "tipo_loja": ["SHOPPING"], "modelo_loja": ["PADRAO"]
        }
        input_example = pd.DataFrame(full_input_dict)
        output_example = pd.DataFrame({"data_previsao": ["2025-01-02"], "previsao_venda": [1050.0], "codigo_loja": ["1"]})
        signature = infer_signature(input_example, output_example)

        # Log the model and register it under the catalog name.
        model_info = mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=UnifiedForecaster(),
            artifacts=artifacts,
            input_example=input_example,
            signature=signature,
            metadata={"description": "lightgbm"},
            registered_model_name=catalog_model_name
        )

        # Promote the freshly registered version to the "Champion" alias.
        client = MlflowClient()
        mv = model_info.registered_model_version
        client.set_registered_model_alias(name=catalog_model_name, alias="Champion", version=mv)

        client.update_model_version(
            name=catalog_model_name, version=mv,
            description=f"Auto-Retrain. RMSE Check: {rmse_check:.2f}. Pipeline ID: {parent_run.info.run_id}"
        )
    finally:
        # Always remove the local pickle files, even when logging or
        # registration fails, so failed runs leave nothing in the working dir.
        for p in local_artifact_files:
            if os.path.exists(p): os.remove(p)

print(f"\n‚ú® Pipeline Finalizado com Sucesso! RMSE Validado: {rmse_check:.2f}")