In [278]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import os, json, joblib
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from pathlib import Path
import unicodedata as ud
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

# Tratamento de dados

### Conectando com o Banco de Dados

In [66]:
load_dotenv()

HOST = os.getenv("PGHOST")
PORT = os.getenv("PGPORT")
DB   = os.getenv("PGDATABASE")
USR  = os.getenv("PGUSER")
PWD  = os.getenv("PGPASSWORD")

In [68]:
# 1) Caminho .env local
ENV_PATH = Path(r"C:\Users\cicer\Documents\Case Técnico Paipe\Análise Exploratória\.env")
print("Arquivo .env existe?", ENV_PATH.exists(), "\nCaminho:", ENV_PATH)

# 2) Carrega o .env
load_dotenv(dotenv_path=ENV_PATH, override=True)

# 3) Confere o que foi lido
cfg = {k: os.getenv(k) for k in ["PGHOST", "PGPORT", "PGDATABASE", "PGUSER"]}
print(cfg, "| PGPASSWORD set?", bool(os.getenv("PGPASSWORD")))


Arquivo .env existe? True 
Caminho: C:\Users\cicer\Documents\Case Técnico Paipe\Análise Exploratória\.env
{'PGHOST': 'localhost', 'PGPORT': '5432', 'PGDATABASE': 'PaipeTech', 'PGUSER': 'postgres'} | PGPASSWORD set? True


In [70]:
HOST = os.getenv("PGHOST", "localhost")
PORT = int(os.getenv("PGPORT") or 5432)
DB   = os.getenv("PGDATABASE")
USR  = os.getenv("PGUSER")
PWD  = os.getenv("PGPASSWORD")

url = URL.create(
    "postgresql+psycopg2",
    username=USR,
    password=PWD,   
    host=HOST,
    port=PORT,
    database=DB,
)

engine = create_engine(url, pool_pre_ping=True)

# teste rápido
with engine.begin() as conn:
    print("DB atual:", conn.execute(text("SELECT current_database()")).scalar())
    print("Versão:",  conn.execute(text("SELECT version()")).scalar().splitlines()[0])


DB atual: PaipeTech
Versão: PostgreSQL 18.0 on x86_64-windows, compiled by msvc-19.44.35215, 64-bit


In [133]:
df = pd.read_sql(text("SELECT * FROM public.df_train"), engine)

In [137]:
len(df.drop_duplicates())

325260

### Transformando as variáveis necessárias em int

In [139]:
cols_int = [
    "mintimetoneareststation", "maxtimetoneareststation", "totalfloorarea",
    "buildingyear", "coverageratio", "floorarearatio"
]
for c in cols_int:
    df[c] = (pd.to_numeric(df[c], errors="coerce")    
               .round()                              
               .astype("Int64")) 

In [140]:
df["buildingyear"] = (pd.to_numeric(df["buildingyear"], errors="coerce")
                        .round()
                        .astype("Int64"))

### Tratando valores nulos

In [197]:
((df.isna().sum() / len(df)) * 100).sort_values(ascending=False) # Percentual de nulos em cada feature

remarks                        92.191170
pricepertsubo                  80.451024
unitprice                      80.451024
totalfloorarea                 66.486811
purpose                        61.199963
renovation                     58.980815
floorplan                      56.135399
frontage                       50.149111
breadth                        46.514788
classification                 46.224559
direction                      45.621042
landshape                      45.616430
region                         45.518662
use                            24.049376
buildingyear                   22.673553
structure                      21.678042
maxtimetoneareststation         2.576708
mintimetoneareststation         2.538892
timetoneareststation            2.538892
floorarearatio                  1.430548
coverageratio                   1.430548
cityplanning                    0.928488
neareststation                  0.414130
districtname                    0.063641
year            

In [145]:
df_tratado = df.copy()

In [151]:
#remoção das colunas pricepertsubo, unitprice e remarks, pois são majoritariamente nulas
df_tratado.drop(columns=['unitprice', 'pricepertsubo', 'remarks'], inplace = True)

### criando features

In [154]:
df_tratado["time_to_station_mean"] = df_tratado[["mintimetoneareststation","maxtimetoneareststation"]].mean(axis=1)

In [156]:
# Criando ID único normalizando os nomes
def norm(s):
    if pd.isna(s): return 'UNK'
    s = str(s).strip()
    s = ud.normalize('NFKC', s)     
    s = ' '.join(s.split())         
    return s

df_tratado['district_norm'] = df_tratado['districtname'].map(norm)

df_tratado['district_uid'] = df_tratado['municipalitycode'].astype(str) + '|' + df_tratado['district_norm']

In [158]:
df_tratado["yq"] = pd.PeriodIndex(
    year=df_tratado["year"], quarter=df_tratado["quarter"], freq="Q"
).to_timestamp(how="end")

  df_tratado["yq"] = pd.PeriodIndex(


# Pré-processamento de dados

Transformando valores zerados em Nan. Os modelos de árvores lidam bem com valores Nan, não podemos deixar eles zerados

In [211]:
df_tratado[[totalfloorarea", 'time_to_station_mean', 'mintimetoneareststation', 'maxtimetoneareststation']] = df_tratado[[totalfloorarea", 'time_to_station_mean', 'mintimetoneareststation', 'maxtimetoneareststation']].replace(0, np.nan)

SyntaxError: invalid syntax (586400955.py, line 1)

Aplicando log para reduzir a influência de outliers e tornar as relações mais lineares. Como usarei modelos baseados em árvores não é obrigatório fazer isso, mas pode ajudar a melhorar os resultados.

In [189]:
df_tratado["y_log"]   = np.log1p(df_tratado["tradeprice"])
df_tratado["log_area"] = np.log1p(df_tratado["totalfloorarea"])

Declarando as variáveis categóricas que serão utilizadas no modelo.

In [191]:
cat_cols = [c for c in ["type","municipality","region","district_uid", 'landshape', 'structure', 'use', 'purpose', 'direction', 'classification', 'cityplanning', 'period', 'renovation'] if c in df_tratado.columns]

# para LightGBM/XGBoost com categórica nativa (xgboost>=1.6): deixe como 'category'
for c in cat_cols:
    df_tratado[c] = df_tratado[c].astype("category")

print("Categóricas:", cat_cols)

Categóricas: ['type', 'municipality', 'region', 'district_uid', 'landshape', 'structure', 'use', 'purpose', 'direction', 'classification', 'cityplanning', 'period', 'renovation']


# Separando em Treino e Cross Validation

A validação cruzada ajuda a medir a performance do modelo e também orienta na seleção do melhor algoritmo e na otimização dos hiperparâmetros.
Cross Validation é uma estratégia utilizada para avaliar o desempenho de modelos preditivos. A ideia central é dividir os dados em partes diferentes para que o modelo seja treinado em uma parte e testado em outra. Este processo simula o comportamento do modelo ao encontrar novos dados.

https://www.datageeks.com.br/cross-validation/

- Separando em 5 grupos
- Utilizando stratified pois é mais indicado para variáveis alvo assimétricas, elas mantém a mesma distribuição do alvo em cada fold
- cria coluna com marcação do grupo
- Cada grupo tem faixas de tamanhos parecidos

In [216]:
# ========= Ajuste aqui =========
TARGET_COL = "y_log"          # ou "tradeprice"
N_SPLITS   = 5
SEED       = 42
USE_STRATIFIED = True         # True = estratificado por bins do alvo, tenta manter a mesma distribuição do alvo em cada fold. Ideal para targets muito assimétricos.
N_BINS_STRAT = 10             # nº de faixas do alvo p/ estratificação
# ===============================

df_train = df_tratado.copy()

# Função para atribuir folds
def add_cv_folds_regression(df, target_col, n_splits=5, seed=42, use_stratified=True, n_bins=10):
    df = df.copy()
    df["cv_fold"] = -1
    
    if use_stratified:
        # Cria bins do alvo para estratificação
        # Usamos qcut com fallback para cut se houver muitos valores repetidos
        try:
            y_bins = pd.qcut(df[target_col], q=n_bins, duplicates="drop").cat.codes
        except Exception:
            y_bins = pd.cut(df[target_col], bins=n_bins, include_lowest=True).cat.codes
        
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for fold_id, (_, val_idx) in enumerate(skf.split(df, y_bins)):
            df.loc[df.index[val_idx], "cv_fold"] = fold_id
    else:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for fold_id, (_, val_idx) in enumerate(kf.split(df)):
            df.loc[df.index[val_idx], "cv_fold"] = fold_id
    
    return df

df_train = add_cv_folds_regression(
    df=df_train,
    target_col=TARGET_COL,
    n_splits=N_SPLITS,
    seed=SEED,
    use_stratified=USE_STRATIFIED,
    n_bins=N_BINS_STRAT
)

# Visualização rápida da distribuição por fold
fold_summary = (
    df_train.groupby("cv_fold")[TARGET_COL]
        .agg(["count","mean","std","min","max"])
        .reset_index()
        .sort_values("cv_fold")
)
print(fold_summary.head(10))


   cv_fold  count       mean       std       min        max
0        0  65052  17.369098  0.922481  9.472782  24.529928
1        1  65052  17.368868  0.919663  6.908755  24.834140
2        2  65052  17.369645  0.924458  7.090910  23.814308
3        3  65052  17.372055  0.923085  6.908755  24.019103
4        4  65052  17.371177  0.921503  6.908755  24.360852


In [261]:
# 1) Garanta as categóricas no df_train (cópia do df_tratado)
cat_cols = [c for c in ["type","municipality","region","district_uid",
                        "landshape","structure","use","purpose","direction",
                        "classification","cityplanning","period","renovation"]
            if c in df_train.columns]

for c in cat_cols:
    df_train[c] = df_train[c].astype("category")

print("Categóricas (df_train):", cat_cols)

# 2) Defina features (tudo exceto alvo e cv_fold)
X_cols = [c for c in df_train.columns if c not in [TARGET_COL, "cv_fold"]]

# (Opcional) Índices das categóricas dentro de X_cols — necessário para CatBoost
cat_idx = [df_train[X_cols].columns.get_loc(c) for c in cat_cols if c in X_cols]

# 3) Bandeiras úteis
IS_LOG_TARGET = (TARGET_COL == "y_log")
PRIMARY_METRIC = "rmsle"  # ou "mape"


Categóricas (df_train): ['type', 'municipality', 'region', 'district_uid', 'landshape', 'structure', 'use', 'purpose', 'direction', 'classification', 'cityplanning', 'period', 'renovation']


In [263]:
PRIMARY_METRIC = "rmsle"   # mude para "mape" se quiser otimizar MAPE
EPS = 1e-12                # proteção para divisões por ~0 no MAPE

def rmsle_from_preds(y_true, y_pred, is_log_target: bool):
    # Se o target é y_log, RMSLE == RMSE no espaço log
    if is_log_target:
        return mean_squared_error(y_true, y_pred, squared=False)
    y_true_log = np.log1p(np.clip(y_true, a_min=0, a_max=None))
    y_pred_log = np.log1p(np.clip(y_pred, a_min=0, a_max=None))
    return mean_squared_error(y_true_log, y_pred_log, squared=False)

def mape_from_preds(y_true, y_pred, is_log_target: bool):
    # MAPE sempre na escala original
    if is_log_target:
        y_true = np.expm1(y_true)
        y_pred = np.expm1(y_pred)
    denom = np.maximum(np.abs(y_true), EPS)
    return float(np.mean(np.abs((y_pred - y_true) / denom)) * 100.0)

def pick_metric(y_true, y_pred, is_log_target: bool, primary: str):
    rmsle = rmsle_from_preds(y_true, y_pred, is_log_target)
    mape  = mape_from_preds(y_true, y_pred, is_log_target)
    value = rmsle if primary == "rmsle" else mape
    return value, {"rmsle": rmsle, "mape": mape}


In [269]:
df_train = df_train.copy()

# --- 1) Tratar datetime(s) ---
datetime_cols = df_train.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist()
# Caso 'yq' esteja como string "2024Q1", transforme antes:
if "yq" in df_train.columns and df_train["yq"].dtype == "object":
    # tenta converter "YYYYQn" para um período trimestral
    df_train["yq"] = pd.PeriodIndex(df_train["yq"].astype(str), freq="Q").to_timestamp(how="end")

datetime_cols = list(set(datetime_cols + [c for c in ["yq"] if c in df_train.columns and np.issubdtype(df_train[c].dtype, np.datetime64)]))

for c in datetime_cols:
    df_train[f"{c}_year"]    = df_train[c].dt.year.astype("Int64")
    df_train[f"{c}_month"]   = df_train[c].dt.month.astype("Int64")
    df_train[f"{c}_quarter"] = df_train[c].dt.quarter.astype("Int64")
    df_train[f"{c}_dow"]     = df_train[c].dt.dayofweek.astype("Int64")
# Remova a coluna datetime original (LGBM não aceita)
df_train.drop(columns=datetime_cols, inplace=True)

# --- 2) Extrair número de "timetoneareststation" se for string tipo "12min" ---
if "timetoneareststation" in df_train.columns:
    s = df_train["timetoneareststation"].astype(str).str.extract(r"(-?\d+)", expand=False)
    df_train["timetoneareststation_min"] = pd.to_numeric(s, errors="coerce")
    # Se quiser manter a categórica original também, ok; se não, pode dropar:
    # df_train.drop(columns=["timetoneareststation"], inplace=True)

# --- 3) Converter objetos restantes para category ---
obj_cols = df_train.select_dtypes(include=["object"]).columns.tolist()
for c in obj_cols:
    df_train[c] = df_train[c].astype("category")

# Monte listas finais
X_cols = [c for c in df_train.columns if c not in [TARGET_COL, "cv_fold"]]
cat_cols = df_train[X_cols].select_dtypes(include=["category"]).columns.tolist()
cat_idx  = [df_train[X_cols].columns.get_loc(c) for c in cat_cols]
print("Categóricas (detec.):", cat_cols)


Categóricas (detec.): ['type', 'region', 'prefecture', 'municipality', 'districtname', 'neareststation', 'timetoneareststation', 'floorplan', 'landshape', 'structure', 'use', 'purpose', 'direction', 'classification', 'cityplanning', 'period', 'renovation', 'district_norm', 'district_uid']


In [271]:
import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

PRIMARY_METRIC = "rmsle"  # ou "mape"
IS_LOG_TARGET = (TARGET_COL == "y_log")
EPS = 1e-12

def rmsle_from_preds(y_true, y_pred, is_log_target: bool):
    if is_log_target:
        return mean_squared_error(y_true, y_pred, squared=False)
    return mean_squared_error(np.log1p(np.clip(y_true,0,None)),
                              np.log1p(np.clip(y_pred,0,None)), squared=False)

def mape_from_preds(y_true, y_pred, is_log_target: bool):
    if is_log_target:
        y_true, y_pred = np.expm1(y_true), np.expm1(y_pred)
    denom = np.maximum(np.abs(y_true), EPS)
    return float(np.mean(np.abs((y_pred - y_true) / denom)) * 100.0)

def pick_metric(y_true, y_pred, is_log_target: bool, primary: str):
    rmsle = rmsle_from_preds(y_true, y_pred, is_log_target)
    mape  = mape_from_preds(y_true, y_pred, is_log_target)
    return (rmsle if primary=="rmsle" else mape), {"rmsle":rmsle, "mape":mape}

def objective_lgbm(trial: optuna.trial.Trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1500, 6000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.15, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 15, 255),
        "max_depth": trial.suggest_int("max_depth", -1, 16),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": SEED,
        "n_jobs": -1,
    }

    scores = []
    for fold in range(N_SPLITS):
        tr = df_train[df_train["cv_fold"] != fold]
        va = df_train[df_train["cv_fold"] == fold]

        X_tr, y_tr = tr[X_cols], tr[TARGET_COL]
        X_va, y_va = va[X_cols], va[TARGET_COL]

        model = LGBMRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            categorical_feature=cat_cols,  # <<< informar nomes das categóricas
            callbacks=[
                lgb.early_stopping(stopping_rounds=200, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )
        pred = model.predict(X_va, num_iteration=model.best_iteration_)
        value, both = pick_metric(y_va.values, pred, IS_LOG_TARGET, PRIMARY_METRIC)

        trial.set_user_attr(f"fold{fold}_rmsle", both["rmsle"])
        trial.set_user_attr(f"fold{fold}_mape",  both["mape"])
        scores.append(value)

    trial.set_user_attr("cv_mean_rmsle", float(np.mean([trial.user_attrs[f"fold{i}_rmsle"] for i in range(N_SPLITS)])))
    trial.set_user_attr("cv_mean_mape",  float(np.mean([trial.user_attrs[f"fold{i}_mape"]  for i in range(N_SPLITS)])))
    return float(np.mean(scores))


In [273]:
study_lgbm = optuna.create_study(direction="minimize", study_name="lgbm_rmsle_mape")
study_lgbm.optimize(objective_lgbm, n_trials=50, show_progress_bar=False)

print("LGBM | métrica otimizada:", PRIMARY_METRIC.upper())
print("LGBM | best value:", study_lgbm.best_value)
print("LGBM | best params:", study_lgbm.best_params)

best_trial = study_lgbm.best_trial
print("CV mean RMSLE:", best_trial.user_attrs.get("cv_mean_rmsle"))
print("CV mean MAPE :", best_trial.user_attrs.get("cv_mean_mape"))


[I 2025-10-05 14:29:45,138] A new study created in memory with name: lgbm_rmsle_mape


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.177082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6467
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370436




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.170962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6478
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370494




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6461
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370299




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.208172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6472
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.369697




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.186757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6478
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.369916


[I 2025-10-05 15:40:02,328] Trial 0 finished with value: 0.046707414188057336 and parameters: {'n_estimators': 5469, 'learning_rate': 0.0031670797572642064, 'num_leaves': 215, 'max_depth': 5, 'min_child_samples': 111, 'subsample': 0.8920752157966243, 'colsample_bytree': 0.7389341137339216, 'reg_alpha': 0.00015865188707586502, 'reg_lambda': 0.02590064385534672}. Best is trial 0 with value: 0.046707414188057336.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121807 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6467
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370436




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.209392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6478
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370494




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6461
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370299




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.101566 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6472
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.369697




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.349189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6478
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.369916


[I 2025-10-05 16:16:56,590] Trial 1 finished with value: 0.043229293184852684 and parameters: {'n_estimators': 3551, 'learning_rate': 0.011280173418344374, 'num_leaves': 199, 'max_depth': 5, 'min_child_samples': 59, 'subsample': 0.9236350738547718, 'colsample_bytree': 0.7518755444201586, 'reg_alpha': 0.017662230208978946, 'reg_lambda': 9.447154586821838e-07}. Best is trial 1 with value: 0.043229293184852684.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6467
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370436




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6476
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 42
[LightGBM] [Info] Start training from score 17.370494




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6459
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 42
[LightGBM] [Info] Start training from score 17.370299




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6470
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 42
[LightGBM] [Info] Start training from score 17.369697




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6476
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 42
[LightGBM] [Info] Start training from score 17.369916


[I 2025-10-05 16:17:58,875] Trial 2 finished with value: 0.044830831754890804 and parameters: {'n_estimators': 5198, 'learning_rate': 0.05233114168994675, 'num_leaves': 191, 'max_depth': 5, 'min_child_samples': 199, 'subsample': 0.8368336624915496, 'colsample_bytree': 0.9804311578275458, 'reg_alpha': 3.5954338841634723e-06, 'reg_lambda': 1.515705970062667e-08}. Best is trial 1 with value: 0.043229293184852684.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013839 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6467
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370436




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6478
[LightGBM] [Info] Number of data points in the train set: 260208, number of used features: 43
[LightGBM] [Info] Start training from score 17.370494


[W 2025-10-05 16:19:43,715] Trial 3 failed with parameters: {'n_estimators': 5378, 'learning_rate': 0.14778288699463982, 'num_leaves': 172, 'max_depth': 8, 'min_child_samples': 196, 'subsample': 0.8413745174545074, 'colsample_bytree': 0.8457631987151196, 'reg_alpha': 1.9286973233928386, 'reg_lambda': 0.7381161160331048} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\cicer\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\cicer\AppData\Local\Temp\ipykernel_39836\1167516592.py", line 61, in objective_lgbm
    pred = model.predict(X_va, num_iteration=model.best_iteration_)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cicer\anaconda3\Lib\site-packages\lightgbm\sklearn.py", line 1144, in predict
    return self._Booster.predict(  # type: ignore[union-attr]
           ^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [274]:
import json, os
os.makedirs("models_lgbm", exist_ok=True)
with open("models_lgbm/lgbm_best_so_far.json", "w", encoding="utf-8") as f:
    json.dump(study_lgbm.best_params, f, ensure_ascii=False, indent=2)


In [276]:
len(study_lgbm.trials)
study_lgbm.best_value
study_lgbm.best_params

{'n_estimators': 3551,
 'learning_rate': 0.011280173418344374,
 'num_leaves': 199,
 'max_depth': 5,
 'min_child_samples': 59,
 'subsample': 0.9236350738547718,
 'colsample_bytree': 0.7518755444201586,
 'reg_alpha': 0.017662230208978946,
 'reg_lambda': 9.447154586821838e-07}

In [None]:
SAVE_DIR = "models_lgbm"
os.makedirs(SAVE_DIR, exist_ok=True)

best_params = study_lgbm.best_params.copy()
best_params.update({
    "random_state": SEED,
    "n_jobs": -1,
})

X_all = df_train[X_cols]
y_all = df_train[TARGET_COL]

final_lgbm = LGBMRegressor(**best_params)
final_lgbm.fit(X_all, y_all)

params_path = os.path.join(SAVE_DIR, "lgbm_best_params.json")
model_path  = os.path.join(SAVE_DIR, "lgbm_model_final.pkl")

with open(params_path, "w", encoding="utf-8") as f:
    json.dump(best_params, f, ensure_ascii=False, indent=2)
joblib.dump(final_lgbm, model_path)

print(f"✅ Parâmetros salvos em: {params_path}")
print(f"✅ Modelo salvo em:      {model_path}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6471
[LightGBM] [Info] Number of data points in the train set: 325260, number of used features: 43
[LightGBM] [Info] Start training from score 17.370168
