In [290]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import os, json, joblib
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from pathlib import Path
import unicodedata as ud
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
import optuna
from sklearn.metrics import mean_squared_error,  mean_squared_log_error, mean_absolute_percentage_error
from lightgbm import LGBMRegressor
import lightgbm as lgb

# Tratamento de dados

### Conectando com o Banco de Dados

In [11]:
load_dotenv()

HOST = os.getenv("PGHOST")
PORT = os.getenv("PGPORT")
DB   = os.getenv("PGDATABASE")
USR  = os.getenv("PGUSER")
PWD  = os.getenv("PGPASSWORD")

In [14]:
# 1) Caminho .env local
ENV_PATH = Path(r"C:\Users\cicer\Documents\Case Técnico Paipe\Análise Exploratória\.env")
print("Arquivo .env existe?", ENV_PATH.exists(), "\nCaminho:", ENV_PATH)

# 2) Carrega o .env
load_dotenv(dotenv_path=ENV_PATH, override=True)

# 3) Confere o que foi lido
cfg = {k: os.getenv(k) for k in ["PGHOST", "PGPORT", "PGDATABASE", "PGUSER"]}
print(cfg, "| PGPASSWORD set?", bool(os.getenv("PGPASSWORD")))


Arquivo .env existe? True 
Caminho: C:\Users\cicer\Documents\Case Técnico Paipe\Análise Exploratória\.env
{'PGHOST': 'localhost', 'PGPORT': '5432', 'PGDATABASE': 'PaipeTech', 'PGUSER': 'postgres'} | PGPASSWORD set? True


In [17]:
HOST = os.getenv("PGHOST", "localhost")
PORT = int(os.getenv("PGPORT") or 5432)
DB   = os.getenv("PGDATABASE")
USR  = os.getenv("PGUSER")
PWD  = os.getenv("PGPASSWORD")

url = URL.create(
    "postgresql+psycopg2",
    username=USR,
    password=PWD,   
    host=HOST,
    port=PORT,
    database=DB,
)

engine = create_engine(url, pool_pre_ping=True)

# teste rápido
with engine.begin() as conn:
    print("DB atual:", conn.execute(text("SELECT current_database()")).scalar())
    print("Versão:",  conn.execute(text("SELECT version()")).scalar().splitlines()[0])


DB atual: PaipeTech
Versão: PostgreSQL 18.0 on x86_64-windows, compiled by msvc-19.44.35215, 64-bit


In [20]:
df = pd.read_sql(text("SELECT * FROM public.df_train"), engine)

## Pré Processamento

### Transformando as variáveis necessárias em int

In [30]:
cols_int = [
    "mintimetoneareststation", "maxtimetoneareststation", "totalfloorarea",
    "buildingyear", "coverageratio", "floorarearatio", "buildingyear"
]
for c in cols_int:
    df[c] = (pd.to_numeric(df[c], errors="coerce")    
               .round()                              
               .astype("Int64")) 

### Dropando as 3 colunas com maior porcentagem de nulos, ela não agregam informação ao modelo

In [41]:
((df.isna().sum() / len(df)) * 100).sort_values(ascending=False).head(3) # Percentual de nulos em cada feature

remarks          92.191170
pricepertsubo    80.451024
unitprice        80.451024
dtype: float64

In [44]:
df.drop(columns=['unitprice', 'pricepertsubo', 'remarks'], inplace = True)

### Criando features

In [50]:
df["time_to_station_mean"] = df[["mintimetoneareststation","maxtimetoneareststation"]].mean(axis=1)

In [53]:
df['district_uid'] = df['municipalitycode'].astype(str) + '|' + df['districtname']

In [68]:
df["log_totalfloorarea"] = np.log1p(df["totalfloorarea"])
df["log_area"] = np.log1p(df["area"])
df["log_frontage"] = np.log1p(df["frontage"])
df["log_breadth"] = np.log1p(df["breadth"])

In [73]:
df.columns

Index(['id', 'type', 'region', 'municipalitycode', 'prefecture',
       'municipality', 'districtname', 'neareststation',
       'timetoneareststation', 'mintimetoneareststation',
       'maxtimetoneareststation', 'floorplan', 'area', 'areaisgreaterflag',
       'landshape', 'frontage', 'frontageisgreaterflag', 'totalfloorarea',
       'totalfloorareaisgreaterflag', 'buildingyear', 'prewarbuilding',
       'structure', 'use', 'purpose', 'direction', 'classification', 'breadth',
       'cityplanning', 'coverageratio', 'floorarearatio', 'period', 'year',
       'quarter', 'renovation', 'tradeprice', 'time_to_station_mean',
       'district_uid', 'log_totalfloorarea', 'log_area', 'log_frontage',
       'log_breadth'],
      dtype='object')

### Transformando 0 em Nan em variáveis numéricas, pois modelos baseados em árvores lidam melhor com Nan

In [81]:
df[["mintimetoneareststation", 'maxtimetoneareststation', 'time_to_station_mean']] = df[["mintimetoneareststation", 'maxtimetoneareststation', 'time_to_station_mean']].replace(0, np.nan)

### Tratando colunas categóricas para usar no modelo

In [101]:
cat_cols = [['type', 'region', 'municipality', 'districtname', 'floorplan', 'landshape',
       'structure', 'use', 'purpose', 'direction', 'classification',
       'cityplanning', 'renovation', 'district_uid']]

In [154]:
for c in cat_cols:
    df[c] = df[c].astype("category")

### Separando variáveis de treino e teste

**Separando de acordo com a linha do tempo das transações. 80% dos dados para treinar e os 20% mais recentes para testar!**

In [216]:
strata = df["Year"].astype(int)

# === 2) Encontrar o corte de 80% por tempo (agrupado por YQ)
cnt = df.groupby("YQ").size().sort_index()
cum = cnt.cumsum() / cnt.sum()   # proporção acumulada ao longo do tempo
cut_yq = cum.index[(cum <= 0.80)].max()  # último YQ ainda dentro dos 80%

X = df[df["YQ"] <= cut_yq]
y  = df[df["YQ"] >  cut_yq]

In [268]:
X_train = X[['type', 'region', 'municipalitycode','municipality', 'districtname', 'mintimetoneareststation',
       'maxtimetoneareststation', 'floorplan', 'area', 'landshape', 'frontage', 'totalfloorarea', 'buildingyear',
       'structure', 'use', 'purpose', 'direction', 'classification', 'breadth',  'cityplanning', 'coverageratio',
       'floorarearatio', 'year', 'quarter', 'renovation', 'time_to_station_mean', 'district_uid', 'log_totalfloorarea',
       'log_area', 'log_frontage', 'log_breadth']]

y_train = pd.DataFrame(X.tradeprice)

X_test = y[['type', 'region', 'municipalitycode','municipality', 'districtname', 'mintimetoneareststation',
       'maxtimetoneareststation', 'floorplan', 'area', 'landshape', 'frontage', 'totalfloorarea', 'buildingyear',
       'structure', 'use', 'purpose', 'direction', 'classification', 'breadth',  'cityplanning', 'coverageratio',
       'floorarearatio', 'year', 'quarter', 'renovation', 'time_to_station_mean', 'district_uid', 'log_totalfloorarea',
       'log_area', 'log_frontage', 'log_breadth']]

y_test = pd.DataFrame(y.tradeprice)

In [272]:
y_train["y_log"] = np.log1p(y_train["tradeprice"])
y_test["y_log"] = np.log1p(y_test["tradeprice"])

y_train.drop(columns='tradeprice', inplace = True)
y_test.drop(columns='tradeprice', inplace = True)

### Separando os folds para cross validation

**Usando o Kfold para validação cruzada, porém, cada fold terá uma faixa do tempo, para o modelo tentar captar estes movimentos**

In [278]:
yq_train = df["YQ"]
groups_time = yq_train.loc[X_train.index]   # garantir mesmo índice/tamanho

groups_id_train = df["district_uid"].loc[X_train.index]

# gerar índices de folds - lista de tuplas
folds = []
if groups_time is not None and groups_time.nunique() >= 3:
    gkf = GroupKFold(n_splits=min(5, groups_time.nunique()))
    for tr, va in gkf.split(X_train, y_train, groups=groups_time):
        folds.append((tr, va))


In [282]:
for i, (tr, va) in enumerate(folds):
    print(f"Fold {i+1}: treino = {len(tr):,}, validação = {len(va):,}")


Fold 1: treino = 205,522, validação = 54,123
Fold 2: treino = 208,309, validação = 51,336
Fold 3: treino = 208,271, validação = 51,374
Fold 4: treino = 208,252, validação = 51,393
Fold 5: treino = 208,226, validação = 51,419


### Utilizando o Optuna para a tunagem de hiper Parâmetros

In [292]:
# ================================
# CONFIGURAÇÕES BÁSICAS
# ================================
PRIMARY_METRIC = "rmsle"
IS_LOG_TARGET = True          # True se y_train for log(y)
N_TRIALS = 20                 # ~20 min, ajustável
SEED = 42

# ================================
# FUNÇÕES DE MÉTRICAS
# ================================
def rmsle(y_true, y_pred):
    y_true, y_pred = np.expm1(y_true), np.expm1(y_pred)
    return np.sqrt(mean_squared_log_error(y_true, np.maximum(y_pred, 0)))

def mape(y_true, y_pred):
    y_true, y_pred = np.expm1(y_true), np.expm1(y_pred)
    return mean_absolute_percentage_error(y_true, np.maximum(y_pred, 0)) * 100

# ================================
# FUNÇÃO OBJETIVO OPTUNA
# ================================
def objective(trial):
    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [1000, 2000, 3000]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.01, 0.05, 0.1]),
        "num_leaves": trial.suggest_categorical("num_leaves", [31, 63, 127]),
        "max_depth": trial.suggest_categorical("max_depth", [6, 10, -1]),
        "subsample": trial.suggest_categorical("subsample", [0.7, 0.9, 1.0]),
        "colsample_bytree": trial.suggest_categorical("colsample_bytree", [0.7, 0.9, 1.0]),
        "min_child_samples": trial.suggest_categorical("min_child_samples", [10, 50, 100]),
        "reg_alpha": trial.suggest_categorical("reg_alpha", [0.0, 0.1, 0.5]),
        "reg_lambda": trial.suggest_categorical("reg_lambda", [0.0, 0.1, 0.5]),
        "random_state": SEED,
        "n_jobs": -1,
    }

    rmsle_scores, mape_scores = [], []

    for fold, (tr, va) in enumerate(folds):
        X_tr, X_va = X_train.iloc[tr], X_train.iloc[va]
        y_tr, y_va = y_train.iloc[tr], y_train.iloc[va]

        model = LGBMRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            callbacks=[
                lgb.early_stopping(stopping_rounds=200, verbose=False),
                lgb.log_evaluation(period=0)
            ]
        )

        pred = model.predict(X_va, num_iteration=model.best_iteration_)
        rmsle_scores.append(rmsle(y_va, pred))
        mape_scores.append(mape(y_va, pred))

    trial.set_user_attr("mean_rmsle", np.mean(rmsle_scores))
    trial.set_user_attr("mean_mape", np.mean(mape_scores))

    return np.mean(rmsle_scores) if PRIMARY_METRIC == "rmsle" else np.mean(mape_scores)

# ================================
# EXECUÇÃO DO OPTUNA
# ================================
sampler = optuna.samplers.TPESampler(seed=SEED)
study_lgbm = optuna.create_study(direction="minimize", sampler=sampler)
study_lgbm.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

# ================================
# RESULTADOS
# ================================
print("✅ Melhor trial:")
print(study_lgbm.best_trial.number)
print("✅ RMSLE médio:", study_lgbm.best_trial.user_attrs["mean_rmsle"])
print("✅ MAPE médio:", study_lgbm.best_trial.user_attrs["mean_mape"])
print("✅ Parâmetros:")
for k, v in study_lgbm.best_params.items():
    print(f"  {k}: {v}")


[I 2025-10-05 22:51:23,087] A new study created in memory with name: no-name-2ec13110-c0ed-4707-b450-d97a5ee22521


  0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006281 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4584
[LightGBM] [Info] Number of data points in the train set: 205522, number of used features: 31
[LightGBM] [Info] Start training from score 17.353764
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4580
[LightGBM] [Info] Number of data points in the train set: 208309, number of used features: 31
[LightGBM] [Info] Start training from score 17.357131
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027749 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4581
[LightGBM] [Info] Number of data points in the train


KeyboardInterrupt

