In [1]:
import sys
sys.path.append("../src")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import randint

from features import parse_vol, compute_rsi

from model_utils import save_artifacts

In [2]:
df = pd.read_csv(
    "../data/dados_historicos_ibovespa230722230725.csv",
    parse_dates=["Date"],
    dayfirst=True,
    thousands=","
)

df.sort_values("Date", inplace=True)
df.set_index("Date", inplace=True)

for col in ["Price", "Open", "High", "Low"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df["Vol."] = df["Vol."].apply(parse_vol)
df["Change %"] = df["Change %"].str.replace("%", "").astype(float)


  df = pd.read_csv(


In [3]:
df["target"] = (df["Price"].shift(-1) > df["Price"]).astype(int)


In [4]:
df["return_1d"] = df["Price"].pct_change()
df["return_5d"] = df["Price"].pct_change(5)
df["return_10d"] = df["Price"].pct_change(10)
df["vol_chg_5d"] = df["Vol."].pct_change(5)
df["high_low_spread"] = (df["High"] - df["Low"]) / df["Low"]
df["ma_5"] = df["Price"].rolling(5).mean()
df["ma_10"] = df["Price"].rolling(10).mean()
df["ma_20"] = df["Price"].rolling(20).mean()
df["ma_diff_5_20"] = df["ma_5"] - df["ma_20"]
df["rsi_14"] = compute_rsi(df["Price"], 14)
df["volatility_20"] = df["Price"].pct_change().rolling(20).std()

df.fillna(0, inplace=True)


In [5]:
features = [
    "Open", "High", "Low", "Vol.", "Change %",
    "return_1d", "return_5d", "return_10d", "vol_chg_5d", "high_low_spread",
    "ma_5", "ma_10", "ma_20", "ma_diff_5_20",
    "rsi_14", "volatility_20"
]


In [6]:
X = df[features]
y = df["target"]

X_train, X_test = X.iloc[:-30], X.iloc[-30:]
y_train, y_test = y.iloc[:-30], y.iloc[-30:]

# (RandomizedSearch + XGB igual ao seu código)


In [8]:
# -------------------------
# 1. Configuração do Modelo e Busca de Hiperparâmetros
# -------------------------
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 15),
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': randint(1, 10)
}

# Validação cruzada para séries temporais
tscv = TimeSeriesSplit(n_splits=5)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

print("Iniciando busca de hiperparâmetros (RandomizedSearch)...")
search = RandomizedSearchCV(
    xgb, 
    param_distributions=param_dist, 
    n_iter=50, 
    cv=tscv, 
    scoring='accuracy', 
    random_state=42, 
    n_jobs=-1
)

search.fit(X_train, y_train)
best_model = search.best_estimator_
print("Melhores parâmetros encontrados:", search.best_params_)

# -------------------------
# 2. Otimização do Threshold de Decisão
# -------------------------
probs = best_model.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.50, 0.66, 0.01)
accuracies = []

for threshold in thresholds:
    y_temp = (probs >= threshold).astype(int)
    acc = accuracy_score(y_test, y_temp)
    accuracies.append(acc)

best_threshold = thresholds[np.argmax(accuracies)]
y_pred_final = (probs >= best_threshold).astype(int)

print(f"Melhor threshold: {best_threshold:.2f} com Acurácia: {max(accuracies)*100:.2f}%")

Iniciando busca de hiperparâmetros (RandomizedSearch)...


Parameters: { "use_label_encoder" } are not used.



Melhores parâmetros encontrados: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 11, 'min_child_weight': 3, 'n_estimators': 252, 'subsample': 0.6}
Melhor threshold: 0.59 com Acurácia: 80.00%


In [9]:
import joblib
import json

joblib.dump(best_model, "../modelo/xgb.joblib")
joblib.dump(best_threshold, "../modelo/threshold.joblib")
joblib.dump(features, "../modelo/features.joblib")

metrics = {
    "accuracy": accuracy_score(y_test, y_pred_final),
    "best_threshold": float(best_threshold),
    "classification_report": classification_report(y_test, y_pred_final, output_dict=True)
}

with open("../modelo/metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)


In [None]:
whos
