In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('DadosHistoricos_Ibovespa.csv')

In [3]:
df.head()

Unnamed: 0,Data,Último,Abertura,Máxima,Mínima,Vol.,Var%
0,16.06.2025,139.256,137.212,139.988,137.212,"7,62M","1,49%"
1,13.06.2025,137.213,137.8,137.8,136.586,"8,63B","-0,43%"
2,12.06.2025,137.8,137.127,137.931,136.175,"7,12B","0,49%"
3,11.06.2025,137.128,136.443,137.531,135.628,"8,77B","0,51%"
4,10.06.2025,136.436,135.716,137.369,135.716,"8,19B","0,54%"


In [4]:
# Conversão object para data sem horario
df["Data"] = pd.to_datetime(df['Data'], format='%d.%m.%Y', dayfirst=True).dt.strftime('%Y-%m-%d')
df.sort_values(by='Data', ascending=True, inplace=True)

In [5]:
df.head()

Unnamed: 0,Data,Último,Abertura,Máxima,Mínima,Vol.,Var%
2481,2015-06-15,53.138,53.338,53.338,52.548,"2,69M","-0,39%"
2480,2015-06-16,53.702,53.144,53.969,53.107,"3,38M","1,06%"
2479,2015-06-17,53.249,53.698,53.755,52.965,"3,09M","-0,84%"
2478,2015-06-18,54.239,53.251,54.352,53.214,"2,75M","1,86%"
2477,2015-06-19,53.749,54.236,54.236,53.479,"2,95M","-0,90%"


In [9]:
def parse_volume(vol):
    try:
        vol = str(vol).replace('.', '').replace(',', '.').strip().upper()
        if 'K' in vol:
            return float(vol.replace('K', '')) * 1e3
        elif 'M' in vol:
            return float(vol.replace('M', '')) * 1e6
        elif 'B' in vol:
            return float(vol.replace('B', '')) * 1e9
        else:
            return float(vol)
    except:
        return None  # ou 0, se preferir não deixar NaN


In [12]:
df['Vol.'] = df['Vol.'].apply(lambda x: parse_volume(x) if pd.notnull(x) else 0)


In [17]:
df.head()

Unnamed: 0,Data,Último,Abertura,Máxima,Mínima,Vol.,Var%
2481,2015-06-15,53.138,53.338,53.338,52.548,2690000.0,"-0,39%"
2480,2015-06-16,53.702,53.144,53.969,53.107,3380000.0,"1,06%"
2479,2015-06-17,53.249,53.698,53.755,52.965,3090000.0,"-0,84%"
2478,2015-06-18,54.239,53.251,54.352,53.214,2750000.0,"1,86%"
2477,2015-06-19,53.749,54.236,54.236,53.479,2950000.0,"-0,90%"


In [19]:
# variação percentual diária do fechamento
# capturar o “ritmo” de alta ou baixa. Uma variação muito positiva ou negativa pode indicar tendência.

df['pct_change'] = df['Último'].pct_change() * 100

In [21]:
# indicadores técnicos: RSI (14 dias)
from ta.momentum import RSIIndicator

rsi = RSIIndicator(close=df['Último'], window=14)
df['rsi_14'] = rsi.rsi()


In [22]:
# MACD - diferenaça entre as medias moveis de curto e longo prazo, que mostra reversão ou continuação de tendência.
from ta.trend import MACD

macd = MACD(close=df['Último'])
df['macd'] = macd.macd()
df['macd_signal'] = macd.macd_signal()
df['macd_diff'] = macd.macd_diff()


In [23]:
# Lags do fechamento - capturar o histórico recente, como se você estivesse "olhando para trás" para tomar a decisão de amanhã.
df['lag_1'] = df['Último'].shift(1)
df['lag_2'] = df['Último'].shift(2)
df['lag_3'] = df['Último'].shift(3)


In [24]:
# variável target (alvo) - É o que vamos prever: se o IBOV vai subir no próximo dia.
df['target'] = (df['Último'].shift(-1) > df['Último']).astype(int)

In [25]:
# Limpando dados após algumas ccriações para eliminar os NaN que temos nas primeiras e ultimas linhas da base
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)


In [26]:
# separando treino e teste
# 1) Garantir que o DataFrame esteja ordenado por data
df = df.sort_values('Data').reset_index(drop=True)

# 2) Definir tamanho do teste (últimos 30 dias)
test_size = 30

# 3) Criar df_train e df_test
df_train = df.iloc[:-test_size].copy()
df_test  = df.iloc[-test_size:].copy()


In [27]:
# definindo x e y
feature_cols = [
    'pct_change', 'rsi_14', 'macd', 'macd_signal', 'macd_diff',
    'lag_1', 'lag_2', 'lag_3'
]

X_train = df_train[feature_cols]
y_train = df_train['target']

X_test  = df_test[feature_cols]
y_test  = df_test['target']


In [28]:
print(X_train.shape, X_test.shape)  
print(X_train.columns == X_test.columns)  # deve ser all True  


(2418, 8) (30, 8)
[ True  True  True  True  True  True  True  True]


In [29]:
# Treinando o Random Forest (baseline)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics   import accuracy_score, confusion_matrix, classification_report

# 1) Instanciar o modelo com parâmetros default
rf = RandomForestClassifier(random_state=42)

# 2) Treinar
rf.fit(X_train, y_train)

# 3) Prever no conjunto de teste
y_pred = rf.predict(X_test)


In [30]:
# Avaliação do desempenho
acc = accuracy_score(y_test, y_pred)
print(f"Acurácia baseline: {acc:.2%}\n")

print("Matriz de Confusão:")
print(confusion_matrix(y_test, y_pred), "\n")

print("Relatório de Classificação:")
print(classification_report(y_test, y_pred))


Acurácia baseline: 50.00%

Matriz de Confusão:
[[13  2]
 [13  2]] 

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.50      0.87      0.63        15
           1       0.50      0.13      0.21        15

    accuracy                           0.50        30
   macro avg       0.50      0.50      0.42        30
weighted avg       0.50      0.50      0.42        30



In [35]:
# calculando media moveis 
# Janela de 5, 10 e 20 dias sobre o fechamento
df['ma_5']  = df['Último'].rolling(window=5).mean()
df['ma_10'] = df['Último'].rolling(window=10).mean()
df['ma_20'] = df['Último'].rolling(window=20).mean()


In [36]:
#Distância do preço às médias
df['close_vs_ma5']  = df['Último'] - df['ma_5']
df['close_vs_ma20'] = df['Último'] - df['ma_20']


In [37]:
#Diferença entre médias (crossover)
df['diff_ma5_ma10']   = df['ma_5']  - df['ma_10']
df['diff_ma10_ma20']  = df['ma_10'] - df['ma_20']


In [38]:
# Cruzamento do MACD
df['macd_cross'] = df['macd'] - df['macd_signal']


In [39]:
# Limpando os NaN
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)


In [40]:
feature_cols = [
    'pct_change',    # variação percentual
    'rsi_14',        # RSI 14
    'macd',          # MACD
    'macd_signal',   # sinal do MACD
    'macd_diff',     # histograma MACD
    'lag_1',         # fechamento t-1
    'lag_2',         # fechamento t-2
    'lag_3',         # fechamento t-3
    'close_vs_ma5',
    'close_vs_ma20',
    'diff_ma5_ma10',
    'diff_ma10_ma20',
    'macd_cross'
]


In [41]:
# 5.1) Ordenar e dividir
df = df.sort_values('Data').reset_index(drop=True)
test_size = 30
df_train = df.iloc[:-test_size]
df_test  = df.iloc[-test_size:]

# 5.2) X e y
X_train = df_train[feature_cols];  y_train = df_train['target']
X_test  = df_test[feature_cols];   y_test  = df_test['target']

# 5.3) Treinar e prever
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics   import accuracy_score, confusion_matrix, classification_report

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# 5.4) Avaliar
print(f"Acurácia: {accuracy_score(y_test, y_pred):.2%}")
print("\nMatriz de Confusão:\n", confusion_matrix(y_test, y_pred))
print("\nRelatório de Classificação:\n", classification_report(y_test, y_pred))


Acurácia: 46.67%

Matriz de Confusão:
 [[11  4]
 [12  3]]

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.48      0.73      0.58        15
           1       0.43      0.20      0.27        15

    accuracy                           0.47        30
   macro avg       0.45      0.47      0.43        30
weighted avg       0.45      0.47      0.43        30



In [42]:
# já temos df ordenado e com pct_change
window = 5

for i in range(1, window+1):
    df[f"ret_{i}"] = df['pct_change'].shift(i)

# Target permanece:
df['target'] = (df['Último'].shift(-1) > df['Último']).astype(int)

# Remover NaNs
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)


In [43]:
feature_cols = [f"ret_{i}" for i in range(1, window+1)]


In [44]:
# 2.1) Separar treino/teste
test_size = 30
df_train = df.iloc[:-test_size]
df_test  = df.iloc[-test_size:]

X_train = df_train[feature_cols];  y_train = df_train['target']
X_test  = df_test[feature_cols];   y_test  = df_test['target']

# 2.2) Treinar Random Forest simples
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics   import accuracy_score

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Acurácia baseline (lags):", accuracy_score(y_test, y_pred))


Acurácia baseline (lags): 0.5666666666666667


In [45]:
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV

tscv = TimeSeriesSplit(n_splits=5)
param_dist = {
    'n_estimators':    [100,200,300],
    'max_depth':       [4,6,8,None],
    'min_samples_split':[2,4,6],
    'max_features':    ['sqrt','log2']
}

rs = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20, cv=tscv,
    scoring='accuracy', n_jobs=-1, random_state=42
)

rs.fit(X_train, y_train)
best_rf = rs.best_estimator_

y_pred = best_rf.predict(X_test)
from sklearn.metrics import classification_report
print(f"Acurácia tunada (lags): {accuracy_score(y_test, y_pred):.2%}")
print(classification_report(y_test, y_pred))
print("Parâmetros ótimos:", rs.best_params_)


Acurácia tunada (lags): 53.33%
              precision    recall  f1-score   support

           0       1.00      0.07      0.12        15
           1       0.52      1.00      0.68        15

    accuracy                           0.53        30
   macro avg       0.76      0.53      0.40        30
weighted avg       0.76      0.53      0.40        30

Parâmetros ótimos: {'n_estimators': 100, 'min_samples_split': 4, 'max_features': 'log2', 'max_depth': 4}


In [46]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1.1) Defina o mesmo X_train, y_train, X_test, y_test dos lags
# já está no seu notebook

# 1.2) Configure a validação temporal
tscv = TimeSeriesSplit(n_splits=5)

# 1.3) Espaço de busca para o XGBoost
param_dist_xgb = {
    'n_estimators':    [50,100,200],
    'max_depth':       [3, 5, 7, 9],
    'learning_rate':   [0.01, 0.05, 0.1],
    'subsample':       [0.6, 0.8, 1.0],
    'colsample_bytree':[0.6, 0.8, 1.0]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# 1.4) RandomizedSearchCV
rs_xgb = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist_xgb,
    n_iter=20,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

rs_xgb.fit(X_train, y_train)

# 1.5) Avaliar no conjunto de teste
best_xgb = rs_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

print(f"Acurácia XGBoost tunado: {accuracy_score(y_test, y_pred_xgb):.2%}\n")
print("Matriz de Confusão:\n", confusion_matrix(y_test, y_pred_xgb), "\n")
print("Relatório de Classificação:\n", classification_report(y_test, y_pred_xgb))
print("Melhores parâmetros XGBoost:", rs_xgb.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Acurácia XGBoost tunado: 60.00%

Matriz de Confusão:
 [[ 8  7]
 [ 5 10]] 

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.62      0.53      0.57        15
           1       0.59      0.67      0.62        15

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.60      0.60      0.60        30

Melhores parâmetros XGBoost: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1.0}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [49]:
from sklearn.model_selection import KFold
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# supondo que best_rf_params e best_xgb_params já existam
estimators = [
    ('rf',  RandomForestClassifier(**best_rf_params, random_state=42)),
    ('xgb', XGBClassifier(**best_xgb_params, use_label_encoder=False, eval_metric='logloss', random_state=42))
]

kf = KFold(n_splits=5, shuffle=False)

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=kf,          # agora um CV particionado
    n_jobs=-1
)

stack.fit(X_train, y_train)
y_pred_stack = stack.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(f"Acurácia Stacking: {accuracy_score(y_test, y_pred_stack):.2%}\n")
print("Matriz de Confusão:\n", confusion_matrix(y_test, y_pred_stack))
print("\nRelatório de Classificação:\n", classification_report(y_test, y_pred_stack))


Acurácia Stacking: 50.00%

Matriz de Confusão:
 [[ 1 14]
 [ 1 14]]

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.50      0.07      0.12        15
           1       0.50      0.93      0.65        15

    accuracy                           0.50        30
   macro avg       0.50      0.50      0.38        30
weighted avg       0.50      0.50      0.38        30



In [50]:
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    n_jobs=-1
)
# ou explicitamente: cv=5
# stack = StackingClassifier(..., cv=5, n_jobs=-1)


In [54]:
# 1) Converter Data para datetime (inferência automática)
df['Data'] = pd.to_datetime(df['Data'], dayfirst=True, errors='coerce')

# Verifique se não há conversões que falharam
print(df['Data'].isna().sum(), "linhas sem data válida")

# 2) Agora os atributos de calendário funcionarão
df['weekday'] = df['Data'].dt.weekday    # 0=segunda … 6=domingo
df['month']   = df['Data'].dt.month      # 1 a 12


1473 linhas sem data válida


In [55]:
# 3) Volume: média móvel e variação
df['vol_ma_5']       = df['Vol.'].rolling(window=5).mean()
df['vol_pct_change'] = df['Vol.'].pct_change() * 100

# 4) Bollinger Bands simples (20 dias)
df['bb_upper'] = df['ma_20'] + 2 * df['Último'].rolling(20).std()
df['bb_lower'] = df['ma_20'] - 2 * df['Último'].rolling(20).std()

# 5) Remover NaNs gerados e resetar índice
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)


In [56]:
# 1) Verificar tipos de dado
print(df.dtypes)

# 2) Montar a lista de features
feature_cols = [
    'pct_change',    # variação percentual diária
    'rsi_14',        # RSI 14
    'macd',          # MACD
    'macd_signal',   # sinal MACD
    'macd_diff',     # histograma MACD
    'lag_1',         # fechamento t-1
    'lag_2',         # fechamento t-2
    'lag_3',         # fechamento t-3
    'weekday',       # dia da semana
    'month',         # mês
    'vol_ma_5',      # média móvel volume 5 dias
    'vol_pct_change',# variação percentual do volume
    'bb_upper',      # Bollinger Upper
    'bb_lower'       # Bollinger Lower
]

# 3) Exibir a lista
print("\nFeatures usadas:\n", feature_cols)


Data              datetime64[ns]
Último                   float64
Abertura                 float64
Máxima                   float64
Mínima                   float64
Vol.                     float64
Var%                      object
pct_change               float64
rsi_14                   float64
macd                     float64
macd_signal              float64
macd_diff                float64
lag_1                    float64
lag_2                    float64
lag_3                    float64
target                     int32
ma_5                     float64
ma_10                    float64
ma_20                    float64
close_vs_ma5             float64
close_vs_ma20            float64
diff_ma5_ma10            float64
diff_ma10_ma20           float64
macd_cross               float64
ret_1                    float64
ret_2                    float64
ret_3                    float64
ret_4                    float64
ret_5                    float64
weekday                  float64
month     

In [58]:
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics       import accuracy_score, confusion_matrix, classification_report

# 1) Separar treino/teste
test_size = 30
df_train = df.iloc[:-test_size].copy()
df_test  = df.iloc[-test_size:].copy()

X_train, y_train = df_train[feature_cols], df_train['target']
X_test,  y_test  = df_test[feature_cols],  df_test['target']

# 2) Configurar TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# 3) Definir espaço de busca LightGBM
param_dist_lgb = {
    'n_estimators':     [50, 100, 200, 300],
    'max_depth':        [3, 5, 7, 9, -1],
    'learning_rate':    [0.01, 0.05, 0.1, 0.2],
    'num_leaves':       [15, 31, 63],
    'subsample':        [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

lgb_base = lgb.LGBMClassifier(random_state=42)

# 4) RandomizedSearchCV
rs_lgb = RandomizedSearchCV(
    estimator=lgb_base,
    param_distributions=param_dist_lgb,
    n_iter=30,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# 5) Treinar a busca
rs_lgb.fit(X_train, y_train)

# 6) Avaliar no teste
best_lgb = rs_lgb.best_estimator_
y_pred_lgb = best_lgb.predict(X_test)

print(f"Acurácia LightGBM tunado: {accuracy_score(y_test, y_pred_lgb):.2%}\n")
print("Matriz de Confusão:\n", confusion_matrix(y_test, y_pred_lgb), "\n")
print("Relatório de Classificação:\n", classification_report(y_test, y_pred_lgb))
print("Melhores parâmetros LightGBM:", rs_lgb.best_params_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 489, number of negative: 426
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3080
[LightGBM] [Info] Number of data points in the train set: 915, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.534426 -> initscore=0.137923
[LightGBM] [Info] Start training from score 0.137923
Acurácia LightGBM tunado: 66.67%

Matriz de Confusão:
 [[ 8  6]
 [ 4 12]] 

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.67      0.57      0.62        14
           1       0.67      0.75      0.71        16

    accuracy                           0.67        30
   macro avg       0.67      0.66      0.66        30
weighted avg       0.67      0.67      0.66        30

Melhores parâmetros 

In [59]:
from sklearn.ensemble import VotingClassifier

# Supondo que você já tenha best_lgb e best_xgb do RandomizedSearchCV:
voting = VotingClassifier(
    estimators=[
        ('lgb', best_lgb),
        ('xgb', best_xgb)
    ],
    voting='soft',  # usa probabilidades
    n_jobs=-1
)

# Treine e avalie
voting.fit(X_train, y_train)
y_pred_vote = voting.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(f"Acurácia Ensemble: {accuracy_score(y_test, y_pred_vote):.2%}\n")
print("Matriz de Confusão:\n", confusion_matrix(y_test, y_pred_vote), "\n")
print("Relatório de Classificação:\n", classification_report(y_test, y_pred_vote))


Acurácia Ensemble: 50.00%

Matriz de Confusão:
 [[ 5  9]
 [ 6 10]] 

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.45      0.36      0.40        14
           1       0.53      0.62      0.57        16

    accuracy                           0.50        30
   macro avg       0.49      0.49      0.49        30
weighted avg       0.49      0.50      0.49        30



In [60]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

mlp = make_pipeline(
    StandardScaler(),                # normaliza as features
    MLPClassifier(
        hidden_layer_sizes=(64,32),  # duas camadas ocultas
        activation='relu',
        alpha=1e-4,                  # regularização L2
        max_iter=500,
        random_state=42
    )
)

# Treine e avalie
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

print(f"Acurácia MLP: {accuracy_score(y_test, y_pred_mlp):.2%}\n")
print("Relatório de Classificação:\n", classification_report(y_test, y_pred_mlp))



Acurácia MLP: 53.33%

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.50      0.57      0.53        14
           1       0.57      0.50      0.53        16

    accuracy                           0.53        30
   macro avg       0.54      0.54      0.53        30
weighted avg       0.54      0.53      0.53        30





In [61]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# 1.1 Defina o tamanho da janela (quantos dias de histórico usar)
window_size = 10

# 1.2 Selecione as colunas que vai usar (exemplo: pct_change + lags + indicadores)
seq_features = feature_cols  # as 14 features que você já montou

# 1.3 Escalar os dados entre 0 e 1 (importante para redes neurais)
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[seq_features]),
                         columns=seq_features)

# 1.4 Criar X, y como sequências
X_seq, y_seq = [], []
for i in range(window_size, len(df_scaled)-1):
    X_seq.append(df_scaled.iloc[i-window_size:i].values)
    y_seq.append(df['target'].iloc[i])  # target do dia i

X_seq = np.array(X_seq)   # shape = (samples, window_size, n_features)
y_seq = np.array(y_seq)


In [62]:
# Vamos usar os últimos 30 exemplos como teste
test_samples = 30
X_train_seq, X_test_seq = X_seq[:-test_samples], X_seq[-test_samples:]
y_train_seq, y_test_seq = y_seq[:-test_samples], y_seq[-test_samples:]

print(X_train_seq.shape, X_test_seq.shape)  # conferir formatos


(904, 10, 14) (30, 10, 14)


In [64]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

n_features = X_train_seq.shape[2]

model = Sequential([
    LSTM(64, input_shape=(window_size, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

es = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_seq,
    epochs=50,
    batch_size=16,
    validation_split=0.1,
    callbacks=[es],
    verbose=2
)


Epoch 1/50


  super().__init__(**kwargs)


51/51 - 3s - 63ms/step - accuracy: 0.5043 - loss: 0.6959 - val_accuracy: 0.5824 - val_loss: 0.6825
Epoch 2/50
51/51 - 0s - 7ms/step - accuracy: 0.5203 - loss: 0.6940 - val_accuracy: 0.5824 - val_loss: 0.6825
Epoch 3/50
51/51 - 0s - 7ms/step - accuracy: 0.5117 - loss: 0.6945 - val_accuracy: 0.5824 - val_loss: 0.6898
Epoch 4/50
51/51 - 0s - 7ms/step - accuracy: 0.5092 - loss: 0.6945 - val_accuracy: 0.5824 - val_loss: 0.6863
Epoch 5/50
51/51 - 0s - 7ms/step - accuracy: 0.4822 - loss: 0.6959 - val_accuracy: 0.5824 - val_loss: 0.6851
Epoch 6/50
51/51 - 0s - 7ms/step - accuracy: 0.5301 - loss: 0.6932 - val_accuracy: 0.5824 - val_loss: 0.6902


In [65]:
loss, acc = model.evaluate(X_test_seq, y_test_seq, verbose=0)
print(f"Acurácia LSTM: {acc:.2%}")

# Se quiser matriz de confusão:
y_pred_seq = (model.predict(X_test_seq) > 0.5).astype(int).flatten()
from sklearn.metrics import confusion_matrix, classification_report
print("Matriz de Confusão:\n", confusion_matrix(y_test_seq, y_pred_seq))
print("\nRelatório de Classificação:\n", classification_report(y_test_seq, y_pred_seq))


Acurácia LSTM: 56.67%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
Matriz de Confusão:
 [[ 0 13]
 [ 0 17]]

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.57      1.00      0.72        17

    accuracy                           0.57        30
   macro avg       0.28      0.50      0.36        30
weighted avg       0.32      0.57      0.41        30



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [67]:
import kerastuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

def build_lstm_model(hp):
    window = hp.Int('window_size', min_value=5, max_value=20, step=5)
    n_layers = hp.Int('n_layers', 1, 3)
    lr = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    dropout_rate = hp.Float('dropout', 0.1, 0.5, step=0.1)
    
    model = Sequential()
    for i in range(n_layers):
        return_seq = (i < n_layers - 1)
        units = hp.Int(f'units_{i}', min_value=16, max_value=128, step=16)
        if i == 0:
            model.add(LSTM(units, 
                           input_shape=(window, len(feature_cols)), 
                           return_sequences=return_seq))
        else:
            model.add(LSTM(units, return_sequences=return_seq))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model


In [68]:
def make_sequences(df, feature_cols, window):
    data = df[feature_cols].values
    targets = df['target'].values
    X, y = [], []
    for i in range(window, len(data)-1):
        X.append(data[i-window:i])
        y.append(targets[i])
    return np.array(X), np.array(y)


In [70]:
from kerastuner.tuners import RandomSearch

tuner = kt.RandomSearch(
    build_lstm_model,
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=1,
    directory='lstm_tuning',
    project_name='ibovespa'
)


In [75]:
# 1.1 – Depois de criar dollar['pct_change_usd'], extraia só a Series:
pct_usd = dollar['pct_change_usd']  # isso garante que seja um objeto de nível único

# 1.2 – Mapeie pela data
df['pct_change_usd'] = df['Data'].map(pct_usd)

# Pronto: toda vez que df['Data'] coincidir com um índice de pct_usd, ele traz o valor,
# senão fica NaN.


In [79]:
import yfinance as yf

# Baixar série do Dólar
dollar = yf.download("USDBRL=X", start="2018-01-01", end="2025-06-17", interval="1d")
dollar['pct_change_usd'] = dollar['Close'].pct_change() * 100

# Merge no seu df principal via Data
df = df.merge(dollar[['pct_change_usd']], left_on='Data', right_index=True, how='left')


  dollar = yf.download("USDBRL=X", start="2018-01-01", end="2025-06-17", interval="1d")
[*********************100%***********************]  1 of 1 completed


MergeError: Not allowed to merge between different levels. (1 levels on the left, 2 on the right)