<a href="https://colab.research.google.com/github/dressasys/TechChallenge2_Grupo87_PosTech/blob/main/Modelagem_Series_Temporais_IBOV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setup (if needed): pip install yfinance scikit-learn pandas numpy matplotlib

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# --- 1. DATA ACQUISITION ---
# Download daily IBOVESPA (^BVSP) quotes starting at 2015-01-01
# (the previous comment said 2010, which did not match the call).
# `auto_adjust` is passed explicitly so the result does not depend on the
# library default: the captured output has no 'Adj Close' column, i.e. the
# adjusted prices are what the rest of the notebook has been using.
# NOTE(review): the stray source-line echo in the cell output looks like the
# yfinance warning about the changed `auto_adjust` default — confirm it is gone.
df = yf.download("^BVSP", start="2015-01-01", end="2025-10-31", auto_adjust=True)

  df = yf.download("^BVSP", start="2015-01-01", end="2025-10-31")
[*********************100%***********************]  1 of 1 completed


In [None]:
# Inspect index range, column names, non-null counts and dtypes of the download.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2689 entries, 2015-01-02 to 2025-10-30
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   (Close, ^BVSP)   2689 non-null   float64
 1   (High, ^BVSP)    2689 non-null   float64
 2   (Low, ^BVSP)     2689 non-null   float64
 3   (Open, ^BVSP)    2689 non-null   float64
 4   (Volume, ^BVSP)  2689 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 126.0 KB


In [None]:
# Flatten MultiIndex columns into plain names.
# yfinance may return the columns as tuples (e.g. ('Open', '^BVSP'));
# this step reduces them to 'Open', 'Close', etc.

if isinstance(df.columns, pd.MultiIndex):
    new_columns = [
        # Two-level tuple whose second level is the ticker: keep only the
        # first level (e.g. 'Open').
        levels[0]
        if len(levels) == 2 and levels[1] == '^BVSP'
        # Any other shape: join the non-empty levels with underscores.
        else '_'.join(str(part) for part in levels if part).strip()
        for levels in df.columns
    ]
    df.columns = new_columns

# Show the resulting column names as a sanity check.
print("Colunas renomeadas:", df.columns)

Colunas renomeadas: Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')


In [None]:
# Keep only the price/volume columns used downstream, in a fixed order.
# All five OHLCV columns are retained (not just Open and Close); `.copy()`
# detaches the result from the downloaded frame.
ohlcv_columns = ['Open', 'Close', 'High', 'Low', 'Volume']
df = df.loc[:, ohlcv_columns].copy()

In [None]:
# Preview the first rows after the column selection.
df.head()

Unnamed: 0_level_0,Open,Close,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-02,50005.0,48512.0,50005.0,48345.0,2882100
2015-01-05,48512.0,47517.0,48512.0,47264.0,3866100
2015-01-06,47517.0,48001.0,48061.0,47338.0,4559300
2015-01-07,48006.0,49463.0,49882.0,48006.0,4408800
2015-01-08,49463.0,49943.0,50261.0,49017.0,3621900


In [None]:
# --- 2. FEATURE ENGINEERING ---

# Feature 1: close-to-close return.
df['Retorno_Close'] = df['Close'].pct_change()

# Feature 2: opening gap (today's open / yesterday's close - 1).
previous_close = df['Close'].shift(1)
df['Gap_Abertura'] = (df['Open'] / previous_close) - 1

# Feature 3: intraday strength (today's close / today's open - 1),
# i.e. how far the market moved after the open.
df['Forca_Intradia'] = (df['Close'] / df['Open']) - 1

# Feature 4: daily range (high / low - 1).
# NOTE(review): this column is computed but never added to `features`,
# so the model does not see it — confirm whether that is intended.
df['Volatilidade'] = (df['High'] / df['Low']) - 1

# Lagged copies of the three return-style metrics for the last 5 sessions.
# Lag 0 is the current day, lag 1 the previous day, and so on; the current
# day's values are later used to predict the next day.
lag_sources = {
    'Retorno': 'Retorno_Close',
    'Gap': 'Gap_Abertura',
    'Forca': 'Forca_Intradia',
}
features = []
for lag in range(5):
    for prefix, source_column in lag_sources.items():
        lagged_name = f'{prefix}_Lag{lag}'
        df[lagged_name] = df[source_column].shift(lag)
        features.append(lagged_name)

# Drop the leading rows left incomplete by pct_change/shift.
df.dropna(inplace=True)

In [None]:
# Preview the frame with the engineered and lagged columns.
df.head()

Unnamed: 0_level_0,Open,Close,High,Low,Volume,Retorno_Close,Gap_Abertura,Forca_Intradia,Volatilidade,Retorno_Lag0,...,Forca_Lag1,Retorno_Lag2,Gap_Lag2,Forca_Lag2,Retorno_Lag3,Gap_Lag3,Forca_Lag3,Retorno_Lag4,Gap_Lag4,Forca_Lag4
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-09,49955.0,48840.0,49955.0,48501.0,2999200,-0.022085,0.00024,-0.02232,0.029979,-0.022085,...,0.009704,0.030458,0.000104,0.03035,0.010186,0.0,0.010186,-0.02051,0.0,-0.02051
2015-01-12,48840.0,48140.0,48840.0,47956.0,3246100,-0.014333,0.0,-0.014333,0.018434,-0.014333,...,-0.02232,0.009704,0.0,0.009704,0.030458,0.000104,0.03035,0.010186,0.0,0.010186
2015-01-13,48144.0,48042.0,48939.0,48042.0,3881600,-0.002036,8.3e-05,-0.002119,0.018671,-0.002036,...,-0.014333,-0.022085,0.00024,-0.02232,0.009704,0.0,0.009704,0.030458,0.000104,0.03035
2015-01-14,48038.0,47646.0,48281.0,47372.0,3697200,-0.008243,-8.3e-05,-0.00816,0.019189,-0.008243,...,-0.002119,-0.014333,0.0,-0.014333,-0.022085,0.00024,-0.02232,0.009704,0.0,0.009704
2015-01-15,47648.0,48026.0,48853.0,47648.0,4463600,0.007975,4.2e-05,0.007933,0.02529,0.007975,...,-0.00816,-0.002036,8.3e-05,-0.002119,-0.014333,0.0,-0.014333,-0.022085,0.00024,-0.02232


In [None]:
# --- 3. TARGET DEFINITION ---
# Binary label: 1 when the NEXT session's close is above the CURRENT close.
next_close = df['Close'].shift(-1)
df['Target_Valor'] = next_close
df['Target'] = df['Target_Valor'].gt(df['Close']).astype(int)

# The final row has no next session (Target_Valor is NaN), so it is dropped.
df.dropna(inplace=True)

In [None]:
# --- 4. MODEL PREPARATION (chronological hold-out) ---

X = df[features]
y = df['Target']

# Time-series rows must be split in chronological order.
# Shuffling mixes past and future sessions, and because every row carries the
# returns of the 5 most recent days, neighbouring rows share most of their
# feature values: a shuffled test set would be evaluated by a model that has
# already seen almost-identical rows (and their outcomes) from adjacent days.
# With shuffle=False the first 80% of sessions train the model and the most
# recent 20% are held out, which is what an out-of-sample forecast looks like.
# `random_state` is dropped because nothing is randomised in this split.
# For a more robust estimate consider sklearn.model_selection.TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [None]:

# --- 5. MODELING (Random Forest) ---
rf_params = {
    'n_estimators': 500,      # many trees, to stabilise the ensemble
    'min_samples_split': 10,  # discourages memorising isolated cases
    'random_state': 42,       # fixed seed for repeatable fits
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [None]:
# --- 6. EVALUATION ---
predicoes = model.predict(X_test)
acuracia = accuracy_score(y_test, predicoes)

# Headline accuracy framed by a separator line.
banner = "=" * 30
print(banner)
print(f"ACURÁCIA ATINGIDA: {acuracia:.2%}")
print(banner)

# Confusion matrix followed by the per-class precision/recall report.
for heading, report in (
    ("\nMatriz de Confusão:", confusion_matrix(y_test, predicoes)),
    ("\nRelatório de Classificação:", classification_report(y_test, predicoes)),
):
    print(heading)
    print(report)

# Extra: which inputs the forest weighted most, labelled by feature name.
importances = pd.Series(model.feature_importances_, index=features)
print("\nTop 5 Variáveis mais importantes:")
print(importances.nlargest(5))

ACURÁCIA ATINGIDA: 49.53%

Matriz de Confusão:
[[ 98 146]
 [125 168]]

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.44      0.40      0.42       244
           1       0.54      0.57      0.55       293

    accuracy                           0.50       537
   macro avg       0.49      0.49      0.49       537
weighted avg       0.49      0.50      0.49       537


Top 5 Variáveis mais importantes:
Gap_Lag3        0.071306
Retorno_Lag2    0.068271
Forca_Lag2      0.068237
Gap_Lag4        0.067576
Gap_Lag0        0.067504
dtype: float64
