<a href="https://colab.research.google.com/github/gabrxelle/FIAP-Tech-Challenge---Modelo-Preditivo-IBOV/blob/main/Tech_Challenge_Ibovespa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importando bibliotecas


In [1]:
import pandas as pd # Manipulação e análise de dados.
import numpy as np  # Operações matemáticas e vetoriais
import matplotlib.pyplot as plt # Criação de gráficos e visualizações estáticas.
import seaborn as sns # Visualização de dados estatísticos.

# Importações para Testes Estatísticos
from scipy.stats import shapiro # Teste de normalidade.
from scipy.stats import mannwhitneyu # Teste não-paramétrico.

# Importações para Avaliação de Performance do Modelo
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
# accuracy_score: Porcentagem de acertos totais
# classification_report: Métricas detalhadas (Precision, Recall, F1-Score)
# confusion_matrix: Tabela para ver erros de Falso Positivo e Falso Negativo
# roc_curve/auc: Mede a capacidade do modelo de distinguir entre as classes (Sobe/Desce)

# Algoritmos de Machine Learning
from sklearn.naive_bayes import GaussianNB # Modelo baseado em probabilidades.
from sklearn.ensemble import RandomForestClassifier # Conjunto de árvores de decisão.
from sklearn.neighbors import KNeighborsClassifier # Classificação baseada em proximidade/vizinhos (KNN).

# Ferramentas para Visualização de Árvores e Importância de Variáveis
from sklearn.tree import plot_tree # Desenha a estrutura da árvore de decisão
from sklearn import tree # Ferramentas genéricas para árvores
from sklearn.inspection import permutation_importance # Calcula quais colunas (features) são mais importantes para o modelo

# Primeira análise


In [2]:
df = pd.read_csv("Dados Históricos - Ibovespa (1).csv")

In [3]:
df.head(10)

Unnamed: 0,Data,Último,Abertura,Máxima,Mínima,Vol.,Var%
0,05.04.2016,49.054,48.778,49.629,48.149,"3,66M","0,56%"
1,04.04.2016,48.78,50.556,50.556,48.6,"3,67M","-3,52%"
2,01.04.2016,50.562,50.054,50.768,49.361,"4,01M","1,01%"
3,31.03.2016,50.055,51.248,51.248,49.642,"4,38M","-2,33%"
4,30.03.2016,51.249,51.155,52.262,50.9,"4,88M","0,18%"
5,29.03.2016,51.155,50.839,51.765,50.387,"4,45M","0,62%"
6,28.03.2016,50.838,49.687,51.149,49.687,"3,62M","2,38%"
7,24.03.2016,49.657,49.686,49.686,48.778,"3,80M","-0,07%"
8,23.03.2016,49.69,51.005,51.005,49.491,"3,75M","-2,59%"
9,22.03.2016,51.01,51.17,51.215,50.812,"4,02M","-0,32%"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Data      5000 non-null   object 
 1   Último    5000 non-null   float64
 2   Abertura  5000 non-null   float64
 3   Máxima    5000 non-null   float64
 4   Mínima    5000 non-null   float64
 5   Vol.      4999 non-null   object 
 6   Var%      5000 non-null   object 
dtypes: float64(4), object(3)
memory usage: 273.6+ KB


In [5]:
df.isnull().sum()

Unnamed: 0,0
Data,0
Último,0
Abertura,0
Máxima,0
Mínima,0
Vol.,1
Var%,0


In [6]:
df = df[~df['Vol.'].isnull()]

# Convertendo valores de Volume

In [7]:
set(df['Vol.'].str[-1])

{'B', 'K', 'M'}

In [8]:
def converter_volume(vol_str):
    vol_str = str(vol_str).strip().upper()
    if 'M' in vol_str:
        return float(vol_str.replace('M', '').replace(',', '.')) * 1_000_000
    elif 'K' in vol_str:
        return float(vol_str.replace('K', '').replace(',', '.')) * 1_000
    elif 'B' in vol_str:
        return float(vol_str.replace('B', '').replace(',', '.')) * 1_000_000_000
    return float(vol_str)

for col in ['Último', 'Abertura', 'Máxima', 'Mínima']:
    df[col] = df[col].astype(str).str.replace('.', '').str.replace(',', '.').astype(float)

if 'Vol.' in df.columns:
    df['Volume'] = df['Vol.'].apply(converter_volume)
else:
    print("Aviso: Erro. Features não criadas.")
    df['Volume'] = 0

# Criando coluna DATA

In [9]:
df['Data'] = pd.to_datetime(df['Data'], dayfirst=True)

df['MesAno'] = pd.to_datetime(df['Data'].dt.strftime('%Y-%m'))

df['Ano'] = df['Data'].dt.year

df.set_index('Data', inplace=True)
df.sort_index(inplace=True)

# Definindo o Target

In [10]:
df['Fechamento_Amanhã'] = df['Último'].shift(-1)
df['Previsão'] = (df['Fechamento_Amanhã'] > df['Último']).astype(int)
df.head()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Vol.,Var%,Volume,MesAno,Ano,Fechamento_Amanhã,Previsão
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1996-01-22,4894.0,4909.0,4922.0,4838.0,"9,47B","-0,31%",9470000000.0,1996-01-01,1996,4985.0,1
1996-01-23,4985.0,4897.0,4985.0,4897.0,"18,90B","1,86%",18900000000.0,1996-01-01,1996,4945.0,0
1996-01-24,4945.0,4986.0,502.0,492.0,"13,36B","-0,80%",13360000000.0,1996-01-01,1996,4908.0,0
1996-01-26,4908.0,4945.0,4945.0,4832.0,"14,25B","-0,75%",14250000000.0,1996-01-01,1996,4935.0,1
1996-01-29,4935.0,4908.0,4948.0,4875.0,"10,49B","0,55%",10490000000.0,1996-01-01,1996,5133.0,1


In [11]:
df['Var%'] = df['Var%'].str.replace('%', '').str.replace(',', '.').astype(float) / 100

In [12]:
df.describe()

Unnamed: 0,Último,Abertura,Máxima,Mínima,Var%,Volume,MesAno,Ano,Fechamento_Amanhã,Previsão
count,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999,4999.0,4998.0,4999.0
mean,31427.021004,31160.185237,31740.543109,30997.231446,0.000678,1574635000.0,2006-02-08 20:44:24.532906624,2005.652531,31432.329732,0.519904
min,80.0,78.0,54.0,54.0,-0.1581,112100.0,1996-01-01 00:00:00,1996.0,80.0,0.0
25%,10815.0,10803.0,10971.0,10714.0,-0.0101,3050000.0,2001-02-01 00:00:00,2001.0,10819.75,0.0
50%,24829.0,24376.0,25001.0,24328.0,0.0009,112580000.0,2006-02-01 00:00:00,2006.0,24836.5,1.0
75%,53721.0,53354.5,54120.0,52943.5,0.0118,326200000.0,2011-03-01 00:00:00,2011.0,53725.0,1.0
max,73517.0,73508.0,73794.0,72534.0,0.3342,54060000000.0,2016-04-01 00:00:00,2016.0,73517.0,1.0
std,22597.327369,22424.764167,22731.650207,22262.841225,0.020896,3883686000.0,,5.831423,22596.47053,0.499654


In [13]:
df_agrupado_mes = df.groupby('MesAno')[['Último','Abertura','Máxima','Mínima']].mean()
df_agrupado_mes.head()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima
MesAno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996-01-01,4993.142857,4959.571429,4374.571429,4279.428571
1996-02-01,5271.555556,5282.333333,4529.0,4442.777778
1996-03-01,4286.857143,4287.0,4349.857143,4023.095238
1996-04-01,4568.55,4558.1,4592.45,4746.1
1996-05-01,4811.727273,4786.363636,4825.136364,4753.590909


In [14]:
df.groupby('Ano').max()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Vol.,Var%,Volume,MesAno,Fechamento_Amanhã,Previsão
Ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1996,7007.0,7007.0,7052.0,7007.0,"9,97B",0.0401,54060000000.0,1996-12-01,7007.0,1
1997,13617.0,13617.0,14005.0,13567.0,"9,97B",0.097,33980000000.0,1997-12-01,13617.0,1
1998,12299.0,12299.0,12339.0,12121.0,"9,91B",0.1868,26550000000.0,1998-12-01,12299.0,1
1999,17092.0,16778.0,17105.0,16778.0,"991,45M",0.3342,991450000.0,1999-12-01,17092.0,1
2000,18951.0,18952.0,19047.0,18533.0,"988,22M",0.05,988220000.0,2000-12-01,18951.0,1
2001,17889.0,17906.0,18023.0,17704.0,"989,42M",0.0761,989420000.0,2001-12-01,17889.0,1
2002,14471.0,14455.0,14495.0,14273.0,"634,08M",0.0634,634080000.0,2002-12-01,14471.0,1
2003,22236.0,22051.0,22046.0,22051.0,"989,48M",0.0362,989480000.0,2003-12-01,22445.0,1
2004,26196.0,26171.0,26245.0,26108.0,"994,96M",0.053,994960000.0,2004-12-01,26196.0,1
2005,33629.0,33625.0,33837.0,33463.0,"99,53M",0.0455,302530000.0,2005-12-01,33629.0,1


In [15]:
df.groupby('Ano').min()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Vol.,Var%,Volume,MesAno,Fechamento_Amanhã,Previsão
Ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1996,489.0,489.0,55.0,54.0,"10,03B",-0.0464,2320000000.0,1996-01-01,489.0,0
1997,87.0,87.0,100.0,98.0,"10,02B",-0.1497,2130000000.0,1997-01-01,87.0,0
1998,80.0,78.0,54.0,84.0,"10,01B",-0.1581,2720000000.0,1998-01-01,80.0,0
1999,117.0,92.0,71.0,108.0,"101,79M",-0.0997,101790000.0,1999-01-01,117.0,0
2000,145.0,161.0,147.0,149.0,"100,65M",-0.0637,100650000.0,2000-01-01,145.0,0
2001,117.0,130.0,156.0,148.0,"107,29M",-0.0918,107290000.0,2001-01-01,117.0,0
2002,126.0,92.0,101.0,141.0,"108,39M",-0.0653,108390000.0,2002-01-01,126.0,0
2003,116.0,135.0,108.0,116.0,"113,34M",-0.0387,113340000.0,2003-01-01,116.0,0
2004,194.0,226.0,218.0,223.0,"101,26M",-0.0615,101260000.0,2004-01-01,194.0,0
2005,259.0,250.0,282.0,2368.0,"100,42M",-0.0415,22000000.0,2005-01-01,259.0,0


In [16]:
df.groupby('Ano').mean(numeric_only=True)

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Var%,Volume,Fechamento_Amanhã,Previsão
Ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1996,5528.162393,5546.269231,5540.833333,5375.192308,0.001639,11455260000.0,5536.974359,0.534188
1997,9618.514056,9591.02008,9271.614458,9116.795181,0.001914,9703976000.0,9594.787149,0.60241
1998,8172.727642,8292.813008,8532.947154,8553.48374,-0.000995,8921748000.0,8196.686992,0.50813
1999,10140.50813,10167.597561,9811.060976,9811.902439,0.004207,539297800.0,10119.170732,0.528455
2000,14902.758065,15042.467742,15063.040323,14410.822581,-0.000241,348486400.0,14958.129032,0.483871
2001,13112.569106,13190.670732,13117.890244,12622.922764,-0.000246,236950300.0,13106.256098,0.49187
2002,10067.493976,10379.080321,10471.248996,10520.433735,-0.000539,269794100.0,10058.381526,0.477912
2003,12411.348,12773.232,13033.012,12951.712,0.002834,318145600.0,12454.716,0.564
2004,19486.506024,19766.746988,19895.297189,20604.172691,0.000818,292803500.0,19499.666667,0.554217
2005,24702.080321,25100.305221,25422.895582,24542.285141,0.001107,116032000.0,24733.345382,0.518072


In [17]:
def remover_extremos_exato(grupo):
    grupo = grupo.drop(grupo['Máxima'].idxmax())
    grupo = grupo.drop(grupo['Mínima'].idxmin())
    return grupo

df_sem_extremos = df.groupby('Ano', group_keys=False).apply(remover_extremos_exato)

median_sem_extremos = df_sem_extremos.groupby('Ano').median(numeric_only=True)
mean_sem_extremos = df_sem_extremos.groupby('Ano').mean(numeric_only=True)

median_sem_extremos

  df_sem_extremos = df.groupby('Ano', group_keys=False).apply(remover_extremos_exato)


Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Var%,Volume,Fechamento_Amanhã,Previsão
Ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1996,6109.5,6089.0,6198.0,6041.0,0.0012,10270000000.0,6109.5,1.0
1997,9858.0,9858.0,9838.0,9633.0,0.0053,9110000000.0,9858.0,1.0
1998,9367.0,9426.0,9584.5,9432.5,-0.0001,8430000000.0,9367.0,1.0
1999,11053.0,11074.5,11155.5,11005.0,0.00255,552335000.0,11069.5,1.0
2000,16321.0,16356.0,16437.5,16025.0,-0.0016,194765000.0,16321.0,0.0
2001,13900.0,13975.5,14077.0,13734.5,-0.00075,216400000.0,13882.5,0.0
2002,10583.0,10663.0,10735.0,10569.0,-0.0008,255780000.0,10583.0,0.0
2003,13110.0,13146.0,13333.0,13120.0,0.0032,299390000.0,13121.5,1.0
2004,22171.0,22142.0,22348.0,21942.0,0.0014,269590000.0,22178.0,1.0
2005,26257.0,26385.0,26705.0,26129.0,0.0014,111850000.0,26298.0,1.0


In [18]:
df['Fechamento_Amanhã'] = df['Último'].shift(-1)
df['Class'] = (df['Fechamento_Amanhã'] > df['Último']).astype(int)

In [19]:
correlation_matrix = df[['Último', 'Abertura','Máxima','Mínima','Var%','Volume']].corr(numeric_only=True)

# FEATURE ENGINING

In [20]:
df['Variação_Dia'] = df['Último'] - df['Abertura']
df.head()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Vol.,Var%,Volume,MesAno,Ano,Fechamento_Amanhã,Previsão,Class,Variação_Dia
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1996-01-22,4894.0,4909.0,4922.0,4838.0,"9,47B",-0.0031,9470000000.0,1996-01-01,1996,4985.0,1,1,-15.0
1996-01-23,4985.0,4897.0,4985.0,4897.0,"18,90B",0.0186,18900000000.0,1996-01-01,1996,4945.0,0,0,88.0
1996-01-24,4945.0,4986.0,502.0,492.0,"13,36B",-0.008,13360000000.0,1996-01-01,1996,4908.0,0,0,-41.0
1996-01-26,4908.0,4945.0,4945.0,4832.0,"14,25B",-0.0075,14250000000.0,1996-01-01,1996,4935.0,1,1,-37.0
1996-01-29,4935.0,4908.0,4948.0,4875.0,"10,49B",0.0055,10490000000.0,1996-01-01,1996,5133.0,1,1,27.0


In [21]:
df['Fechamento_Anterior'] = df['Último'].shift(1)
df.head()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Vol.,Var%,Volume,MesAno,Ano,Fechamento_Amanhã,Previsão,Class,Variação_Dia,Fechamento_Anterior
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1996-01-22,4894.0,4909.0,4922.0,4838.0,"9,47B",-0.0031,9470000000.0,1996-01-01,1996,4985.0,1,1,-15.0,
1996-01-23,4985.0,4897.0,4985.0,4897.0,"18,90B",0.0186,18900000000.0,1996-01-01,1996,4945.0,0,0,88.0,4894.0
1996-01-24,4945.0,4986.0,502.0,492.0,"13,36B",-0.008,13360000000.0,1996-01-01,1996,4908.0,0,0,-41.0,4985.0
1996-01-26,4908.0,4945.0,4945.0,4832.0,"14,25B",-0.0075,14250000000.0,1996-01-01,1996,4935.0,1,1,-37.0,4945.0
1996-01-29,4935.0,4908.0,4948.0,4875.0,"10,49B",0.0055,10490000000.0,1996-01-01,1996,5133.0,1,1,27.0,4908.0


In [22]:
df['Média_5_Dias'] = df['Último'].rolling(5).mean()
df.head()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Vol.,Var%,Volume,MesAno,Ano,Fechamento_Amanhã,Previsão,Class,Variação_Dia,Fechamento_Anterior,Média_5_Dias
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1996-01-22,4894.0,4909.0,4922.0,4838.0,"9,47B",-0.0031,9470000000.0,1996-01-01,1996,4985.0,1,1,-15.0,,
1996-01-23,4985.0,4897.0,4985.0,4897.0,"18,90B",0.0186,18900000000.0,1996-01-01,1996,4945.0,0,0,88.0,4894.0,
1996-01-24,4945.0,4986.0,502.0,492.0,"13,36B",-0.008,13360000000.0,1996-01-01,1996,4908.0,0,0,-41.0,4985.0,
1996-01-26,4908.0,4945.0,4945.0,4832.0,"14,25B",-0.0075,14250000000.0,1996-01-01,1996,4935.0,1,1,-37.0,4945.0,
1996-01-29,4935.0,4908.0,4948.0,4875.0,"10,49B",0.0055,10490000000.0,1996-01-01,1996,5133.0,1,1,27.0,4908.0,4933.4


In [23]:
df['Retorno_Pct'] = df['Último'].pct_change().shift(-1)
df['Previsão'] = (df['Retorno_Pct'] > 0.005).astype(int)

In [24]:
df = df.dropna()
df.head()

Unnamed: 0_level_0,Último,Abertura,Máxima,Mínima,Vol.,Var%,Volume,MesAno,Ano,Fechamento_Amanhã,Previsão,Class,Variação_Dia,Fechamento_Anterior,Média_5_Dias,Retorno_Pct
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1996-01-29,4935.0,4908.0,4948.0,4875.0,"10,49B",0.0055,10490000000.0,1996-01-01,1996,5133.0,1,1,27.0,4908.0,4933.4,0.040122
1996-01-30,5133.0,4938.0,5148.0,4938.0,"22,47B",0.0401,22470000000.0,1996-01-01,1996,5152.0,0,1,195.0,4935.0,4981.2,0.003702
1996-01-31,5152.0,5134.0,5172.0,5084.0,"15,13B",0.0037,15130000000.0,1996-01-01,1996,5269.0,1,1,18.0,5133.0,5014.6,0.02271
1996-02-01,5269.0,5152.0,5292.0,5098.0,"21,30B",0.0227,21300000000.0,1996-02-01,1996,5411.0,1,1,117.0,5152.0,5079.4,0.02695
1996-02-02,5411.0,5269.0,5411.0,5257.0,"26,30B",0.027,26300000000.0,1996-02-01,1996,5333.0,0,0,142.0,5269.0,5180.0,-0.014415


In [25]:
df = df[['Último', 'Abertura', 'Variação_Dia', 'Fechamento_Anterior', 'Média_5_Dias', 'Retorno_Pct', 'Fechamento_Amanhã', 'Previsão']]
df.head()

Unnamed: 0_level_0,Último,Abertura,Variação_Dia,Fechamento_Anterior,Média_5_Dias,Retorno_Pct,Fechamento_Amanhã,Previsão
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1996-01-29,4935.0,4908.0,27.0,4908.0,4933.4,0.040122,5133.0,1
1996-01-30,5133.0,4938.0,195.0,4935.0,4981.2,0.003702,5152.0,0
1996-01-31,5152.0,5134.0,18.0,5133.0,5014.6,0.02271,5269.0,1
1996-02-01,5269.0,5152.0,117.0,5152.0,5079.4,0.02695,5411.0,1
1996-02-02,5411.0,5269.0,142.0,5269.0,5180.0,-0.014415,5333.0,0


# MODELOS DE CLASSIFICAÇÃO

## RANDOM FOREST CLASSIFIER

In [26]:
x = df[['Variação_Dia', 'Fechamento_Anterior', 'Média_5_Dias']]
y = df['Previsão']

In [27]:
x_treino = x[:-30]
y_treino = y[:-30]

In [28]:
x_teste = x[-30:]
y_teste = y[-30:]

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

modelo = RandomForestClassifier(class_weight='balanced', random_state=30)
modelo.fit(x_treino, y_treino)

previsoes = modelo.predict(x_teste)

In [30]:
acuracia = accuracy_score(y_teste, previsoes)
print(f"Acurácia: {acuracia:.2%}")

Acurácia: 70.00%


### VALIDANDO MODELO

In [31]:
probs_modelo = modelo.predict_proba(x_teste)[:, 1]
accuracy_modelo = accuracy_score(y_teste, previsoes)
reports_modelo = classification_report(y_teste, previsoes)
matriz_modelo = confusion_matrix(y_teste, previsoes)
fpr_modelo, tpr_modelo, _ = roc_curve(y_teste, probs_modelo)
roc_auc = auc(fpr_modelo, tpr_modelo)
importances_modelo = modelo.feature_importances_

In [32]:
print(reports_modelo)

              precision    recall  f1-score   support

           0       0.67      0.88      0.76        16
           1       0.78      0.50      0.61        14

    accuracy                           0.70        30
   macro avg       0.72      0.69      0.68        30
weighted avg       0.72      0.70      0.69        30



## NAIVE BAYES

In [33]:
x_treino = x[:-30]
y_treino = y[:-30]

x_teste = x[-30:]
y_teste = y[-30:]

In [34]:
gnb = GaussianNB()
gnb.fit(x_treino, y_treino)

In [35]:
y_pred_gnb = gnb.predict(x_teste)

In [36]:
accuracy_gnb = accuracy_score(y_teste, y_pred_gnb)

print('Acurracy: {:.2f}'.format(round(accuracy_gnb, 2)))

Acurracy: 0.67


### VALIDANDO MODELO

In [37]:
# Métricas de validação
probs_gnb = gnb.predict_proba(x_teste)[:, 1]
accuracy_gnb = accuracy_score(y_teste, y_pred_gnb)
reports_gnb = classification_report(y_teste, y_pred_gnb)
matriz_gnb = confusion_matrix(y_teste, y_pred_gnb)
fpr_gnb, tpr_gnb, _ = roc_curve(y_teste, probs_gnb)
roc_auc_gnb = auc(fpr_gnb, tpr_gnb)
result_gnb = permutation_importance(gnb, x_teste, y_teste, n_repeats=30, random_state=42, n_jobs=-1)
importances_gnb = result_gnb.importances_mean

In [38]:
print(reports_gnb)

              precision    recall  f1-score   support

           0       0.62      0.94      0.75        16
           1       0.83      0.36      0.50        14

    accuracy                           0.67        30
   macro avg       0.73      0.65      0.62        30
weighted avg       0.72      0.67      0.63        30



## KNEIGHBORS

In [39]:
x_treino = x[:-30]
y_treino = y[:-30]

x_teste = x[-30:]
y_teste = y[-30:]

In [40]:
knn = KNeighborsClassifier(n_neighbors=10, metric='euclidean', weights='distance')
knn.fit(x_treino, y_treino)

In [41]:
y_pred_knn = knn.predict(x_teste)

In [42]:
accuracy_knn = accuracy_score(y_teste, y_pred_knn)

print('Acurracy: {:.2f}'.format(round(accuracy_knn, 2)))

Acurracy: 0.67


### VALIDANDO MODELO

In [43]:
probs_knn = knn.predict_proba(x_teste)[:, 1]
accuracy_knn = accuracy_score(y_teste, y_pred_knn)
reports_knn = classification_report(y_teste, y_pred_knn)
matriz_knn = confusion_matrix(y_teste, y_pred_knn)
fpr_knn, tpr_knn, _ = roc_curve(y_teste, probs_knn)
roc_auc_knn = auc(fpr_knn, tpr_knn)
result_knn = permutation_importance(knn, x_teste, y_teste, n_repeats=30, random_state=42, n_jobs=-1)
importances_knn = result_knn.importances_mean

In [44]:
print(reports_knn)

              precision    recall  f1-score   support

           0       0.62      0.94      0.75        16
           1       0.83      0.36      0.50        14

    accuracy                           0.67        30
   macro avg       0.73      0.65      0.62        30
weighted avg       0.72      0.67      0.63        30



# ESCOLHENDO MELHOR MODELO

In [45]:
dict_models = {
    'GNB':accuracy_gnb,
    'KNN':accuracy_knn,
    'RF':acuracia
}

In [46]:
# Selecionando o modelo com a maior acurácia dentro do dicionário
MelhorModelo = max(dict_models, key=dict_models.get)

# Exibindo os resultados individuais de cada modelo em formato de porcentagem
# O :.2% multiplica o valor por 100 e adiciona o símbolo '%' automaticamente
print('--- Desempenho dos Modelos ---')
print('GNB: {0:.2%}\nKNN: {1:.2%}\nRF: {2:.2%}'.format(accuracy_gnb, accuracy_knn, acuracia))

print('-' * 30)

# Exibindo o vencedor com destaque
print('O Melhor modelo é: {} com o valor: {:.2%} de acuracidade'.format(MelhorModelo, dict_models[MelhorModelo]))

--- Desempenho dos Modelos ---
GNB: 66.67%
KNN: 66.67%
RF: 70.00%
------------------------------
O Melhor modelo é: RF com o valor: 70.00% de acuracidade
