In [138]:
# Importação das bibliotecas
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
# Load the dataset from the CSV file in the project's data folder
dataset = pd.read_csv(filepath_or_buffer='../Data/mt_cars.csv')
# Preview the first five rows (5 is also the default for head)
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [139]:
# Drop the 'Unnamed: 0' column (it appears to hold the car-model labels,
# judging by the preview of the loaded frame), leaving only numeric columns.
# errors='ignore' keeps this cell re-runnable: the cell reassigns `dataset`,
# so on a second execution the column is already gone and a plain drop
# would raise KeyError.
dataset = dataset.drop(columns='Unnamed: 0', errors='ignore')
dataset.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [140]:
# Build the feature matrix (X) and the target vector (y) as NumPy arrays:
# columns at positions 1 through 10 are the predictors, column 0 is the target.
previsores = dataset.iloc[:, 1:11].to_numpy()
classe = dataset.iloc[:, 0].to_numpy()

In [141]:
# Split the data into training and test sets (30% held out for testing,
# 70% used for training); random_state pins the shuffle so the split is
# the same on every run.
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(
    previsores, classe, test_size=0.3, random_state=0
)

In [142]:
# Create the random-forest regressor (100 trees) and fit it on the training split.
# random_state is fixed so that the fitted forest, and therefore every metric
# computed from its predictions, is reproducible across kernel restarts;
# without it each run yields slightly different numbers.
floresta = RandomForestRegressor(n_estimators = 100, random_state = 0)
floresta.fit(X_treinamento, y_treinamento)

# Previsão

In [143]:
# Predict the target for the held-out test rows and display the resulting array
previsoes = floresta.predict(X=X_teste)
previsoes

array([17.401, 18.819, 19.503, 27.217, 13.398, 13.462, 16.791, 25.253,
       23.89 , 17.519])

# Avaliando o modelo

In [144]:
import numpy as np

# Residuals (actual minus predicted) shared by every metric below
erro = y_teste - previsoes
# Residuals relative to the actual values, used by the percentage metrics
erro_relativo = erro / y_teste

# Mean Error (ME): signed average residual (negative means over-prediction)
me = np.mean(erro)
# Mean Absolute Error (MAE): average magnitude of the residuals
mae = np.mean(np.abs(erro))
# Root Mean Squared Error (RMSE): square root of the mean squared residual
rmse = np.sqrt(np.mean(np.square(erro)))
# Mean Percentage Error (MPE): signed average relative residual, in percent
mpe = np.mean(erro_relativo) * 100
# Mean Absolute Percentage Error (MAPE): average relative magnitude, in percent
mape = np.mean(np.abs(erro_relativo)) * 100

# Report all metrics, in the same order they were computed
print(f"Mean Error (ME): {me}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Percentage Error (MPE): {mpe} %")
print(f"Mean Absolute Percentage Error (MAPE): {mape} %")

Mean Error (ME): -1.7453000000000123
Mean Absolute Error (MAE): 2.1551000000000067
Root Mean Squared Error (RMSE): 2.453777883183412
Mean Percentage Error (MPE): -10.921018963874065 %
Mean Absolute Percentage Error (MAPE): 13.267062919917999 %


# Seleção de Atributos

In [145]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit an extra-trees ensemble on the training split, used here only to rank
# the predictors by importance.
# random_state is fixed so the ranking is identical on every run; the later
# selection of the top attributes relies on this ranking being stable.
sa = ExtraTreesRegressor(random_state = 0)
sa.fit(X_treinamento, y_treinamento)
atributos_importantes = sa.feature_importances_

# argsort()[::-1] -> feature indices ordered from most to least important
indices = atributos_importantes.argsort()[::-1]

# Print each attribute with its importance value, most important first.
# 1 + indice skips column 0 of `dataset` (mpg), which is the target and
# therefore not part of the feature matrix.
for posicao, indice in enumerate(indices, start=1):
    print(f"{posicao}º {dataset.columns[1+indice]} ({atributos_importantes[indice]})")

cyl: 0.33048198027866155
wt: 0.2301653765690489
disp: 0.18768665977519394
hp: 0.09295189937691331
vs: 0.07946530301883103
qsec: 0.023876282271901344
carb: 0.01980143657124836
am: 0.01875984508939095
drat: 0.009896774392093344
gear: 0.006914442656717381


# Criando um novo modelo com os melhores atributos independentes

In [146]:
# Select the 5 most important attributes from the ranking stored in `indices`
# (computed by the feature-selection cell) rather than hard-coding column
# positions, so the selection cannot silently go stale if the ranking changes.
# 1 + index skips the target column at position 0; sorting keeps the columns
# in their original left-to-right order (for the ranking shown in the
# notebook output this yields positions [1, 2, 3, 5, 7]).
colunas_selecionadas = sorted(1 + indices[:5])
previsores2 = dataset.iloc[:, colunas_selecionadas].values
previsores2.shape

(32, 5)

In [147]:
# Split the reduced feature matrix into training and test sets (30% test,
# 70% train), using the same test_size and random_state as the first split.
X_treinamento2, X_teste2, y_treinamento2, y_teste2 = train_test_split(
    previsores2, classe, test_size=0.3, random_state=0
)

In [148]:
# Create a second random-forest regressor (100 trees) and fit it on the
# reduced-feature training split.
# random_state is fixed so the results are reproducible and differences in
# the metrics reflect the feature selection rather than seed-to-seed noise.
floresta2 = RandomForestRegressor(n_estimators = 100, random_state = 0)
floresta2.fit(X_treinamento2, y_treinamento2)

In [149]:
# Predict the target for the reduced-feature test rows and display the array
previsoes2 = floresta2.predict(X=X_teste2)
previsoes2

array([16.88 , 17.928, 19.22 , 28.208, 11.617, 12.522, 16.586, 25.924,
       23.799, 16.906])

In [150]:
import numpy as np

# Residuals (actual minus predicted) of the reduced-feature model
erro2 = y_teste2 - previsoes2
# Residuals relative to the actual values, used by the percentage metrics
erro_relativo2 = erro2 / y_teste2

# Mean Error (ME): signed average residual (negative means over-prediction)
me = np.mean(erro2)
# Mean Absolute Error (MAE): average magnitude of the residuals
mae = np.mean(np.abs(erro2))
# Root Mean Squared Error (RMSE): square root of the mean squared residual
rmse = np.sqrt(np.mean(np.square(erro2)))
# Mean Percentage Error (MPE): signed average relative residual, in percent
mpe = np.mean(erro_relativo2) * 100
# Mean Absolute Percentage Error (MAPE): average relative magnitude, in percent
mape = np.mean(np.abs(erro_relativo2)) * 100

# Report all metrics, in the same order they were computed
print(f"Mean Error (ME): {me}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Percentage Error (MPE): {mpe} %")
print(f"Mean Absolute Percentage Error (MAPE): {mape} %")

Mean Error (ME): -1.3790000000000082
Mean Absolute Error (MAE): 2.0108000000000046
Root Mean Squared Error (RMSE): 2.4852048205329194
Mean Percentage Error (MPE): -7.860116438138359 %
Mean Absolute Percentage Error (MAPE): 12.113135799729143 %
