In [1]:
# EDA
import pandas as pd
import seaborn as sns
import plotly.express as px
import numpy as np

sns.set_style('whitegrid')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

### Carregar os dados

In [2]:
# Carregar os dados já tratados
df_costs = pd.read_csv('./datasets/healthcosts_cleaned.csv')

In [3]:
# Mostrar as primeiras linhas
df_costs.head(10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,0,19,female,27.9,0,1,southwest,16884.924
1,1,18,male,33.77,1,0,southeast,1725.5523
2,2,28,male,33.0,3,0,southeast,4449.462
3,3,33,male,22.705,0,0,northwest,21984.47061
4,4,32,male,28.88,0,0,northwest,3866.8552
5,5,31,female,25.74,0,0,southeast,3756.6216
6,6,46,female,33.44,1,0,southeast,8240.5896
7,7,37,female,27.74,3,0,northwest,7281.5056
8,8,37,male,29.83,2,0,northeast,6406.4107
9,9,60,female,25.84,0,0,northwest,28923.13692


In [4]:
# Mostrar as ultimas linhas
df_costs.tail(10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
1328,1328,23,female,24.225,2,0,northeast,22395.74424
1329,1329,52,male,38.6,2,0,southwest,10325.206
1330,1330,57,female,25.74,2,0,southeast,12629.1656
1331,1331,23,female,33.4,0,0,southwest,10795.93733
1332,1332,52,female,44.7,3,0,southwest,11411.685
1333,1333,50,male,30.97,3,0,northwest,10600.5483
1334,1334,18,female,31.92,0,0,northeast,2205.9808
1335,1335,18,female,36.85,0,0,southeast,1629.8335
1336,1336,21,female,25.8,0,0,southwest,2007.945
1337,1337,61,female,29.07,0,1,northwest,29141.3603


In [5]:
# Mostrar a estrutura do DF
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1338 non-null   int64  
 1   age              1338 non-null   int64  
 2   sex              1338 non-null   object 
 3   bmi              1338 non-null   float64
 4   children         1338 non-null   int64  
 5   smoker           1338 non-null   int64  
 6   region           1338 non-null   object 
 7   medical charges  1338 non-null   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 83.8+ KB


### Preparação dos dados

In [6]:
# Preparar os dados para o modelo
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [7]:
# Carregar preprocessor
import joblib
preprocessor = joblib.load('./preprocessor_dataset_healthcosts.pkl')

In [8]:
# Dividir o dataset entre treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [9]:
# Aplicar o preprocessor em treinamento e teste
# Treinamento = fit & transform
# Teste = transform, considerando o treinamento que foi deito anteriormente
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
# Mostrar as dimensões dos conjuntos
print(f'Treinamento: {X_train.shape}')
print(f'Teste: {X_test.shape}')

Treinamento: (1070, 10)
Teste: (268, 10)


### Treinamento do Modelo Stacking

In [11]:
# Criar o modelo de Stacking Regressor

# Algoritmos base
lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

# Meta-Modelo
huber_model = HuberRegressor()

# Stacking Model
stacking_model = StackingRegressor(
    estimators=[
        ('linear regression', lr_model),
        ('elastic', elastic_model),
        ('decision tree', tree_model),
    ],
    final_estimator=huber_model,
    passthrough=False
    # Passthrough = False, usa apenas os estimadores de cada algoritmo base (Vanilla)
    # Passthrough = True, usa os resultados dos estimadores de cada algoritmo base mas o dataset original (Blending)
)

In [12]:
# Treinar o modelo 
stacking_model.fit(X_train, y_train)

### Analise de Resultados

In [13]:
# Fazer as predições com base no modelo treinado
y_pred = stacking_model.predict(X_test)

# Mostrar predict
y_pred

array([ 7695.79959724, 43370.93405492, 20538.53371989, 23306.79628201,
       38542.87992825,  9712.19066584,  7864.68467422, 12191.71869977,
        5530.5934329 ,  9405.74504414,  8707.54848159, 11508.65098615,
        7327.30156712,  2463.04849912,  4553.46990533, 12495.95535168,
        3100.39761182,  6935.23743426, 18344.04697192, 20101.55089448,
        4888.24901415,  6829.38484691, 53567.50549203, 10875.11501712,
        5634.50271863, 15015.90063325, 11475.01694231,  1573.00910166,
       30832.39007449, 19316.03404105,  1437.97164139, 23004.37525715,
        2486.96709226,  2806.40056811,  7060.57244398, 24988.87749613,
        7348.17486445,  1418.96078897, 11657.23837064,  7247.37658504,
       11211.77968941,  1234.95208951,  3676.24582513,  1443.33461361,
       12248.44036693, 12752.260629  , 11556.60219094, 40223.96058083,
        8302.20504108, 12560.89421783,  4744.64392108, 37488.43146823,
        8832.32557457, 46791.81649497, 18839.48431266, 33244.72604585,
      

In [14]:
# Avaliar métricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [15]:
# Mostrar o Erro e R2 do modelo
print(f'Root Mean Squared Error: {rmse}')
print(f'R2: {r2}')

Root Mean Squared Error: 6641.236668309897
R2: 0.7463459096735823


In [16]:
# Calcular a importância considerando os modelos do Stacking Regressor
importances = []

for estimator in stacking_model.estimators_:
    if hasattr(estimator, 'coef_'):
        importances.append(np.abs(estimator.coef_))
    elif hasattr(estimator, 'feature_importances_'):
        importances.append(np.abs(estimator.feature_importances_))
    else:
        print(f'Não foi possível carregar a importância das variáveis do modelo {type(estimator).__name__}')

In [17]:
# Calcular a média das importâncias
importancia_media = np.mean(importances, axis=0)

In [18]:
# Normalizar as importâncias
feature_importance = importancia_media / np.sum(importancia_media)

In [19]:
# Obter nos nomes das features
feature_names = preprocessor.get_feature_names_out()

In [20]:
# Criar um dataframe com as importâncias e os nomes das features
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})

In [21]:
# Ordenar o Dataframe pela importância
importance_df = importance_df.sort_values(by='importance', ascending=True)

In [None]:
# Criar o gráfico de barras para apresentar a importância das features

In [23]:
fig = px.bar(
    importance_df,
    x='importance',
    y='feature',
    title='Importância das Features - Stacking Regressor',
    orientation='h'
)

fig.show()

### Propriedades do Modelo

In [24]:
# Mostrar a evidência do Stacking Regressor

# Selecionar uma amostra para fazer a predição
X_sample = X_test[7].reshape(1, -1)

# Predições individuais dos estimadores
linear_pred = stacking_model.named_estimators_['linear regression'].predict(X_sample)
elastic_pred = stacking_model.named_estimators_['elastic'].predict(X_sample)
tree_pred = stacking_model.named_estimators_['decision tree'].predict(X_sample)

# Predição final do Stacking Regressor
stacking_pred = stacking_model.predict(X_sample) 

In [25]:
# Mostrar predições
print(f'Predição do Regressão Linear: {linear_pred[0]}')
print(f'Predição do ElasticNet: {elastic_pred[0]}')
print(f'Predição da Árvore de Decisão: {tree_pred[0]}')

print(f'Predição final do Stacking Regressor: {stacking_pred[0]}')

Predição do Regressão Linear: 14793.194288532948
Predição do ElasticNet: 13782.690606341423
Predição da Árvore de Decisão: 11856.4115
Predição final do Stacking Regressor: 12191.718699769255
