In [105]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np

sns.set_style('whitegrid')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score, root_mean_squared_error

### Carregar os dados

In [106]:
# Carregar os dados (já transformados e limpos)
df_costs = pd.read_csv('./datasets/healthcosts_cleaned.csv')

In [107]:
# Mostrar as primeiras linhas
df_costs.head(10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,0,19,female,27.9,0,1,southwest,16884.924
1,1,18,male,33.77,1,0,southeast,1725.5523
2,2,28,male,33.0,3,0,southeast,4449.462
3,3,33,male,22.705,0,0,northwest,21984.47061
4,4,32,male,28.88,0,0,northwest,3866.8552
5,5,31,female,25.74,0,0,southeast,3756.6216
6,6,46,female,33.44,1,0,southeast,8240.5896
7,7,37,female,27.74,3,0,northwest,7281.5056
8,8,37,male,29.83,2,0,northeast,6406.4107
9,9,60,female,25.84,0,0,northwest,28923.13692


In [108]:
# Mostrar as ultimas linhas
df_costs.tail(10)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
1328,1328,23,female,24.225,2,0,northeast,22395.74424
1329,1329,52,male,38.6,2,0,southwest,10325.206
1330,1330,57,female,25.74,2,0,southeast,12629.1656
1331,1331,23,female,33.4,0,0,southwest,10795.93733
1332,1332,52,female,44.7,3,0,southwest,11411.685
1333,1333,50,male,30.97,3,0,northwest,10600.5483
1334,1334,18,female,31.92,0,0,northeast,2205.9808
1335,1335,18,female,36.85,0,0,southeast,1629.8335
1336,1336,21,female,25.8,0,0,southwest,2007.945
1337,1337,61,female,29.07,0,1,northwest,29141.3603


In [109]:
# Mostrar estrutura
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1338 non-null   int64  
 1   age              1338 non-null   int64  
 2   sex              1338 non-null   object 
 3   bmi              1338 non-null   float64
 4   children         1338 non-null   int64  
 5   smoker           1338 non-null   int64  
 6   region           1338 non-null   object 
 7   medical charges  1338 non-null   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 83.8+ KB


### Preparação dos dados

In [110]:
# Preparar os dados para o modelo
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [111]:
# Importar preprocessor já salvo anteriormente
import joblib

preprocessor = joblib.load('./preprocessor_dataset_healthcosts.pkl')

In [112]:
# Dividir os dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [113]:
# Aplicar preprocessor nos dados de treinamento e teste
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [114]:
# Mostrar as dimensões dos conjuntos
print(f'Dados de Treinamento: {X_train.shape}')
print(f'Dados de Teste: {X_test.shape}')

Dados de Treinamento: (1070, 10)
Dados de Teste: (268, 10)


### Treinamento do Modelo

In [115]:
# Criar o modelo de AdaBoost Regressor
boosting_model = AdaBoostRegressor(
    estimator=LinearRegression(),
    n_estimators=8,
    learning_rate=1.0,
    random_state=51
)

In [116]:
# Treinar o modelo
boosting_model.fit(X_train, y_train)

### Análise dos Resultados

In [117]:
# Fazer predições no conjunto de testes
y_pred = boosting_model.predict(X_test)

# Mostrar y_pred
y_pred

array([10800.23473216, 37769.24929587,  4867.09652517, 12469.08381743,
       34632.11135672, 13355.96825399, 13599.5756217 , 16852.48066042,
        7174.49363363, 12593.14262104, 11480.13432879, 13947.57940409,
       11823.6915029 ,  6277.67668536,  6741.95125161, 14022.58925865,
        7752.15162778,  7468.44027107, 25796.78129254, 29483.60833705,
       13163.23155702, 10050.09798002, 33222.29917596, 14456.95033626,
        7577.71063807, 17255.15612167, 11355.73094706,  5160.65555925,
       24146.62216307,  9416.59054111,  6744.43508419, 31286.21532125,
        7963.07493631,  6655.44071248,  9094.83966498, 12800.99289499,
       15496.18299594,  4081.74934738, 13692.20982359, 10036.76978533,
       11882.87945656,  2504.06956197,  7416.8049379 ,  4226.77720349,
        5593.98909358, 16485.70336567, 17024.12355557, 35421.64221568,
        8559.92288549, 14404.95568638,  7736.66291608, 30869.01138139,
        8178.63878845, 41246.85098578,  5236.49665571, 27785.72031111,
      

In [118]:
# Avaliar métricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [119]:
# Mostrar as métricas
print(f'Root Mean Squared Error: {rmse}')
print(f'R2: {r2}')

Root Mean Squared Error: 6926.936615417019
R2: 0.7240525513163432


In [120]:
# Calcular a importância das features usando os coeficientes

# Obter os coeficientes de cada estimador
coefs = np.array([estimator.coef_ for estimator in boosting_model.estimators_])

In [121]:
# Calcular média dos coeficientes absolutos
importances = np.mean(np.abs(coefs), axis=0)

In [122]:
# Normalizar as importâncias
importances = importances / np.sum(importances)

In [123]:
# Obter os nomes das features
feature_names = preprocessor.get_feature_names_out()
feature_names

array(['num__age', 'num__bmi', 'num__children', 'num__smoker',
       'cat__sex_female', 'cat__sex_male', 'cat__region_northeast',
       'cat__region_northwest', 'cat__region_southeast',
       'cat__region_southwest'], dtype=object)

In [124]:
# Criar um Dataframe com as importâncias e os nomes das features
importance_df = pd.DataFrame({ 'feature': feature_names, 'importance': importances })

In [125]:
# Ordenar o DataFrame pela importância
importance_df = importance_df.sort_values('importance', ascending=True)

In [126]:
# Criar o gráfico de barras com a importância das features
fig = px.bar(
    importance_df,
    x='importance',
    y='feature',
    title='Importância das Features',
    orientation='h'
) 

fig.update_xaxes(tickangle=45)
fig.show()

### Propriedades do Modelo

In [127]:
# Erros dos estimadores
boosting_model.estimator_errors_

array([0.1325525 , 0.20039444, 0.26369109, 0.35272733, 0.42748131,
       0.41440901, 0.4657024 , 0.48312978])

In [128]:
boosting_model.estimator_weights_

array([1.87857623, 1.3838309 , 1.02687142, 0.60707232, 0.2921348 ,
       0.34576816, 0.1374062 , 0.06750651])

In [129]:
# Salvar Dataframe como CSV
df_costs.to_csv('./datasets/healthcosts_cleaned.csv', index=False)

In [130]:
# Salvar o preprocessor
import joblib

joblib.dump(preprocessor, './preprocessor_dataset_healthcosts.pkl')

['./preprocessor_dataset_healthcosts.pkl']