In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score

### Carregar os dados

In [None]:
# Carga de dados
df_costs = pd.read_csv('./datasets/healthcosts.csv')

In [3]:
# Mostrar as primeiras linhas do dataframe
df_costs.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [4]:
# Mostrar as últimas linhas do dataframe
df_costs.tail(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
1328,23,female,24.225,2,no,northeast,22395.74424
1329,52,male,38.6,2,no,southwest,10325.206
1330,57,female,25.74,2,no,southeast,12629.1656
1331,23,female,33.4,0,no,southwest,10795.93733
1332,52,female,44.7,3,no,southwest,11411.685
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [5]:
# Mostrar estrutura do Dataframe
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   object 
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Feature Engineering

In [6]:
# Mostrar e remover as colunas categóricas que possuem somente um valor possível
for column in df_costs.select_dtypes(include=['object']).columns:
    if df_costs[column].nunique() == 1:
        print(f'Coluna {column} possui somente um valor possível: {df_costs[column].unique()}')


In [7]:
# Mostrar os valores possíveis para todas as colunas categóricas
for column in df_costs.select_dtypes(include=['object']).columns:
    print(f'Coluna {column} possui somente um valor possível: {df_costs[column].unique()}')

Coluna sex possui somente um valor possível: ['female' 'male']
Coluna smoker possui somente um valor possível: ['yes' 'no']
Coluna region possui somente um valor possível: ['southwest' 'southeast' 'northwest' 'northeast']


In [8]:
# Mostrar o percentual de valores ausentes para as colunas categóricas
for column in df_costs.select_dtypes(include=['object']).columns:
    contagem_nulas = df_costs[column].isnull().sum()
    print(f'Coluna {column}:  {contagem_nulas / len(df_costs) * 100:.2f}%')

Coluna sex:  0.00%
Coluna smoker:  0.00%
Coluna region:  0.00%


In [9]:
# Apresentar Estatísticas Descritivas
df_costs.describe()

Unnamed: 0,age,bmi,children,medical charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [10]:
# Mostrar e remover as colunas numéricas que possuem somente um valor possível
for column in df_costs.select_dtypes(include=['number']).columns:
    if df_costs[column].nunique() == 1:
        print(f'Coluna {column} possui somente um valor possível: {df_costs[column].unique()}')

In [11]:
# Mostrar o percentual de valores ausentes para as colunas numéricas
for column in df_costs.select_dtypes(include=['number']).columns:
    contagem_nulas = df_costs[column].isnull().sum()
    print(f'Coluna {column}:  {contagem_nulas / len(df_costs) * 100:.2f}%')

Coluna age:  0.00%
Coluna bmi:  0.00%
Coluna children:  0.00%
Coluna medical charges:  0.00%


In [12]:
# Converter colunas categóricas  com valores Yes e No para 1 e 0
for column in df_costs.select_dtypes(include=['object']).columns:
    valores_unicos = df_costs[column].unique()
    if set(valores_unicos).issubset(set(['yes', 'no'])):
        df_costs[column] = df_costs[column].apply(lambda x: 1 if x == 'yes' else 0)

### EDA

In [13]:
# Mostrar distribuição de custos médicos
fig = px.histogram(df_costs, x='medical charges', nbins=30, title='Distribuição de Custos Médicos')
fig.show()

In [14]:
# Mostrar distribuição de idade
fig = px.histogram(df_costs, x='age', nbins=30, title='Distribuição da Idade')
fig.show()

In [15]:
# Mostrar quantidade filhos
fig = px.histogram(df_costs, x='children', title='Distribuição de Quantidade Filhos')
fig.show()

In [16]:
# Mostrar distribuição de BMI
fig = px.histogram(df_costs, x='bmi', nbins=30, title='Distribuição de BMI')
fig.show()

In [17]:
# Mostrar a distribuição do gênero
fig = px.bar(df_costs['sex'].value_counts(), title='Distribuição por Gênero')
fig.show()

In [18]:
# Mostrar a distribuição da variável Smoker
fig = px.bar(df_costs['smoker'].value_counts(), title='Distribuição da Idade')
fig.show()

In [19]:
# Mostrar a distribuição de região
fig = px.bar(df_costs['region'].value_counts(), title='Distribuição por Região')
fig.show()

In [20]:
# BoxPlot de custos médicos por idade
fig = px.box(df_costs, x='age', y='medical charges', title='Boxplot de Custos Médicos por Idade')
fig.show()

In [21]:
# Boxplot de custos médicos por gênero
fig = px.box(df_costs, x='sex', y='medical charges', title='Boxplot de Custos Médicos por Gênero')
fig.show()

In [22]:
# Boxplot de custos médicos por Smoker (Fumante)
fig = px.box(df_costs, x='smoker', y='medical charges', title='Boxplot de Custos Médicos por Status Fumante')
fig.show()

In [23]:
# Boxplot de Custos Médicos por Região
fig = px.box(df_costs, x='region', y='medical charges', title='Boxplot de Custos Médicos por Região')
fig.show()

In [24]:
# Plot de correlação das variáveis numéricas
corr_matrix = df_costs.select_dtypes(include=['number']).corr()

# Mostrar a matriz de correlação
corr_matrix

Unnamed: 0,age,bmi,children,smoker,medical charges
age,1.0,0.109272,0.042469,-0.025019,0.299008
bmi,0.109272,1.0,0.012759,0.00375,0.198341
children,0.042469,0.012759,1.0,0.007673,0.067998
smoker,-0.025019,0.00375,0.007673,1.0,0.787251
medical charges,0.299008,0.198341,0.067998,0.787251,1.0


In [25]:
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x = corr_matrix.columns,
        y = corr_matrix.index,
        z = np.array(corr_matrix),
        text = corr_matrix.values,
        texttemplate='%{text:.3f}',
        colorscale=px.colors.diverging.RdBu,
        zmin=-1,
        zmax=1,
    )
)

fig.show()

### Preparação dos dados

In [26]:
# Preparar dados para o modelo
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [27]:
# Column Transformer para normalizar variáveis numéricas e OneHotEncode para categóricas
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [28]:
# Dividir os dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [29]:
# Aplicar o COlumn Transformer nos dados de treinamento e teste
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [30]:
# Mostrar as dimensões dos conjuntos
print(f'Dados de treinamento: {X_train.shape}')
print(f'Dados de teste: {X_test.shape}')

Dados de treinamento: (1070, 10)
Dados de teste: (268, 10)


### Treinamento do modelo

In [31]:
# Criar o modelo de Bagging Regressor
bagging_model = BaggingRegressor(
    estimator=LinearRegression(),
    n_estimators=10,
    random_state=51,
    max_samples=0.5,
)

In [32]:
# Treinar o modelo
bagging_model.fit(X_train, y_train)

### Análise dos Resultados

In [33]:
# Fazer predições com base no modelo treinado
y_pred = bagging_model.predict(X_test)

In [34]:
# Mostrar y_pred
y_pred

array([ 9087.08528137, 36560.34441742,  3025.83483828, 11290.31331477,
       34064.9143439 , 11543.64196884, 11530.27386225, 14953.45885337,
        5562.00768848, 10846.90999651,  9334.69137246, 12000.24819473,
        9980.04685901,  4437.71891141,  5683.08400606, 12609.29884573,
        5883.1961558 ,  5367.23737453, 25622.71841274, 28704.41568615,
       10726.27179138,  8517.73703351, 32774.00206499, 13360.32985711,
        6115.44965657, 16023.61962827,  9555.68851668,  2578.60685595,
       23415.38404582,  8305.88725668,  4164.95172708, 30498.17211118,
        6032.85174781,  4968.50009472,  7800.35986899, 11325.53714757,
       13940.00011341,  2482.64746381, 11992.42639264,  7681.45317206,
        9452.67145777,  1056.49992983,  6202.61762314,  2657.98558723,
        4344.32899894, 15174.75649172, 15654.72548577, 35148.62252656,
        8229.0364847 , 12492.74947474,  5457.68842247, 30835.28308534,
        6714.52660342, 40101.13673368,  4833.43587653, 27729.41119805,
      

In [35]:
# Avaliar métricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [36]:
# Mostrar o Erro e R2 do Modelo
print(f'Root Mean Squared Error: {rmse}')
print(f'R2 Score: {r2}')

Root Mean Squared Error: 6614.530724766148
R2 Score: 0.7483818111483818


In [37]:
# Calcular a importância das features usando os coeficientes

# Obter os coeficientes de cada estimator
coefs = np.array([estimator.coef_ for estimator in bagging_model.estimators_])

# Calcular a média dos coeficientes absolutos
feature_importance = np.mean(np.abs(coefs), axis=0)

# Normalizar as importâncias
feature_importance = feature_importance / np.sum(feature_importance)

In [38]:
# Obter os nomes das features
feature_names = preprocessor.get_feature_names_out()

In [39]:
# Criar um dataframe com as importâncias e os nomes
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})

# Ordenar o Dataframe pelo importância
importance_df = importance_df.sort_values('importance', ascending=True)

In [40]:
# Criar o gráfico de barras para mostrar a importância das features
fig = px.bar(importance_df, x='importance', y='feature', title='Importância das Features', orientation='h')
fig.update_xaxes(tickangle=45)
fig.show()

### Verificar propriedades o modelo

In [41]:
bagging_model.estimators_samples_

[array([ 503,  347,  592, 1050,  559,  304,  551,  735,  214,  951,  730,
         408,  675,  411,  707,  503,  574,   99,  395,  726,  400,  734,
         532,  623,  414,  885,  630, 1005,  567,  198,  860,  522,  569,
         946,  979,  806,  919,  528,  657,  449,  267,  171,  321,  676,
         192,   56,  464,  284,    9,  800,  262,  685,  692,  323,  738,
         993,  256,  660,  473,  219,   27,  885,  880,  653,  460,  499,
         660,   94,  121,  916,  282,  905,  354,  706,  358,  634,  418,
         982,  423,   21,  164,  992,  443,  994,  552,  155,  613,  136,
         776,  425,  612,  289,   70,  767,   59,  424,  580,  672,  423,
         241,  968,  221,  317,  866,  608,   67,  924,  848,  819,  559,
         717,  510,  342,  643,  157,  262,  127,  339,  567,  881,  506,
         364,  720,  733,  531,   84,  673,  891,  675,  798,  768, 1000,
         746, 1020, 1001, 1056,  103,  123,  243,  201,  569,  114,  116,
          90,  282,   44,  165,  140, 

In [42]:
bagging_model.estimators_samples_[2].shape

(535,)

In [43]:
bagging_model.estimators_features_

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]

### Salvar dados e preprocessador do modelo

In [44]:
# Salvar dataframe como CSV
df_costs.to_csv('./datasets/healthcosts_cleaned.csv')

In [45]:
# Salvar o preprocessor
import joblib

joblib.dump(preprocessor, './preprocessor_dataset_healthcosts.pkl')

['./preprocessor_dataset_healthcosts.pkl']