<a href="https://colab.research.google.com/github/geoifg/SMS/blob/main/AutoML_Titanic2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Auto Machine Learning (AutoML)**

O pacote PyCaret possui recursos de automação de aprendizado de máquina, como seleção automática de modelos (auto machine learning), otimização de hiperparâmetros e fluxo de trabalho de validação cruzada integrado. Esses recursos permitem que os usuários experimentem rapidamente diferentes algoritmos e configurações, facilitando a comparação e seleção dos melhores modelos.

In [1]:
!pip install pycaret



In [2]:
# Importar bibliotecas
import pandas as pd
from pycaret.classification import *

In [3]:
# Carregar o dataset
data = pd.read_csv("https://raw.githubusercontent.com/geoifg/SMS/main/titanic.csv", encoding = 'latin1', sep=';') #importação dos dados


In [4]:
# Excluir as variáveis indesejadas
data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [5]:
# Converter colunas para variáveis categóricas
data['Survived'] = data['Survived'].astype('category')
data['Pclass'] = data['Pclass'].astype('category')
data['Sex'] = data['Sex'].astype('category')
data['Embarked'] = data['Embarked'].astype('category')

In [6]:
# Converter a variável 'Fare' para numérica
data['Fare'] = pd.to_numeric(data['Fare'], errors='coerce')

In [7]:
# Excluir linhas com valores NA
data.dropna(inplace=True)

In [8]:
# Verificar os tipos de dados das colunas
print(data.dtypes)

Survived    category
Pclass      category
Sex         category
Age          float64
SibSp          int64
Parch          int64
Fare         float64
Embarked    category
dtype: object


In [9]:
# Verificar estatística descritiva das colunas
print(data.describe())

              Age       SibSp       Parch        Fare
count  692.000000  692.000000  692.000000  692.000000
mean    29.436662    0.524566    0.424855   94.881500
std     14.515956    0.940011    0.858340  166.733069
min      0.420000    0.000000    0.000000    0.000000
25%     20.000000    0.000000    0.000000   12.443750
50%     28.000000    0.000000    0.000000   27.750000
75%     38.000000    1.000000    1.000000   79.810500
max     80.000000    5.000000    6.000000  910.792000


In [10]:
# Verificar valores ausentes nas colunas
print(data.isna().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [11]:
# Dividir o dataset em treinamento e teste (70% treinamento, 30% teste)
train_data = data.sample(frac=0.7, random_state=123)
test_data = data.drop(train_data.index)

In [12]:
# Inicializar o ambiente PyCaret
exp = setup(data = train_data, target = 'Survived', session_id = 42, normalize=True, fold_strategy='kfold',            # <- Estratégia de cross-validation (kfold, stratifiedkfold, etc.)
    fold=10)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(484, 8)"
4,Transformed data shape,"(484, 12)"
5,Transformed train set shape,"(338, 12)"
6,Transformed test set shape,"(146, 12)"
7,Numeric features,4
8,Categorical features,3
9,Preprocess,True


In [13]:
# Criar o modelo AutoML
best_model = compare_models(fold=10, sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8313,0.8659,0.7489,0.8289,0.7777,0.6379,0.6493,0.85
ridge,Ridge Classifier,0.8193,0.8652,0.7379,0.795,0.7589,0.6108,0.6185,0.083
lda,Linear Discriminant Analysis,0.8193,0.8648,0.7379,0.795,0.7589,0.6108,0.6185,0.082
lightgbm,Light Gradient Boosting Machine,0.7635,0.8571,0.6477,0.7334,0.6825,0.4921,0.4992,0.241
gbc,Gradient Boosting Classifier,0.7633,0.8546,0.6849,0.7283,0.6938,0.5001,0.5109,0.178
xgboost,Extreme Gradient Boosting,0.7427,0.8382,0.6499,0.6937,0.6651,0.4539,0.4596,0.129
rf,Random Forest Classifier,0.746,0.8377,0.6493,0.7121,0.6697,0.4619,0.4719,0.243
knn,K Neighbors Classifier,0.784,0.8324,0.6664,0.774,0.7096,0.537,0.5468,0.1
nb,Naive Bayes,0.7839,0.8262,0.7176,0.7381,0.7211,0.5407,0.5478,0.081
ada,Ada Boost Classifier,0.79,0.8204,0.72,0.7517,0.7325,0.5572,0.5606,0.247


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [14]:
#Testando modelos específicos
gbm = create_model('gbc')    # Gradient Boosting
rf = create_model('rf')      # Random Forest
xgboost = create_model('xgboost') # XGBoost


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6765,0.8131,0.7059,0.6667,0.6857,0.3529,0.3536
1,0.7647,0.8571,0.7692,0.6667,0.7143,0.516,0.5198
2,0.8529,0.9071,0.7857,0.8462,0.8148,0.6931,0.6944
3,0.7353,0.7614,0.5833,0.6364,0.6087,0.4093,0.4102
4,0.8529,0.8576,0.875,0.8235,0.8485,0.7059,0.7071
5,0.8235,0.9249,0.6364,0.7778,0.7,0.5768,0.5826
6,0.6765,0.8458,0.7273,0.5,0.5926,0.3392,0.3556
7,0.7353,0.8877,0.6,0.75,0.6667,0.4516,0.4594
8,0.7879,0.8778,0.5,1.0,0.6667,0.5352,0.6045
9,0.7273,0.8135,0.6667,0.6154,0.64,0.4211,0.4219


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6471,0.7612,0.5882,0.6667,0.625,0.2941,0.2962
1,0.7059,0.826,0.6154,0.6154,0.6154,0.3773,0.3773
2,0.7941,0.9071,0.6429,0.8182,0.72,0.5609,0.5711
3,0.7353,0.7917,0.5,0.6667,0.5714,0.3855,0.3939
4,0.6765,0.7847,0.6875,0.6471,0.6667,0.3529,0.3536
5,0.7941,0.8617,0.6364,0.7,0.6667,0.5182,0.5195
6,0.7647,0.8953,0.8182,0.6,0.6923,0.509,0.5251
7,0.7059,0.8404,0.6,0.6923,0.6429,0.395,0.3979
8,0.8182,0.8816,0.5714,1.0,0.7273,0.6056,0.659
9,0.8182,0.8274,0.8333,0.7143,0.7692,0.6207,0.6257


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6471,0.8131,0.5882,0.6667,0.625,0.2941,0.2962
1,0.7647,0.8645,0.7692,0.6667,0.7143,0.516,0.5198
2,0.8235,0.8714,0.7143,0.8333,0.7692,0.6277,0.6326
3,0.7059,0.7614,0.5,0.6,0.5455,0.3307,0.3337
4,0.7353,0.7882,0.75,0.7059,0.7273,0.4706,0.4714
5,0.7941,0.8636,0.5455,0.75,0.6316,0.4936,0.5057
6,0.7059,0.8458,0.7273,0.5333,0.6154,0.3863,0.3985
7,0.7353,0.8737,0.6667,0.7143,0.6897,0.4594,0.4602
8,0.7576,0.8628,0.5714,0.8,0.6667,0.4844,0.5013
9,0.7576,0.8373,0.6667,0.6667,0.6667,0.4762,0.4762


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
#Testando o tuning de um modelo específico
tuned_rf = tune_model(rf, optimize='AUC')


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7941,0.872,0.7059,0.8571,0.7742,0.5882,0.5976
1,0.8235,0.8608,0.6923,0.8182,0.75,0.6151,0.6202
2,0.8235,0.9357,0.6429,0.9,0.75,0.6194,0.6404
3,0.8235,0.822,0.6667,0.8,0.7273,0.5984,0.6039
4,0.7941,0.8854,0.8125,0.7647,0.7879,0.5882,0.5893
5,0.8529,0.8794,0.6364,0.875,0.7368,0.6383,0.6539
6,0.7941,0.8696,0.7273,0.6667,0.6957,0.5405,0.5417
7,0.7941,0.8947,0.6,0.9,0.72,0.5673,0.5965
8,0.7879,0.9192,0.5,1.0,0.6667,0.5352,0.6045
9,0.8182,0.8492,0.75,0.75,0.75,0.6071,0.6071


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [16]:
# Avaliar ambos no conjunto de validação cruzada interno
evaluate_model(best_model)
evaluate_model(tuned_rf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

<Figure size 800x550 with 0 Axes>

In [17]:
# Comparação direta usando a função pull()
# Resultado da validação cruzada de cada modelo
result_best = pull()


In [30]:
# Comparar no conjunto de teste externo
pred_best = predict_model(best_model, data=test_data)
pred_rf = predict_model(tuned_rf, data=test_data)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.75,0.7882,0.6329,0.6849,0.6579,0.4614,0.4623


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7885,0.8298,0.5696,0.8182,0.6716,0.5229,0.5415


In [29]:
# Criar um modelo MLP (Redes Neurais com Multilayer Perceptron)
mlp = create_model('mlp')

# Treinar o modelo MLP
trained_model = tune_model(mlp)

# Avaliar o modelo no conjunto de teste
predictions = predict_model(trained_model, data=test_data)

# Visualizar as predições
print(predictions)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7647,0.8339,0.7059,0.8,0.75,0.5294,0.5331
1,0.7353,0.7949,0.6154,0.6667,0.64,0.4312,0.4321
2,0.8529,0.9,0.7857,0.8462,0.8148,0.6931,0.6944
3,0.8235,0.7879,0.6667,0.8,0.7273,0.5984,0.6039
4,0.7059,0.75,0.6875,0.6875,0.6875,0.4097,0.4097
5,0.8529,0.836,0.6364,0.875,0.7368,0.6383,0.6539
6,0.8235,0.8577,0.7273,0.7273,0.7273,0.5968,0.5968
7,0.8235,0.9333,0.6667,0.9091,0.7692,0.6318,0.6517
8,0.8182,0.8515,0.5714,1.0,0.7273,0.6056,0.659
9,0.8485,0.873,0.8333,0.7692,0.8,0.6784,0.6798


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7647,0.8789,0.7647,0.7647,0.7647,0.5294,0.5294
1,0.7941,0.8132,0.6154,0.8,0.6957,0.5441,0.5548
2,0.8824,0.9214,0.8571,0.8571,0.8571,0.7571,0.7571
3,0.8529,0.7992,0.75,0.8182,0.7826,0.6718,0.6733
4,0.7941,0.8646,0.9375,0.7143,0.8108,0.5939,0.6205
5,0.8824,0.9269,0.6364,1.0,0.7778,0.7031,0.7363
6,0.8529,0.8379,0.8182,0.75,0.7826,0.6718,0.6733
7,0.8529,0.9193,0.8,0.8571,0.8276,0.6996,0.7009
8,0.8485,0.8365,0.6429,1.0,0.7826,0.6746,0.7134
9,0.8182,0.8651,0.6667,0.8,0.7273,0.5926,0.5981


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,MLP Classifier,0.7644,0.7882,0.6329,0.7143,0.6711,0.4887,0.4908


    Pclass     Sex   Age  SibSp  Parch        Fare Embarked Survived  \
2        3  female  26.0      0      0    7.925000        S        1   
3        1  female  35.0      1      0   53.099998        S        1   
9        2  female  14.0      1      0  300.708008        C        1   
11       1  female  58.0      0      0   26.549999        S        1   
15       2  female  55.0      0      0   16.000000        S        1   
..     ...     ...   ...    ...    ...         ...      ...      ...   
873      3    male  47.0      0      0    9.000000        S        0   
882      3  female  22.0      0      0  105.167000        S        0   
884      3    male  25.0      0      0    7.050000        S        0   
886      2    male  27.0      0      0   13.000000        S        0   
889      1    male  26.0      0      0   30.000000        C        1   

     prediction_label  prediction_score  
2                   1            0.5748  
3                   1            0.8798  
9        

In [31]:
# Comparar no conjunto de teste externo
pred_best = predict_model(best_model, data=test_data)
pred_rf = predict_model(tuned_rf, data=test_data)
pred_mlp = predict_model(trained_model, data=test_data)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.75,0.7882,0.6329,0.6849,0.6579,0.4614,0.4623


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.7885,0.8298,0.5696,0.8182,0.6716,0.5229,0.5415


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,MLP Classifier,0.7644,0.7882,0.6329,0.7143,0.6711,0.4887,0.4908
