<a href="https://colab.research.google.com/github/geoifg/SMS/blob/main/AutoML_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Auto Machine Learning (AutoML)**

O pacote PyCaret possui recursos de automação de aprendizado de máquina, como seleção automática de modelos (auto machine learning), otimização de hiperparâmetros e fluxo de trabalho de validação cruzada integrado. Esses recursos permitem que os usuários experimentem rapidamente diferentes algoritmos e configurações, facilitando a comparação e seleção dos melhores modelos.

In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-1.1.3.tar.gz (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imbalanced-learn>=0.12.0 (from p

In [2]:
# Importar bibliotecas
import pandas as pd
from pycaret.classification import *

In [3]:
# Carregar o dataset
data = pd.read_csv("https://raw.githubusercontent.com/geoifg/SMS/main/titanic.csv", encoding = 'latin1', sep=';') #importação dos dados


In [4]:
# Excluir as variáveis indesejadas
data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [5]:
# Converter colunas para variáveis categóricas
data['Survived'] = data['Survived'].astype('category')
data['Pclass'] = data['Pclass'].astype('category')
data['Sex'] = data['Sex'].astype('category')
data['Embarked'] = data['Embarked'].astype('category')

In [6]:
# Converter a variável 'Fare' para numérica
data['Fare'] = pd.to_numeric(data['Fare'], errors='coerce')

In [7]:
# Excluir linhas com valores NA
data.dropna(inplace=True)

In [8]:
# Verificar os tipos de dados das colunas
print(data.dtypes)

Survived    category
Pclass      category
Sex         category
Age          float64
SibSp          int64
Parch          int64
Fare         float64
Embarked    category
dtype: object


In [9]:
# Verificar estatística descritiva das colunas
print(data.describe())

              Age       SibSp       Parch        Fare
count  692.000000  692.000000  692.000000  692.000000
mean    29.436662    0.524566    0.424855   94.881500
std     14.515956    0.940011    0.858340  166.733069
min      0.420000    0.000000    0.000000    0.000000
25%     20.000000    0.000000    0.000000   12.443750
50%     28.000000    0.000000    0.000000   27.750000
75%     38.000000    1.000000    1.000000   79.810500
max     80.000000    5.000000    6.000000  910.792000


In [10]:
# Verificar valores ausentes nas colunas
print(data.isna().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [11]:
# Dividir o dataset em treinamento e teste (70% treinamento, 30% teste)
train_data = data.sample(frac=0.7, random_state=123)
test_data = data.drop(train_data.index)

In [15]:
# Configurar o ambiente PyCaret. Ao usar o Pycaret direto no Python, é possível setar um parâmetro de seed (session_id)
clf = setup(data=train_data, target='Survived', train_size=0.7, session_id=100)

Unnamed: 0,Description,Value
0,Session id,100
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(484, 8)"
4,Transformed data shape,"(484, 12)"
5,Transformed train set shape,"(338, 12)"
6,Transformed test set shape,"(146, 12)"
7,Numeric features,4
8,Categorical features,3
9,Preprocess,True


In [16]:
# Criar o modelo AutoML
best_model = compare_models(fold=10, sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8105,0.8754,0.733,0.7871,0.7531,0.6004,0.6071,0.376
xgboost,Extreme Gradient Boosting,0.787,0.8639,0.7264,0.7387,0.7268,0.5529,0.559,0.197
rf,Random Forest Classifier,0.7841,0.8628,0.7033,0.7472,0.7172,0.5437,0.5512,0.272
lightgbm,Light Gradient Boosting Machine,0.8196,0.8565,0.7495,0.7984,0.7661,0.6202,0.6281,0.931
ridge,Ridge Classifier,0.7989,0.8483,0.7033,0.7771,0.7375,0.5751,0.578,0.105
lda,Linear Discriminant Analysis,0.8019,0.8483,0.7104,0.779,0.742,0.5816,0.5846,0.167
lr,Logistic Regression,0.8017,0.8479,0.7335,0.766,0.7472,0.5846,0.5872,0.684
et,Extra Trees Classifier,0.7696,0.8357,0.7115,0.7129,0.7094,0.519,0.5219,0.236
ada,Ada Boost Classifier,0.7783,0.8249,0.6962,0.7408,0.7122,0.5331,0.5393,0.193
nb,Naive Bayes,0.7486,0.8064,0.7192,0.675,0.6911,0.48,0.4876,0.204


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [17]:
# Avaliar o modelo no conjunto de teste
predictions = predict_model(best_model, data=test_data)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7596,0.8269,0.6076,0.7164,0.6575,0.4743,0.4781


In [18]:
# Visualizar as predições
print(predictions)

    Pclass     Sex   Age  SibSp  Parch        Fare Embarked Survived  \
2        3  female  26.0      0      0    7.925000        S        1   
3        1  female  35.0      1      0   53.099998        S        1   
9        2  female  14.0      1      0  300.708008        C        1   
11       1  female  58.0      0      0   26.549999        S        1   
15       2  female  55.0      0      0   16.000000        S        1   
..     ...     ...   ...    ...    ...         ...      ...      ...   
873      3    male  47.0      0      0    9.000000        S        0   
882      3  female  22.0      0      0  105.167000        S        0   
884      3    male  25.0      0      0    7.050000        S        0   
886      2    male  27.0      0      0   13.000000        S        0   
889      1    male  26.0      0      0   30.000000        C        1   

     prediction_label  prediction_score  
2                   1            0.5954  
3                   1            0.9802  
9        

In [19]:
# Criar um modelo MLP (Redes Neurais com Multilayer Perceptron)
mlp = create_model('mlp')

# Treinar o modelo MLP
trained_model = tune_model(mlp)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6765,0.6007,0.4615,0.6,0.5217,0.2835,0.2891
1,0.8529,0.8718,0.8462,0.7857,0.8148,0.6931,0.6944
2,0.7353,0.8278,0.6923,0.6429,0.6667,0.4477,0.4485
3,0.8824,0.9321,0.7857,0.9167,0.8462,0.7518,0.7577
4,0.8235,0.8964,0.6429,0.9,0.75,0.6194,0.6404
5,0.7059,0.7286,0.5714,0.6667,0.6154,0.3796,0.3825
6,0.8235,0.875,0.7857,0.7857,0.7857,0.6357,0.6357
7,0.8529,0.8821,0.8571,0.8,0.8276,0.6996,0.7009
8,0.7273,0.8038,0.6154,0.6667,0.64,0.4211,0.4219
9,0.8485,0.95,0.9231,0.75,0.8276,0.695,0.7069


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7059,0.685,0.6154,0.6154,0.6154,0.3773,0.3773
1,0.8529,0.8791,0.6923,0.9,0.7826,0.6743,0.6876
2,0.7353,0.8132,0.6923,0.6429,0.6667,0.4477,0.4485
3,0.8824,0.9464,0.7857,0.9167,0.8462,0.7518,0.7577
4,0.7941,0.8821,0.5714,0.8889,0.6957,0.5509,0.5817
5,0.7647,0.7714,0.5714,0.8,0.6667,0.4925,0.5092
6,0.8529,0.8857,0.8571,0.8,0.8276,0.6996,0.7009
7,0.8529,0.8821,0.8571,0.8,0.8276,0.6996,0.7009
8,0.697,0.7692,0.6154,0.6154,0.6154,0.3654,0.3654
9,0.8788,0.9577,0.8462,0.8462,0.8462,0.7462,0.7462


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,MLP Classifier,0.7692,0.7964,0.5949,0.746,0.662,0.4901,0.4974


    Pclass     Sex   Age  SibSp  Parch        Fare Embarked Survived  \
2        3  female  26.0      0      0    7.925000        S        1   
3        1  female  35.0      1      0   53.099998        S        1   
9        2  female  14.0      1      0  300.708008        C        1   
11       1  female  58.0      0      0   26.549999        S        1   
15       2  female  55.0      0      0   16.000000        S        1   
..     ...     ...   ...    ...    ...         ...      ...      ...   
873      3    male  47.0      0      0    9.000000        S        0   
882      3  female  22.0      0      0  105.167000        S        0   
884      3    male  25.0      0      0    7.050000        S        0   
886      2    male  27.0      0      0   13.000000        S        0   
889      1    male  26.0      0      0   30.000000        C        1   

     prediction_label  prediction_score  
2                   1            0.6710  
3                   1            0.9189  
9        

In [20]:
# Avaliar o modelo no conjunto de teste
predictions = predict_model(trained_model, data=test_data)

# Visualizar as predições
print(predictions)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,MLP Classifier,0.7692,0.7964,0.5949,0.746,0.662,0.4901,0.4974


    Pclass     Sex   Age  SibSp  Parch        Fare Embarked Survived  \
2        3  female  26.0      0      0    7.925000        S        1   
3        1  female  35.0      1      0   53.099998        S        1   
9        2  female  14.0      1      0  300.708008        C        1   
11       1  female  58.0      0      0   26.549999        S        1   
15       2  female  55.0      0      0   16.000000        S        1   
..     ...     ...   ...    ...    ...         ...      ...      ...   
873      3    male  47.0      0      0    9.000000        S        0   
882      3  female  22.0      0      0  105.167000        S        0   
884      3    male  25.0      0      0    7.050000        S        0   
886      2    male  27.0      0      0   13.000000        S        0   
889      1    male  26.0      0      0   30.000000        C        1   

     prediction_label  prediction_score  
2                   1            0.6710  
3                   1            0.9189  
9        