# PyCaret com features selecionadas pelo metodo Recursive Feature Elimination (RFE)

In [None]:
!pip install pycaret
from pycaret.classification import *
import pandas as pd
from imblearn.over_sampling import SMOTE
from IPython.display import display, Markdown

In [9]:
# criando lista com as 5, 8, 10 e 12 features que foram selecionadas
list_features = [['TSH', 'FTI'],
                 ['TSH', 'T3', 'FTI'],
                 ['TSH measured', 'TSH', 'T3', 'TT4', 'FTI'],
                 ['age', 'TSH measured', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source'],
                 ['age', 'sex', 'on thyroxine', 'TSH measured', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source'],
                 ['age', 'sex', 'on thyroxine', 'query hypothyroid', 'TSH measured', 'TSH', 'T3 measured', 'T3', 'TT4', 'T4U', 'FTI', 'referral source']]

for features in list_features:
    # abrir o dataset
    dataset = pd.read_csv('/content/hypothyroid.csv')
    # transformando os dados em dados categoricos
    for index in dataset.columns.values:
        dataset[index] = dataset[index].astype("category").cat.codes.values

    # Processo de limpeza do dataset (removendo linhas com dados faltantes)
    for i in dataset.columns.values:
        dataset.drop(dataset[dataset[i] == '?'].index, inplace=True)
    dataset = dataset.drop('TBG', axis=1)

    output_label_dataset = dataset['binaryClass']
    dataset = dataset[features]
    # dataset = dataset.drop(['binaryClass'], axis=1)

    # balancear os dados
    sm = SMOTE(random_state=42, k_neighbors=5)
    dataset_balanced, output_classe = sm.fit_resample(dataset, output_label_dataset)

    dataset_b = pd.DataFrame(dataset_balanced)
    classes = pd.DataFrame(output_classe)

    final_dataset = pd.concat([dataset_b, classes], axis=1)
    num = (len(features))
    print('\n')
    texto = f'Resultados para {num} features: '
    texto_formatado = f'<h2>{texto}</h2>'
    display(Markdown(texto_formatado))
    print(final_dataset.head())
    print('\n')

    train_size = 0.8
    session = setup(data=final_dataset, target='binaryClass', train_size=train_size, data_split_shuffle=True, normalize = True)

    best_model = compare_models()
    text = f'O melhor classificador: \n{best_model}'
    text_formatado = f'<h2>{text}</h2>'
    display(Markdown(text_formatado))





<h2>Resultados para 2 features: </h2>

   TSH  FTI  binaryClass
0  110   10            1
1  195  234            1
2  103   22            1
3   21  234            1
4   77  199            1




Unnamed: 0,Description,Value
0,Session id,2552
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 3)"
4,Transformed data shape,"(6962, 3)"
5,Transformed train set shape,"(5569, 3)"
6,Transformed test set shape,"(1393, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9752,0.9907,0.9691,0.9812,0.9751,0.9504,0.9506,0.942
xgboost,Extreme Gradient Boosting,0.9747,0.9903,0.9673,0.9819,0.9745,0.9494,0.9496,0.312
gbc,Gradient Boosting Classifier,0.9745,0.9885,0.9677,0.9811,0.9743,0.949,0.9492,0.359
rf,Random Forest Classifier,0.9693,0.9899,0.9591,0.9792,0.969,0.9386,0.9389,0.473
knn,K Neighbors Classifier,0.9686,0.9847,0.9612,0.9757,0.9683,0.9372,0.9373,0.155
et,Extra Trees Classifier,0.968,0.9871,0.9591,0.9767,0.9678,0.9361,0.9363,0.429
dt,Decision Tree Classifier,0.9668,0.9702,0.9591,0.9742,0.9665,0.9336,0.9338,0.097
ada,Ada Boost Classifier,0.9639,0.9861,0.9469,0.9804,0.9633,0.9278,0.9285,0.388
nb,Naive Bayes,0.7782,0.8058,0.7285,0.8089,0.7666,0.5565,0.5593,0.139
qda,Quadratic Discriminant Analysis,0.7707,0.8226,0.7131,0.8059,0.7566,0.5414,0.5451,0.082


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=2552, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)</h2>





<h2>Resultados para 3 features: </h2>

   TSH  T3  FTI  binaryClass
0  110  27   10            1
1  195  22  234            1
2  103  69   22            1
3   21  20  234            1
4   77  12  199            1




Unnamed: 0,Description,Value
0,Session id,8677
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 4)"
4,Transformed data shape,"(6962, 4)"
5,Transformed train set shape,"(5569, 4)"
6,Transformed test set shape,"(1393, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9732,0.9924,0.967,0.9793,0.9731,0.9465,0.9467,0.796
xgboost,Extreme Gradient Boosting,0.9729,0.9919,0.9652,0.9804,0.9727,0.9458,0.946,0.24
gbc,Gradient Boosting Classifier,0.9714,0.9908,0.9644,0.9782,0.9712,0.9429,0.9431,0.382
dt,Decision Tree Classifier,0.9657,0.9659,0.9609,0.9703,0.9655,0.9314,0.9315,0.093
et,Extra Trees Classifier,0.9653,0.992,0.9544,0.9759,0.965,0.9307,0.931,0.591
rf,Random Forest Classifier,0.9644,0.9924,0.9508,0.9775,0.9639,0.9289,0.9293,0.645
ada,Ada Boost Classifier,0.9592,0.9882,0.9508,0.9673,0.9589,0.9185,0.9188,0.276
knn,K Neighbors Classifier,0.9583,0.9843,0.94,0.9758,0.9575,0.9167,0.9175,0.114
qda,Quadratic Discriminant Analysis,0.7775,0.8566,0.7853,0.7744,0.7794,0.555,0.5557,0.097
nb,Naive Bayes,0.7739,0.8386,0.744,0.7919,0.7669,0.5479,0.5492,0.094


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=8677, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)</h2>





<h2>Resultados para 5 features: </h2>

   TSH measured  TSH  T3  TT4  FTI  binaryClass
0             1  110  27   28   10            1
1             1  195  22    3  234            1
2             1  103  69   10   22            1
3             1   21  20   83  234            1
4             1   77  12  201  199            1




Unnamed: 0,Description,Value
0,Session id,231
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 6)"
4,Transformed data shape,"(6962, 6)"
5,Transformed train set shape,"(5569, 6)"
6,Transformed test set shape,"(1393, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9774,0.9935,0.9677,0.9869,0.9771,0.9548,0.955,0.768
xgboost,Extreme Gradient Boosting,0.977,0.9929,0.9688,0.9851,0.9768,0.954,0.9542,0.445
rf,Random Forest Classifier,0.9756,0.9953,0.9619,0.9891,0.9752,0.9512,0.9517,0.567
et,Extra Trees Classifier,0.9754,0.996,0.9623,0.9883,0.9751,0.9508,0.9512,0.493
gbc,Gradient Boosting Classifier,0.9686,0.9898,0.963,0.974,0.9684,0.9372,0.9373,0.593
dt,Decision Tree Classifier,0.9619,0.9619,0.9544,0.9691,0.9616,0.9239,0.924,0.111
ada,Ada Boost Classifier,0.9562,0.9863,0.9508,0.9613,0.956,0.9124,0.9125,0.3
knn,K Neighbors Classifier,0.954,0.9819,0.926,0.981,0.9526,0.9081,0.9097,0.131
ridge,Ridge Classifier,0.817,0.0,0.7924,0.8337,0.8124,0.6341,0.6351,0.159
lda,Linear Discriminant Analysis,0.817,0.9004,0.7924,0.8337,0.8124,0.6341,0.6351,0.11


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=231, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)</h2>





<h2>Resultados para 8 features: </h2>

   age  TSH measured  TSH  T3  TT4  T4U  FTI  referral source  binaryClass
0   34             1  110  27   28   72   10                1            1
1   15             1  195  22    3  146  234                4            1
2   40             1  103  69   10   48   22                4            1
3   67             1   21  20   83  146  234                4            1
4   67             1   77  12  201   44  199                3            1




Unnamed: 0,Description,Value
0,Session id,929
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 9)"
4,Transformed data shape,"(6962, 9)"
5,Transformed train set shape,"(5569, 9)"
6,Transformed test set shape,"(1393, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.981,0.9955,0.9702,0.9916,0.9807,0.9619,0.9622,0.521
lightgbm,Light Gradient Boosting Machine,0.9804,0.9952,0.9691,0.9915,0.9802,0.9609,0.9611,0.855
et,Extra Trees Classifier,0.9777,0.9979,0.9659,0.9894,0.9774,0.9555,0.9558,0.514
rf,Random Forest Classifier,0.9763,0.9972,0.958,0.9944,0.9758,0.9526,0.9533,0.722
dt,Decision Tree Classifier,0.9716,0.9716,0.9627,0.9803,0.9714,0.9433,0.9435,0.121
gbc,Gradient Boosting Classifier,0.9704,0.9903,0.9605,0.9799,0.9701,0.9407,0.941,0.687
ada,Ada Boost Classifier,0.9594,0.9877,0.9519,0.9666,0.9591,0.9188,0.9191,0.323
knn,K Neighbors Classifier,0.9267,0.9745,0.8772,0.9738,0.9229,0.8535,0.8579,0.156
lr,Logistic Regression,0.8291,0.9091,0.8137,0.84,0.8263,0.6581,0.6588,0.21
ridge,Ridge Classifier,0.828,0.0,0.7993,0.8485,0.8228,0.6559,0.6575,0.117


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)</h2>





<h2>Resultados para 10 features: </h2>

   age  sex  on thyroxine  TSH measured  TSH  T3  TT4  T4U  FTI  \
0   34    1             0             1  110  27   28   72   10   
1   15    1             0             1  195  22    3  146  234   
2   40    2             0             1  103  69   10   48   22   
3   67    1             1             1   21  20   83  146  234   
4   67    1             0             1   77  12  201   44  199   

   referral source  binaryClass  
0                1            1  
1                4            1  
2                4            1  
3                4            1  
4                3            1  




Unnamed: 0,Description,Value
0,Session id,3336
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 11)"
4,Transformed data shape,"(6962, 11)"
5,Transformed train set shape,"(5569, 11)"
6,Transformed test set shape,"(1393, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9878,0.9978,0.9828,0.9928,0.9877,0.9756,0.9757,0.381
lightgbm,Light Gradient Boosting Machine,0.9878,0.9978,0.9835,0.992,0.9877,0.9756,0.9756,1.061
rf,Random Forest Classifier,0.9844,0.9986,0.9752,0.9934,0.9842,0.9688,0.9689,0.781
et,Extra Trees Classifier,0.9837,0.9982,0.977,0.9902,0.9835,0.9673,0.9675,0.671
gbc,Gradient Boosting Classifier,0.9783,0.9955,0.9734,0.983,0.9781,0.9565,0.9566,0.557
dt,Decision Tree Classifier,0.9731,0.9731,0.9648,0.9811,0.9728,0.9461,0.9464,0.136
ada,Ada Boost Classifier,0.9716,0.9938,0.968,0.9751,0.9715,0.9433,0.9434,0.424
knn,K Neighbors Classifier,0.9248,0.9714,0.88,0.9668,0.9211,0.8495,0.8533,0.203
lr,Logistic Regression,0.8359,0.9229,0.8254,0.8432,0.8341,0.6718,0.6721,0.146
svm,SVM - Linear Kernel,0.8346,0.0,0.828,0.8402,0.8335,0.6693,0.6702,0.247


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)</h2>





<h2>Resultados para 12 features: </h2>

   age  sex  on thyroxine  query hypothyroid  TSH measured  TSH  T3 measured  \
0   34    1             0                  0             1  110            1   
1   15    1             0                  0             1  195            1   
2   40    2             0                  0             1  103            0   
3   67    1             1                  0             1   21            1   
4   67    1             0                  0             1   77            1   

   T3  TT4  T4U  FTI  referral source  binaryClass  
0  27   28   72   10                1            1  
1  22    3  146  234                4            1  
2  69   10   48   22                4            1  
3  20   83  146  234                4            1  
4  12  201   44  199                3            1  




Unnamed: 0,Description,Value
0,Session id,1388
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 13)"
4,Transformed data shape,"(6962, 13)"
5,Transformed train set shape,"(5569, 13)"
6,Transformed test set shape,"(1393, 13)"
7,Numeric features,12
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9878,0.9984,0.9838,0.9917,0.9877,0.9756,0.9757,0.708
lightgbm,Light Gradient Boosting Machine,0.9878,0.9985,0.9835,0.992,0.9877,0.9756,0.9756,0.753
rf,Random Forest Classifier,0.9829,0.9987,0.9727,0.993,0.9828,0.9659,0.9661,0.747
et,Extra Trees Classifier,0.9797,0.9984,0.9709,0.9884,0.9795,0.9594,0.9596,0.535
gbc,Gradient Boosting Classifier,0.9768,0.9958,0.9723,0.9812,0.9767,0.9537,0.9537,0.725
dt,Decision Tree Classifier,0.9761,0.9761,0.972,0.9802,0.976,0.9522,0.9523,0.134
ada,Ada Boost Classifier,0.9677,0.9933,0.9627,0.9725,0.9675,0.9354,0.9355,0.341
knn,K Neighbors Classifier,0.9217,0.9721,0.8732,0.967,0.9176,0.8434,0.8476,0.305
lr,Logistic Regression,0.861,0.9418,0.8545,0.8664,0.86,0.722,0.7228,0.171
ridge,Ridge Classifier,0.8538,0.0,0.8247,0.8764,0.8495,0.7077,0.7094,0.115


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)</h2>

# Resultados com os 5 melhores models

### 2 features:
- Light Gradient Boosting Machine
- Extreme Gradient Boosting
- Gradient Boosting Classifier
- Random Forest Classifier
- K Neighbors Classifier

### 3 features:
- Light Gradient Boosting Machine
- Extreme Gradient Boosting
- Gradient Boosting Classifier
- Decision Tree Classifier
- Extra Trees Classifier

### 5 features:
- Light Gradient Boosting Machine
- Extreme Gradient Boosting
- Random Forest Classifier
- Extra Trees Classifier
- Gradient Boosting Classifier

### 8 Features:
- Extreme Gradient Boosting
- Light Gradient Boosting Machine
- Extra Trees Classifier
- Random Forest Classifier
- Gradient Boosting Classifier

### 10 Features:
- Extreme Gradient Boosting
- Light Gradient Boosting Machine
- Random Forest Classifier
- Extra Trees Classifier
- Gradient Boosting Classifier

### 12 Features:
- Extreme Gradient Boosting
- Light Gradient Boosting Machine
- Random Forest Classifier
- Extra Trees Classifier
- Gradient Boosting Classifier