# PyCaret com features selecionadas pelo metodo Recursive Feature Elimination (RFE)

In [None]:
!pip install pycaret
from pycaret.classification import * 
import pandas as pd
from imblearn.over_sampling import SMOTE
from IPython.display import display, Markdown

In [117]:
# criando lista com as 5, 8, 10 e 12 features que foram selecionadas
list_features = [['TSH measured', 'TSH', 'T3', 'TT4', 'FTI', 'binaryClass'],
                 ['age', 'TSH measured', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source', 'binaryClass'],
                 ['age', 'sex', 'on thyroxine', 'TSH measured', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source', 'binaryClass'],
                 ['age', 'sex', 'on thyroxine', 'query hypothyroid', 'TSH measured', 'TSH', 'T3 measured', 'T3', 'TT4', 'T4U', 'FTI', 'referral source', 'binaryClass']]

for features in list_features:
    # abrir o dataset
    dataset = pd.read_csv('/content/hypothyroid.csv')
    # transformando os dados em dados categoricos
    for index in dataset.columns.values:
        dataset[index] = dataset[index].astype("category").cat.codes.values

    # Processo de limpeza do dataset (removendo linhas com dados faltantes)
    for i in dataset.columns.values:
        dataset.drop(dataset[dataset[i] == '?'].index, inplace=True)
    dataset = dataset.drop('TBG', axis=1)

    output_label_dataset = dataset['binaryClass']
    dataset = dataset[features]
    dataset = dataset.drop(['binaryClass'], axis=1)

    # balancear os dados
    sm = SMOTE(random_state=42, k_neighbors=5)
    dataset_balanced, output_classe = sm.fit_resample(dataset, output_label_dataset)

    dataset_b = pd.DataFrame(dataset_balanced)
    classes = pd.DataFrame(output_classe)

    final_dataset = pd.concat([dataset_b, classes], axis=1)
    num = (len(features)-1)
    print('\n')
    texto = f'Resultados para {num} features: '
    texto_formatado = f'<h2>{texto}</h2>'
    display(Markdown(texto_formatado))
    print(final_dataset.head())
    print('\n')

    train_size = 0.8
    session = setup(data=final_dataset, target='binaryClass', train_size=train_size, data_split_shuffle=True, normalize = True)
    
    best_model = compare_models()
    text = f'O melhor classificador: \n{best_model}'
    text_formatado = f'<h2>{text}</h2>'
    display(Markdown(text_formatado))





<h2>Resultados para 5 features: </h2>

   TSH measured  TSH  T3  TT4  FTI  binaryClass
0             1  110  27   28   10            1
1             1  195  22    3  234            1
2             1  103  69   10   22            1
3             1   21  20   83  234            1
4             1   77  12  201  199            1




Unnamed: 0,Description,Value
0,Session id,430
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 6)"
4,Transformed data shape,"(6962, 6)"
5,Transformed train set shape,"(5569, 6)"
6,Transformed test set shape,"(1393, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9758,0.9932,0.9662,0.985,0.9755,0.9515,0.9517,0.312
lightgbm,Light Gradient Boosting Machine,0.9752,0.9934,0.9627,0.9875,0.9749,0.9504,0.9508,0.352
rf,Random Forest Classifier,0.9741,0.9946,0.9583,0.9897,0.9737,0.9483,0.9488,0.66
et,Extra Trees Classifier,0.9732,0.9954,0.958,0.9882,0.9728,0.9465,0.947,0.566
gbc,Gradient Boosting Classifier,0.9668,0.9893,0.9605,0.9728,0.9666,0.9336,0.9337,0.463
dt,Decision Tree Classifier,0.963,0.963,0.9565,0.9692,0.9628,0.926,0.9262,0.122
ada,Ada Boost Classifier,0.9539,0.9857,0.9465,0.9607,0.9535,0.9077,0.9079,0.316
knn,K Neighbors Classifier,0.9535,0.9802,0.9264,0.9795,0.9522,0.907,0.9084,0.241
ridge,Ridge Classifier,0.8149,0.0,0.7902,0.8312,0.8101,0.6297,0.6306,0.106
lda,Linear Discriminant Analysis,0.8149,0.9005,0.7902,0.8312,0.8101,0.6297,0.6306,0.196


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)</h2>





<h2>Resultados para 8 features: </h2>

   age  TSH measured  TSH  T3  TT4  T4U  FTI  referral source  binaryClass
0   34             1  110  27   28   72   10                1            1
1   15             1  195  22    3  146  234                4            1
2   40             1  103  69   10   48   22                4            1
3   67             1   21  20   83  146  234                4            1
4   67             1   77  12  201   44  199                3            1




Unnamed: 0,Description,Value
0,Session id,6285
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 9)"
4,Transformed data shape,"(6962, 9)"
5,Transformed train set shape,"(5569, 9)"
6,Transformed test set shape,"(1393, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9795,0.9944,0.9684,0.9905,0.9793,0.9591,0.9594,0.372
lightgbm,Light Gradient Boosting Machine,0.979,0.9945,0.968,0.9898,0.9787,0.958,0.9583,0.237
rf,Random Forest Classifier,0.9761,0.9968,0.9583,0.9937,0.9757,0.9522,0.9529,0.585
et,Extra Trees Classifier,0.9732,0.9974,0.9623,0.9839,0.9729,0.9465,0.9468,0.715
gbc,Gradient Boosting Classifier,0.9666,0.9891,0.958,0.9749,0.9663,0.9332,0.9335,0.565
dt,Decision Tree Classifier,0.9641,0.9641,0.9587,0.9693,0.9639,0.9282,0.9284,0.138
ada,Ada Boost Classifier,0.958,0.9871,0.949,0.9665,0.9576,0.916,0.9163,0.506
knn,K Neighbors Classifier,0.9215,0.9745,0.8722,0.968,0.9174,0.8431,0.8474,0.168
lr,Logistic Regression,0.8278,0.9043,0.8122,0.8387,0.8249,0.6556,0.6564,0.149
ridge,Ridge Classifier,0.8255,0.0,0.7989,0.8441,0.8205,0.6509,0.6523,0.16


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)</h2>





<h2>Resultados para 10 features: </h2>

   age  sex  on thyroxine  TSH measured  TSH  T3  TT4  T4U  FTI  \
0   34    1             0             1  110  27   28   72   10   
1   15    1             0             1  195  22    3  146  234   
2   40    2             0             1  103  69   10   48   22   
3   67    1             1             1   21  20   83  146  234   
4   67    1             0             1   77  12  201   44  199   

   referral source  binaryClass  
0                1            1  
1                4            1  
2                4            1  
3                4            1  
4                3            1  




Unnamed: 0,Description,Value
0,Session id,500
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 11)"
4,Transformed data shape,"(6962, 11)"
5,Transformed train set shape,"(5569, 11)"
6,Transformed test set shape,"(1393, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9867,0.9975,0.981,0.9924,0.9866,0.9734,0.9735,0.26
xgboost,Extreme Gradient Boosting,0.9849,0.9976,0.9788,0.991,0.9848,0.9698,0.97,0.397
rf,Random Forest Classifier,0.9824,0.9981,0.9724,0.9923,0.9822,0.9648,0.965,0.632
et,Extra Trees Classifier,0.982,0.9982,0.9752,0.9887,0.9819,0.9641,0.9642,0.889
gbc,Gradient Boosting Classifier,0.9783,0.9948,0.9749,0.9816,0.9782,0.9566,0.9566,0.588
dt,Decision Tree Classifier,0.9707,0.9707,0.9655,0.9758,0.9706,0.9415,0.9416,0.147
ada,Ada Boost Classifier,0.9634,0.9921,0.9569,0.9696,0.9631,0.9267,0.9269,0.527
knn,K Neighbors Classifier,0.923,0.974,0.8776,0.9652,0.9192,0.8459,0.8496,0.182
svm,SVM - Linear Kernel,0.8323,0.0,0.8004,0.8565,0.8267,0.6646,0.6671,0.176
lr,Logistic Regression,0.8291,0.9195,0.8183,0.8368,0.8272,0.6581,0.6586,0.154


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=500, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)</h2>





<h2>Resultados para 12 features: </h2>

   age  sex  on thyroxine  query hypothyroid  TSH measured  TSH  T3 measured  \
0   34    1             0                  0             1  110            1   
1   15    1             0                  0             1  195            1   
2   40    2             0                  0             1  103            0   
3   67    1             1                  0             1   21            1   
4   67    1             0                  0             1   77            1   

   T3  TT4  T4U  FTI  referral source  binaryClass  
0  27   28   72   10                1            1  
1  22    3  146  234                4            1  
2  69   10   48   22                4            1  
3  20   83  146  234                4            1  
4  12  201   44  199                3            1  




Unnamed: 0,Description,Value
0,Session id,3768
1,Target,binaryClass
2,Target type,Binary
3,Original data shape,"(6962, 13)"
4,Transformed data shape,"(6962, 13)"
5,Transformed train set shape,"(5569, 13)"
6,Transformed test set shape,"(1393, 13)"
7,Numeric features,12
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9876,0.9981,0.9813,0.9939,0.9875,0.9752,0.9754,0.429
lightgbm,Light Gradient Boosting Machine,0.9871,0.9981,0.9821,0.992,0.987,0.9741,0.9743,0.351
rf,Random Forest Classifier,0.981,0.9982,0.9695,0.9923,0.9807,0.9619,0.9623,0.632
et,Extra Trees Classifier,0.9772,0.9979,0.9684,0.9858,0.977,0.9544,0.9546,0.757
gbc,Gradient Boosting Classifier,0.9758,0.9952,0.9713,0.9801,0.9756,0.9515,0.9516,0.598
dt,Decision Tree Classifier,0.9723,0.9723,0.9673,0.9772,0.9722,0.9447,0.9448,0.152
ada,Ada Boost Classifier,0.967,0.9931,0.9619,0.9717,0.9668,0.9339,0.934,0.524
knn,K Neighbors Classifier,0.923,0.9698,0.875,0.9678,0.9191,0.8459,0.8499,0.204
lr,Logistic Regression,0.858,0.9421,0.851,0.8635,0.857,0.7159,0.7163,0.166
svm,SVM - Linear Kernel,0.8572,0.0,0.8405,0.8706,0.8548,0.7145,0.7157,0.201


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

<h2>O melhor classificador: 
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)</h2>

# 5 melhores modelos para as diferentes quantidades de features

### 5 features: 
- Extreme Gradient Boosting
- Light Gradient Boosting Machine
- Random Forest Classifier
- Extra Trees Classifier
- Gradient Boosting Classifier

### 8 Features: 
- Extreme Gradient Boosting
- Light Gradient Boosting Machine
- Random Forest Classifier
- Extra Trees Classifier
- Gradient Boosting Classifier

### 10 Features: 
- Light Gradient Boosting Machine
- Extreme Gradient Boosting
- Random Forest Classifier
- Extra Trees Classifier
- Gradient Boosting Classifier

### 12 Features: 
- Extreme Gradient Boosting
- Light Gradient Boosting Machine
- Random Forest Classifier
- Extra Trees Classifier
- Gradient Boosting Classifier