## PyCaret com features selecionadas pelo método Recursive Feature Elimination (RFE)

In [None]:
!pip install pycaret
from pycaret.classification import * 
import pandas as pd
from imblearn.over_sampling import SMOTE

In [None]:
# Load the hypothyroid dataset.
dataset = pd.read_csv('/content/hypothyroid.csv')
# Replace every column with the integer codes of its pandas categorical encoding.
for index in dataset.columns.values:
        dataset[index]= dataset[index].astype("category").cat.codes.values

# Dataset cleaning step: intended to remove rows that contain missing data ('?').
# NOTE(review): every column was already converted to integer category codes
# above, so this comparison against the string '?' should never match and no
# rows are dropped here. Filtering before encoding looks like the intent, but
# the later cells balance on 'TSH measured' -- confirm before reordering.
for i in dataset.columns.values:
    dataset.drop(dataset[dataset[i] == '?'].index, inplace=True)
# Discard the 'TBG' column.
dataset = dataset.drop('TBG', axis=1)
# Split the dataset: keep the 'binaryClass' column as the output label.
# NOTE(review): output_label_dataset is not referenced by any later cell shown
# in this notebook -- confirm whether it was meant to be the modelling target.
output_label_dataset = dataset['binaryClass']
# The remaining columns are the data fed to the models.
dataset = dataset.drop(['binaryClass'], axis=1)

## 5 Features:

In [None]:
# Preview the five RFE-selected columns. `dataset` is deliberately NOT
# reassigned: the balancing cell selects its columns explicitly, and the
# 8/10/12-feature sections further down need columns (e.g. 'age') that
# narrowing `dataset` here would discard, causing a KeyError when the notebook
# is run from top to bottom.
dataset[['TSH measured', 'TSH', 'T3', 'TT4', 'FTI']].head()

Unnamed: 0,TSH measured,TSH,T3,TT4,FTI
0,1,110,27,28,10
1,1,195,22,3,234
2,1,103,69,10,22
3,1,21,20,83,234
4,1,77,12,201,199


In [None]:
# Balance the data with SMOTE oversampling.
# NOTE(review): the resampling target is 'TSH measured', not the 'binaryClass'
# label that was split off when the dataset was loaded -- confirm this is intended.
sm = SMOTE(k_neighbors=5, random_state=42)
dataset_balanced, output_classe = sm.fit_resample(
    dataset[['TSH', 'T3', 'TT4', 'FTI']],
    dataset['TSH measured'],
)

# Wrap the resampled features and target as DataFrames, then place them side by
# side with the target column last.
dataset_b, classes = pd.DataFrame(dataset_balanced), pd.DataFrame(output_classe)
final_dataset = pd.concat([dataset_b, classes], axis=1)

final_dataset.head()

Unnamed: 0,TSH,T3,TT4,FTI,TSH measured
0,110,27,28,10,1
1,195,22,3,234,1
2,103,69,10,22,1
3,21,20,83,234,1
4,77,12,201,199,1


In [None]:
# Initialise the PyCaret experiment on the balanced data: `train_size` is the
# fraction of rows used for training, the split is shuffled and features are
# normalised.
train_size = 0.8
session = setup(
    data=final_dataset,
    target='TSH measured',
    train_size=train_size,
    data_split_shuffle=True,
    normalize=True,
)

Unnamed: 0,Description,Value
0,Session id,5235
1,Target,TSH measured
2,Target type,Binary
3,Original data shape,"(6806, 5)"
4,Transformed data shape,"(6806, 5)"
5,Transformed train set shape,"(5444, 5)"
6,Transformed test set shape,"(1362, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and score the candidate classifiers in the current PyCaret session and
# keep the model that compare_models() returns.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.064
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.065
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.494
qda,Quadratic Discriminant Analysis,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.068
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.068
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.284
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.136
et,Extra Trees Classifier,0.9998,1.0,0.9996,1.0,0.9998,0.9996,0.9996,0.68
lightgbm,Light Gradient Boosting Machine,0.9996,0.9999,0.9993,1.0,0.9996,0.9993,0.9993,0.184
svm,SVM - Linear Kernel,0.9958,0.0,0.9915,1.0,0.9958,0.9915,0.9916,0.059


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Show the estimator held in best_model together with its hyperparameters.
print(best_model)

GaussianNB(priors=None, var_smoothing=1e-09)


## 8 Features:

In [None]:
# Preview the eight RFE-selected columns. `dataset` is deliberately NOT
# reassigned: the balancing cell selects its columns explicitly, and the
# 10/12-feature sections further down need columns (e.g. 'sex',
# 'on thyroxine') that narrowing `dataset` here would discard, causing a
# KeyError when the notebook is run from top to bottom.
dataset[['age', 'TSH measured', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source']].head()

Unnamed: 0,age,TSH measured,TSH,T3,TT4,T4U,FTI,referral source
0,34,1,110,27,28,72,10,1
1,15,1,195,22,3,146,234,4
2,40,1,103,69,10,48,22,4
3,67,1,21,20,83,146,234,4
4,67,1,77,12,201,44,199,3


In [None]:
# Balance the data with SMOTE oversampling.
# NOTE(review): the resampling target is 'TSH measured', not the 'binaryClass'
# label that was split off when the dataset was loaded -- confirm this is intended.
sm = SMOTE(k_neighbors=5, random_state=42)
dataset_balanced, output_classe = sm.fit_resample(
    dataset[['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source']],
    dataset['TSH measured'],
)

# Wrap the resampled features and target as DataFrames, then place them side by
# side with the target column last.
dataset_b, classes = pd.DataFrame(dataset_balanced), pd.DataFrame(output_classe)
final_dataset = pd.concat([dataset_b, classes], axis=1)

final_dataset.head()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,referral source,TSH measured
0,34,110,27,28,72,10,1,1
1,15,195,22,3,146,234,4,1
2,40,103,69,10,48,22,4,1
3,67,21,20,83,146,234,4,1
4,67,77,12,201,44,199,3,1


In [None]:
# Initialise the PyCaret experiment on the balanced data: `train_size` is the
# fraction of rows used for training, the split is shuffled and features are
# normalised.
train_size = 0.8
session = setup(
    data=final_dataset,
    target='TSH measured',
    train_size=train_size,
    data_split_shuffle=True,
    normalize=True,
)

Unnamed: 0,Description,Value
0,Session id,1832
1,Target,TSH measured
2,Target type,Binary
3,Original data shape,"(6806, 8)"
4,Transformed data shape,"(6806, 8)"
5,Transformed train set shape,"(5444, 8)"
6,Transformed test set shape,"(1362, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and score the candidate classifiers in the current PyCaret session and
# keep the model that compare_models() returns.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.212
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.208
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.564
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.209
rf,Random Forest Classifier,0.9998,1.0,0.9996,1.0,0.9998,0.9996,0.9996,0.666
lightgbm,Light Gradient Boosting Machine,0.9996,1.0,0.9996,0.9996,0.9996,0.9993,0.9993,0.297
et,Extra Trees Classifier,0.9994,1.0,0.9989,1.0,0.9994,0.9989,0.9989,1.009
qda,Quadratic Discriminant Analysis,0.9972,0.9996,1.0,0.9945,0.9973,0.9945,0.9945,0.16
svm,SVM - Linear Kernel,0.9967,0.0,0.9934,1.0,0.9967,0.9934,0.9934,0.092
nb,Naive Bayes,0.9958,0.9994,1.0,0.9917,0.9958,0.9915,0.9916,0.194


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Show the estimator held in best_model together with its hyperparameters.
print(best_model)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       random_state=1832, splitter='best')


## 10 Features:

In [None]:
# Preview the ten RFE-selected columns. `dataset` is deliberately NOT
# reassigned: the balancing cell selects its columns explicitly, and the
# 12-feature section further down needs columns ('query hypothyroid',
# 'T3 measured') that narrowing `dataset` here would discard, causing a
# KeyError when the notebook is run from top to bottom.
dataset[['age', 'sex', 'on thyroxine', 'TSH measured', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source']].head()

Unnamed: 0,age,sex,on thyroxine,TSH measured,TSH,T3,TT4,T4U,FTI,referral source
0,34,1,0,1,110,27,28,72,10,1
1,15,1,0,1,195,22,3,146,234,4
2,40,2,0,1,103,69,10,48,22,4
3,67,1,1,1,21,20,83,146,234,4
4,67,1,0,1,77,12,201,44,199,3


In [None]:
# Balance the data with SMOTE oversampling.
# NOTE(review): the resampling target is 'TSH measured', not the 'binaryClass'
# label that was split off when the dataset was loaded -- confirm this is intended.
sm = SMOTE(k_neighbors=5, random_state=42)
dataset_balanced, output_classe = sm.fit_resample(
    dataset[['age', 'sex', 'on thyroxine', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'referral source']],
    dataset['TSH measured'],
)

# Wrap the resampled features and target as DataFrames, then place them side by
# side with the target column last.
dataset_b, classes = pd.DataFrame(dataset_balanced), pd.DataFrame(output_classe)
final_dataset = pd.concat([dataset_b, classes], axis=1)

final_dataset.head()

Unnamed: 0,age,sex,on thyroxine,TSH,T3,TT4,T4U,FTI,referral source,TSH measured
0,34,1,0,110,27,28,72,10,1,1
1,15,1,0,195,22,3,146,234,4,1
2,40,2,0,103,69,10,48,22,4,1
3,67,1,1,21,20,83,146,234,4,1
4,67,1,0,77,12,201,44,199,3,1


In [None]:
# Initialise the PyCaret experiment on the balanced data: `train_size` is the
# fraction of rows used for training, the split is shuffled and features are
# normalised.
train_size = 0.8
session = setup(
    data=final_dataset,
    target='TSH measured',
    train_size=train_size,
    data_split_shuffle=True,
    normalize=True,
)

Unnamed: 0,Description,Value
0,Session id,2433
1,Target,TSH measured
2,Target type,Binary
3,Original data shape,"(6806, 10)"
4,Transformed data shape,"(6806, 10)"
5,Transformed train set shape,"(5444, 10)"
6,Transformed test set shape,"(1362, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and score the candidate classifiers in the current PyCaret session and
# keep the model that compare_models() returns.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.222
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.702
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.214
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.607
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.275
lightgbm,Light Gradient Boosting Machine,0.9998,0.9998,0.9996,1.0,0.9998,0.9996,0.9996,0.368
et,Extra Trees Classifier,0.9989,1.0,0.9982,0.9996,0.9989,0.9978,0.9978,0.909
qda,Quadratic Discriminant Analysis,0.9974,0.9996,1.0,0.9949,0.9974,0.9949,0.9949,0.16
svm,SVM - Linear Kernel,0.9956,0.0,0.9938,0.9974,0.9956,0.9912,0.9912,0.143
lr,Logistic Regression,0.9919,0.9994,0.9846,0.9993,0.9918,0.9838,0.984,0.481


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Show the estimator held in best_model together with its hyperparameters.
print(best_model)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       random_state=2433, splitter='best')


## 12 Features:

In [None]:
# Narrow `dataset` to the twelve RFE-selected columns and show its first rows.
dataset = dataset.loc[
    :,
    [
        'age', 'sex', 'on thyroxine', 'query hypothyroid', 'TSH measured', 'TSH',
        'T3 measured', 'T3', 'TT4', 'T4U', 'FTI', 'referral source',
    ],
]
dataset.head()

Unnamed: 0,age,sex,on thyroxine,query hypothyroid,TSH measured,TSH,T3 measured,T3,TT4,T4U,FTI,referral source
0,34,1,0,0,1,110,1,27,28,72,10,1
1,15,1,0,0,1,195,1,22,3,146,234,4
2,40,2,0,0,1,103,0,69,10,48,22,4
3,67,1,1,0,1,21,1,20,83,146,234,4
4,67,1,0,0,1,77,1,12,201,44,199,3


In [None]:
# Balance the data with SMOTE oversampling.
# NOTE(review): the resampling target is 'TSH measured', not the 'binaryClass'
# label that was split off when the dataset was loaded -- confirm this is intended.
sm = SMOTE(k_neighbors=5, random_state=42)
dataset_balanced, output_classe = sm.fit_resample(
    dataset[['age', 'sex', 'on thyroxine', 'query hypothyroid', 'TSH', 'T3 measured', 'T3', 'TT4', 'T4U', 'FTI', 'referral source']],
    dataset['TSH measured'],
)

# Wrap the resampled features and target as DataFrames, then place them side by
# side with the target column last.
dataset_b, classes = pd.DataFrame(dataset_balanced), pd.DataFrame(output_classe)
final_dataset = pd.concat([dataset_b, classes], axis=1)

final_dataset.head()

Unnamed: 0,age,sex,on thyroxine,query hypothyroid,TSH,T3 measured,T3,TT4,T4U,FTI,referral source,TSH measured
0,34,1,0,0,110,1,27,28,72,10,1,1
1,15,1,0,0,195,1,22,3,146,234,4,1
2,40,2,0,0,103,0,69,10,48,22,4,1
3,67,1,1,0,21,1,20,83,146,234,4,1
4,67,1,0,0,77,1,12,201,44,199,3,1


In [None]:
# Initialise the PyCaret experiment on the balanced data: `train_size` is the
# fraction of rows used for training, the split is shuffled and features are
# normalised.
train_size = 0.8
session = setup(
    data=final_dataset,
    target='TSH measured',
    train_size=train_size,
    data_split_shuffle=True,
    normalize=True,
)

Unnamed: 0,Description,Value
0,Session id,2694
1,Target,TSH measured
2,Target type,Binary
3,Original data shape,"(6806, 12)"
4,Transformed data shape,"(6806, 12)"
5,Transformed train set shape,"(5444, 12)"
6,Transformed test set shape,"(1362, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


In [None]:
# Train and score the candidate classifiers in the current PyCaret session and
# keep the model that compare_models() returns.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.9998,0.9998,0.9996,1.0,0.9998,0.9996,0.9996,0.122
ada,Ada Boost Classifier,0.9998,0.9998,0.9996,1.0,0.9998,0.9996,0.9996,0.124
gbc,Gradient Boosting Classifier,0.9998,0.9998,0.9996,1.0,0.9998,0.9996,0.9996,0.753
xgboost,Extreme Gradient Boosting,0.9998,0.9998,0.9996,1.0,0.9998,0.9996,0.9996,0.392
rf,Random Forest Classifier,0.9994,1.0,0.9989,1.0,0.9994,0.9989,0.9989,0.721
lightgbm,Light Gradient Boosting Machine,0.9993,0.9996,0.9989,0.9996,0.9993,0.9985,0.9985,0.449
et,Extra Trees Classifier,0.9987,1.0,0.9982,0.9993,0.9987,0.9974,0.9974,0.64
svm,SVM - Linear Kernel,0.9956,0.0,0.9923,0.9989,0.9956,0.9912,0.9912,0.174
qda,Quadratic Discriminant Analysis,0.995,0.9983,1.0,0.9903,0.9951,0.9901,0.9902,0.125
lr,Logistic Regression,0.9923,0.9996,0.986,0.9985,0.9922,0.9846,0.9847,0.645


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
# Show the estimator held in best_model together with its hyperparameters.
print(best_model)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       random_state=2694, splitter='best')


## Resultados

### 5 melhores classificadores:
 
- 5 Features: (Naive Bayes, Decision Tree Classifier, Random Forest Classifier, Quadratic Discriminant Analysis, Ada Boost Classifier);

- 8 Features: (Decision Tree Classifier, Ada Boost Classifier, Gradient Boosting Classifier, Extreme Gradient Boosting, Random Forest Classifier);

- 10 Features: (Decision Tree Classifier, Random Forest Classifier, Ada Boost Classifier, Gradient Boosting Classifier, Extreme Gradient Boosting);

- 12 Features: (Decision Tree Classifier, Ada Boost Classifier, Gradient Boosting Classifier, Extreme Gradient Boosting, Random Forest Classifier).