# PyCaret com features selecionadas pelo metodo da Matriz de Correlação

In [None]:
!pip install pycaret
from pycaret.classification import * 
import pandas as pd
from imblearn.over_sampling import SMOTE

In [10]:
# abrir o dataset
dataset = pd.read_csv('/content/hypothyroid.csv')
#transformando os dados em dados categoricos
for index in dataset.columns.values:
        dataset[index]= dataset[index].astype("category").cat.codes.values

#Processo de limpeza do dataset(removendo linhas com dados faltantes)
for i in dataset.columns.values:
    dataset.drop(dataset[dataset[i] == '?'].index, inplace=True)
dataset = dataset.drop('TBG', axis=1)

## Features selecionadas 

In [11]:
dataset = dataset[['T4U', 'T4U measured', 'FTI', 'FTI measured', 'T3', 'T3 measured', 'TT4', 'TT4 measured', 'TSH', 'TSH measured', 'binaryClass']]
dataset.head()

Unnamed: 0,T4U,T4U measured,FTI,FTI measured,T3,T3 measured,TT4,TT4 measured,TSH,TSH measured,binaryClass
0,72,1,10,1,27,1,28,1,110,1,1
1,146,0,234,0,22,1,3,1,195,1,1
2,48,1,22,1,69,0,10,1,103,1,1
3,146,0,234,0,20,1,83,1,21,1,1
4,44,1,199,1,12,1,201,1,77,1,1


In [12]:
#balancear os dados
sm = SMOTE(random_state=42, k_neighbors=5)
dataset_balanced, output_classe = sm.fit_resample(dataset[['T4U', 'T4U measured', 'FTI', 'FTI measured', 'T3', 'T3 measured', 'TT4', 'TT4 measured', 'TSH', 'TSH measured']], dataset['binaryClass'])

dataset_b= pd.DataFrame(dataset_balanced)
classes = pd.DataFrame(output_classe)

final_dataset = pd.concat([dataset_b, classes], axis=1)

final_dataset.head()

Unnamed: 0,T4U,T4U measured,FTI,FTI measured,T3,T3 measured,TT4,TT4 measured,TSH,TSH measured,binaryClass
0,72,1,10,1,27,1,28,1,110,1,1
1,146,0,234,0,22,1,3,1,195,1,1
2,48,1,22,1,69,0,10,1,103,1,1
3,146,0,234,0,20,1,83,1,21,1,1
4,44,1,199,1,12,1,201,1,77,1,1


In [15]:
train_size = 0.8
session = setup(data=final_dataset, target='TSH measured', train_size=train_size, data_split_shuffle=True, normalize = True)

Unnamed: 0,Description,Value
0,Session id,4334
1,Target,TSH measured
2,Target type,Binary
3,Original data shape,"(6962, 11)"
4,Transformed data shape,"(6962, 11)"
5,Transformed train set shape,"(5569, 11)"
6,Transformed test set shape,"(1393, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


In [16]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.082
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.083
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.743
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.087
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.393
svm,SVM - Linear Kernel,0.9998,0.0,1.0,0.9998,0.9999,0.9982,0.9982,0.069
xgboost,Extreme Gradient Boosting,0.9998,0.9999,0.9998,1.0,0.9999,0.9982,0.9982,0.201
lightgbm,Light Gradient Boosting Machine,0.9998,1.0,0.9998,1.0,0.9999,0.9982,0.9982,0.233
et,Extra Trees Classifier,0.9993,1.0,0.9994,0.9998,0.9996,0.9928,0.9929,0.712
qda,Quadratic Discriminant Analysis,0.9989,0.9915,1.0,0.9989,0.9994,0.989,0.9891,0.086


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [17]:
print(best_model)

GaussianNB(priors=None, var_smoothing=1e-09)


## Resultado: 
### 5 melhores classificadores: 
- Naive Bayes
- Decision Tree Classifier
- Random Forest Classifier
- Ada Boost Classifier
- Gradient Boosting Classifier