### Importações

In [11]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

### Carregamento dos Dados

In [12]:
data = pd.read_csv('thyroid-dataset.csv')
#print(data['target'].unique())
#data.info()

# Contagem das colunas com valores nulos
null_by_column = data.isnull().sum()
print(null_by_column)

age                       0
sex                     307
on_thyroxine              0
query_on_thyroxine        0
on_antithyroid_meds       0
sick                      0
pregnant                  0
thyroid_surgery           0
I131_treatment            0
query_hypothyroid         0
query_hyperthyroid        0
lithium                   0
goitre                    0
tumor                     0
hypopituitary             0
psych                     0
TSH_measured              0
TSH                     842
T3_measured               0
T3                     2604
TT4_measured              0
TT4                     442
T4U_measured              0
T4U                     809
FTI_measured              0
FTI                     802
TBG_measured              0
TBG                    8823
referral_source           0
target                    0
patient_id                0
dtype: int64


### Limpeza dos dados

In [13]:
# Remover as colunas com alta quantidade de valores nulos
#data = data.drop(columns=['TBG_measured', 'TBG'])
#data = data.drop(columns=['T3_measured', 'T3'])
data = data[['TSH', 'T3', 'TT4', 'T4U', 'FTI', 'target']]

# Remover todas as linhas com algum valor nulo
data.dropna(inplace=True)

# Remover valores diferentes para target
allowed_values = ['-', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
data = data[data['target'].isin(allowed_values)]

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4812 entries, 19 to 9141
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TSH     4812 non-null   float64
 1   T3      4812 non-null   float64
 2   TT4     4812 non-null   float64
 3   T4U     4812 non-null   float64
 4   FTI     4812 non-null   float64
 5   target  4812 non-null   object 
dtypes: float64(5), object(1)
memory usage: 263.2+ KB


### Análise dos Diagnósticos

In [14]:
# Mapeamento dos valores
# 0 - Sem diagnóstico
# 1 - Hipertireoidismo
# 2 - Hipotireoidismo 
target_mapping = {
    '-': 0,
    'A': 1, 'B': 1, 'C': 1, 'D': 1,
    'E': 2, 'F': 2, 'G': 2, 'H': 2
}

data['target'] = data['target'].map(target_mapping)

# Contagem dos valores disponíveis na coluna target
target_values = data['target'].value_counts()
target_values = target_values.sort_index()

print(target_values)

target
0    4296
1     126
2     390
Name: count, dtype: int64


### Treinamento e Teste

In [20]:
# Identificar as colunas categóricas
#categorical_columns = data.select_dtypes(include=['object']).columns

# Aplicar LabelEncoder nas colunas categóricas
#label_encoders = {}
#for col in categorical_columns:
#    le = LabelEncoder()
#    data[col] = le.fit_transform(data[col])
#    label_encoders[col] = le

# Separar as features (X) e o diagnóstico (y)
X = data.drop(columns=['target'])
y = data['target']

# Criar os conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print("Treinamento:", X_train.shape[0])
print("Teste:", X_test.shape[0])

# Cria o modelo SVM
model = SVC()

# Treina o modelo
model.fit(X_train, y_train)

# Faz previsões no conjunto de teste
y_pred = model.predict(X_test)

# Avalia o modelo
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Acurácia:", accuracy)
print("\nRelatório de Classificação:\n", classification_rep)
print("\nMatriz de Confusão:\n", conf_matrix)

Treinamento: 4378
Teste: 434
Acurácia: 0.9654377880184332

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       396
           1       1.00      0.71      0.83         7
           2       0.95      0.61      0.75        31

    accuracy                           0.97       434
   macro avg       0.97      0.77      0.85       434
weighted avg       0.97      0.97      0.96       434


Matriz de Confusão:
 [[395   0   1]
 [  2   5   0]
 [ 12   0  19]]
