In [33]:
# Importação das blibiotecas
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [12]:
# Carregamento do arquivo .csv
df = pd.read_csv('hypothyroid.csv')

In [26]:
# visualização do dataset
df

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,binaryClass
0,41.0,0.0,0,0,0,0,0,0,0,0,...,1,125.0,1,1.14,1,109.0,0,,SVHC,
1,23.0,0.0,0,0,0,0,0,0,0,0,...,1,102.0,0,,0,,0,,other,
2,46.0,1.0,0,0,0,0,0,0,0,0,...,1,109.0,1,0.91,1,120.0,0,,other,
3,70.0,0.0,1,0,0,0,0,0,0,0,...,1,175.0,0,,0,,0,,other,
4,70.0,0.0,0,0,0,0,0,0,0,0,...,1,61.0,1,0.87,1,70.0,0,,SVI,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,0.0,0,0,0,0,0,0,0,0,...,0,,0,,0,,0,,other,
3768,68.0,0.0,0,0,0,0,0,0,0,0,...,1,124.0,1,1.08,1,114.0,0,,SVI,
3769,74.0,0.0,0,0,0,0,0,0,0,0,...,1,112.0,1,1.07,1,105.0,0,,other,
3770,72.0,1.0,0,0,0,0,0,0,0,0,...,1,82.0,1,0.94,1,87.0,0,,SVI,


**Informaçõesdo paciente**

* age: idade do paciente
* sex: sexo do paciente
  
**Histórico de medicamentos e condições**

* on thyroxine: tomando tiroxina (medicação usada para hipotireoidismo).
* query on thyroxine:  suspeita de que ela deveria estar tomando tiroxina.
* on antithyroid medication: está tomando remédio para hipertireoidismo.
* sick: está doente de outra coisa (não necessariamente tireoide).
* pregnant: está grávida.
* thyroid surgery:  já fez cirurgia de tireoide.
* I131 treatment: recebeu tratamento com iodo radioativo (I-131).
* query hypothyroid: suspeita clínica de hipotireoidismo.
* query hyperthyroid: suspeita clínica de hipertireoidismo.
* lithium: usa lítio (que afeta a tireoide).
* goitre: presença de bócio.
* tumor: presença de tumor relacionado.
* hypopituitary: possível problema na hipófise (que regula hormônios da tireoide).
* psych:  pessoa tem algum distúrbio psicológico relatado.

**Exames e suas medições**

* Com "Measure": Se o exame foi medido ou não
* Sem "Measure": Valor do exame e foi aplicado
* Os TSH, T3, TT4, T4U, FTI,TBG são referente as criação, quantidade e transporte de hormônias

**Locais aonde veis os examex**

* referral source: nos traz deonde vieram os exames

**Variavel Target**

* binaryClass: nossa variavel target que informa p para positivo com problema para tireoide e N para negativo


# Pre processamento

In [15]:
# Alteração do "?" po NaN
# Na pre avaliação os exames que não foram feito apresenta ? para informar que não teve valor

df = df.replace('?', np.nan)

In [17]:
# Alteração de T para 1 e F para 0 
tf_cols = [
    col for col in df.columns
    if df[col].dropna().isin(['t', 'f']).all()
]

for col in tf_cols:
    df[col] = df[col].map({'t': 1, 'f': 0})

In [22]:
# Alteração da coluna sex M para 1 e F para 0 
df["sex"] = df["sex"].map({'M': 1, 'F': 0})

In [24]:
# Alteração da coluna binaryClass nossa target  de P para 1 e N para 0
df["binaryClass"] = df["binaryClass"].map({'P': 1, 'N': 0})

In [None]:
# Alteração da colunas categoricas para numericas
num_cols = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG','sex','binaryClass']

for col in num_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

In [25]:
df.isna().sum().sort_values(ascending=False)

binaryClass                  3772
TBG                          3772
T3                            769
T4U                           387
FTI                           385
TSH                           369
TT4                           231
sex                           150
age                             1
pregnant                        0
on thyroxine                    0
query on thyroxine              0
on antithyroid medication       0
sick                            0
tumor                           0
goitre                          0
lithium                         0
query hyperthyroid              0
query hypothyroid               0
I131 treatment                  0
thyroid surgery                 0
hypopituitary                   0
TT4 measured                    0
T3 measured                     0
psych                           0
TSH measured                    0
FTI measured                    0
T4U measured                    0
TBG measured                    0
referral sourc

In [27]:
# Colunas que sex ou age que estão vazias não vai ser possivel tratar pois conta que pode apresentar vies, assim vamos retirar
df = df.dropna(subset=["age"])
df = df.dropna(subset=["sex"])

As colunas TSH, T3, TT4, T4U, TBG e FTI são informações reais, se as colunas MEASURED estão com valor 0 o exame não foi precisa realizar assim as colunas não tem informação, sendo assim são informações concretas.

In [34]:
# Criar o encoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Ajustar e transformar a coluna
encoded = encoder.fit_transform(df[["referral source"]])

# Recuperar nomes das novas colunas
encoded_cols = encoder.get_feature_names_out(["referral source"])

# Criar DataFrame com os nomes corretos
df_encoded = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)

# Concatenar ao dataframe original
df = pd.concat([df.drop(columns=["referral source"]), df_encoded], axis=1)

In [35]:
df.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,T4U,FTI measured,FTI,TBG measured,binaryClass,referral source_STMW,referral source_SVHC,referral source_SVHD,referral source_SVI,referral source_other
0,41.0,0.0,0,0,0,0,0,0,0,0,...,1.14,1,109.0,0,,0.0,1.0,0.0,0.0,0.0
1,23.0,0.0,0,0,0,0,0,0,0,0,...,,0,,0,,0.0,0.0,0.0,0.0,1.0
2,46.0,1.0,0,0,0,0,0,0,0,0,...,0.91,1,120.0,0,,0.0,0.0,0.0,0.0,1.0
3,70.0,0.0,1,0,0,0,0,0,0,0,...,,0,,0,,0.0,0.0,0.0,0.0,1.0
4,70.0,0.0,0,0,0,0,0,0,0,0,...,0.87,1,70.0,0,,0.0,0.0,0.0,1.0,0.0
