In [28]:
import pandas as pd
import numpy as np
from scipy.stats import entropy as scipy_entropy

# ============================================================
# Load the data
# ============================================================

# low_memory=False makes pandas read the file in one pass instead of chunk by chunk.
df = pd.read_csv(
    '../data/Unified Dataset/unified_dataset_ready_for_imputation.csv',
    low_memory=False,
)
print("Shape original:", df.shape)
print("\nMissing values avant imputation:")
# Count missing values once, then show only the columns that have any.
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])

Shape original: (221756, 35)

Missing values avant imputation:
age                         4
sex                       303
country                  9065
ethnicity                9065
family_history           9065
iodine_deficiency        9065
diabetes                 9065
obesity                  9065
lithium                212691
psych                  212691
goitre                 212691
hypopituitary          212691
tumor                  212691
nodule_size              9065
diagnosis                9065
on_thyroxine           212691
on_antithyroid_meds    212691
i131_treatment         212691
tsh                       832
t3                       2578
tt4                    213130
t4                       9065
t4u                    213497
fti                    213490
tsh_measured           212691
t3_measured            212691
tt4_measured           212691
t4u_measured           212691
fti_measured           212691
query_hyperthyroid     212691
query_hypothyroid      212691
query_o

In [29]:
# Each lab value column is paired with its "<lab>_measured" indicator column.
lab_pairs = [
    (lab_name, f"{lab_name}_measured")
    for lab_name in ("tsh", "t3", "tt4", "t4u", "fti")
]

for lab, indicator in lab_pairs:
    print(f"\nChecking {lab} vs {indicator}")

    value_present = df[lab].notnull()
    # Rows whose indicator is itself missing match neither "No" nor "Yes",
    # so they are never counted as inconsistent by either test below.
    says_no = df[indicator] == "No"
    says_yes = df[indicator] == "Yes"

    inconsistent_1 = (value_present & says_no).sum()
    inconsistent_2 = (~value_present & says_yes).sum()

    print("Value present but indicator = No :", inconsistent_1)
    print("Value missing but indicator = Yes:", inconsistent_2)


Checking tsh vs tsh_measured
Value present but indicator = No : 0
Value missing but indicator = Yes: 0

Checking t3 vs t3_measured
Value present but indicator = No : 0
Value missing but indicator = Yes: 0

Checking tt4 vs tt4_measured
Value present but indicator = No : 0
Value missing but indicator = Yes: 0

Checking t4u vs t4u_measured
Value present but indicator = No : 0
Value missing but indicator = Yes: 0

Checking fti vs fti_measured
Value present but indicator = No : 0
Value missing but indicator = Yes: 0


In [30]:
# (lab value column, indicator column) pairs; the indicator is "<lab>_measured".
lab_pairs = [
    (lab_name, lab_name + "_measured")
    for lab_name in ["tsh", "t3", "tt4", "t4u", "fti"]
]

# Recompute every indicator from the lab column itself: "Yes" where a value
# is present, "No" where it is missing. This overwrites the previous indicator
# values, including rows where the indicator was missing.
yes_no = {True: "Yes", False: "No"}
for lab, indicator in lab_pairs:
    has_measurement = df[lab].notnull()
    df[indicator] = has_measurement.map(yes_no)

In [31]:
# Re-run the consistency check after the indicators were rebuilt.
for lab, indicator in lab_pairs:
    value_present = df[lab].notnull()
    # The two mismatch cases are mutually exclusive, so counting their union
    # equals adding the two individual counts.
    mismatch = (value_present & (df[indicator] == "No")) | (
        ~value_present & (df[indicator] == "Yes")
    )
    inconsistent = mismatch.sum()
    print(lab, "Inconsistencies:", inconsistent)

tsh Inconsistencies: 0
t3 Inconsistencies: 0
tt4 Inconsistencies: 0
t4u Inconsistencies: 0
fti Inconsistencies: 0


In [32]:
# NOTE: tsh_measured holds "Yes"/"No" strings while notnull() yields booleans,
# so this direct comparison is False even when the indicator is consistent.
# The mapped comparison in the following cell is the meaningful check.
df["tsh_measured"].equals(df["tsh"].notnull())

False

In [33]:
# Convert the "Yes"/"No" indicator to booleans first, then compare it with the
# "tsh is present" mask; True means the indicator mirrors missingness exactly.
df["tsh_measured"].map({"Yes": True, "No": False}).equals(df["tsh"].notnull())

True

In [34]:
# Summary statistics of nodule_size (describe() ignores missing values).
print(df["nodule_size"].describe())


count    212691.000000
mean          2.503403
std           1.444631
min           0.000000
25%           1.250000
50%           2.510000
75%           3.760000
max           5.000000
Name: nodule_size, dtype: float64


In [35]:
# For a non-numeric column describe() reports count, number of unique values,
# the most frequent value and its frequency.
print(df["diagnosis"].describe())

count     212691
unique         2
top       Benign
freq      163196
Name: diagnosis, dtype: object


In [36]:
# Class balance of the diagnosis column (missing values are not counted).
print(df["diagnosis"].value_counts())

diagnosis
Benign       163196
Malignant     49495
Name: count, dtype: int64


In [37]:
# Pairwise correlation between t4 and tt4. corr() only uses rows where both
# values are present; a NaN off-diagonal presumably means the two columns are
# never observed on the same rows -- TODO confirm.
print(df[["t4", "tt4"]].corr())

      t4  tt4
t4   1.0  NaN
tt4  NaN  1.0


In [38]:
# Sanity check: confirm the tt4 column exists in the frame.
print("tt4" in df.columns)

True


In [39]:
# Pairwise correlations between t4, fti and t4u; each entry uses only the rows
# where both columns of the pair are present.
df[["t4","fti","t4u"]].corr()

Unnamed: 0,t4,fti,t4u
t4,1.0,,
fti,,1.0,-0.234737
t4u,,-0.234737,1.0


In [40]:
# Percentage of rows in which t4u is missing (mean of the boolean mask * 100).
print(df["t4u"].isna().mean() * 100)

96.27563628492577


In [41]:
from sklearn.impute import SimpleImputer

# Fill the missing ages with the column median.
age_imputer = SimpleImputer(strategy="median")
# fit_transform returns a 2-D (n_rows, 1) result; flatten it so that a plain
# 1-D vector is assigned to the single "age" column instead of relying on
# pandas accepting a 2-D value for one column.
df["age"] = np.ravel(age_imputer.fit_transform(df[["age"]]))

In [42]:
# Fill the missing TSH values with the column median. The tsh_measured flag
# was rebuilt from the raw column in an earlier cell, so it still records
# which values were originally absent.
tsh_imputer = SimpleImputer(strategy="median")
# fit_transform returns a 2-D (n_rows, 1) result; flatten it so that a plain
# 1-D vector is assigned to the single "tsh" column.
df["tsh"] = np.ravel(tsh_imputer.fit_transform(df[["tsh"]]))

In [43]:
# Keep missing sex as its own explicit "Unknown" category rather than guessing.
sex_filled = df["sex"].fillna("Unknown")
df["sex"] = sex_filled

In [44]:
# Distribution of sex after the fill; dropna=False would surface any remaining NaN.
df["sex"].value_counts(dropna=False)

sex
F          133497
M           87956
Unknown       303
Name: count, dtype: int64

In [45]:
num_cols = ["t3", "t4", "nodule_size"]

# Work out each column's median first, then use it to fill that column's gaps.
column_medians = {name: df[name].median() for name in num_cols}
for col, fill_value in column_medians.items():
    df[col] = df[col].fillna(fill_value)

In [46]:
import numpy as np

def compute_entropy(series):
    """Return the Shannon entropy, in bits, of the value distribution of ``series``.

    Relative frequencies are taken from ``value_counts(normalize=True)``, so
    missing values are not counted.
    """
    frequencies = series.value_counts(normalize=True)
    information = np.log2(frequencies)
    return -np.sum(frequencies * information)

In [47]:
import numpy as np

def compute_entropy(series):
    """Return the Shannon entropy, in bits, of the value distribution of ``series``.

    Relative frequencies are taken from ``value_counts(normalize=True)``, so
    missing values are not counted.
    """
    frequencies = series.value_counts(normalize=True)
    information = np.log2(frequencies)
    return -np.sum(frequencies * information)

In [48]:
def normalized_entropy(series):
    """Return the Shannon entropy of ``series`` scaled to the range [0, 1].

    The entropy (in bits) of the relative value frequencies is divided by
    ``log2(k)``, the maximum possible entropy for ``k`` distinct values.
    A result near 1 means the values are evenly spread; a result near 0
    means one value dominates.

    Parameters
    ----------
    series : pandas.Series
        Values to analyse. Missing values are not counted, because
        ``value_counts`` drops them by default.

    Returns
    -------
    float
        Normalised entropy, or ``0.0`` when the series has fewer than two
        distinct values (including an empty series).
    """
    probs = series.value_counts(normalize=True)
    n_categories = len(probs)
    # With zero or one category there is no uncertainty. Returning early also
    # avoids log2(0) (a RuntimeWarning) and a division by log2(1) == 0.
    if n_categories < 2:
        return 0.0
    entropy = -np.sum(probs * np.log2(probs))
    return float(entropy / np.log2(n_categories))

In [49]:
# NOTE(review): "diagnosis" is in this list, so rows with a missing diagnosis
# receive an imputed value like any other feature -- confirm this is intended.
cat_cols = [
    "country", "ethnicity", "family_history",
    "iodine_deficiency", "diabetes",
    "obesity", "diagnosis"
]

for col in cat_cols:
    # Entropy is measured on the observed values only.
    ent = normalized_entropy(df[col].dropna())

    print(f"{col} -> Normalized Entropy: {ent:.3f}")

    # Below 0.5 one value dominates, so the mode is used as the fill value;
    # otherwise the gap is kept as an explicit "Unknown" category.
    dominated = ent < 0.5
    fill_value = df[col].mode()[0] if dominated else "Unknown"
    df[col] = df[col].fillna(fill_value)

    if dominated:
        print(" → Mode imputation applied")
    else:
        print(" → 'Unknown' imputation applied")

country -> Normalized Entropy: 0.950
 → 'Unknown' imputation applied
ethnicity -> Normalized Entropy: 0.960
 → 'Unknown' imputation applied
family_history -> Normalized Entropy: 0.881
 → 'Unknown' imputation applied
iodine_deficiency -> Normalized Entropy: 0.810
 → 'Unknown' imputation applied
diabetes -> Normalized Entropy: 0.722
 → 'Unknown' imputation applied
obesity -> Normalized Entropy: 0.882
 → 'Unknown' imputation applied
diagnosis -> Normalized Entropy: 0.783
 → 'Unknown' imputation applied


In [50]:

# Sanity check: confirm the tt4 column is still present before imputing it.
print("tt4" in df.columns)

True


In [51]:
# Skewness of each sparse lab column, computed on the observed values.
for col in ("tt4", "t4u", "fti"):
    column_skew = df[col].skew()
    print(col, column_skew)

tt4 0.6024426270926735
t4u 0.6788518600493854
fti 0.6178756738735679


In [52]:
# Fill each lab column with its mean when the distribution is roughly
# symmetric (|skew| < 1) and with its median otherwise.
# NOTE(review): the initial missing-value report shows these columns missing in
# the large majority of rows (t4u about 96%), so most values in the result are
# the single fill constant -- confirm this is acceptable downstream.
for col in ["tt4","t4u","fti"]:
    skew = df[col].skew()

    roughly_symmetric = abs(skew) < 1
    fill_value = df[col].mean() if roughly_symmetric else df[col].median()
    df[col] = df[col].fillna(fill_value)

In [53]:
import numpy as np

def normalized_entropy(series):
    """Return the Shannon entropy of ``series`` scaled to the range [0, 1].

    The entropy (in bits) of the relative value frequencies is divided by
    ``log2(k)``, the maximum possible entropy for ``k`` distinct values.
    A result near 1 means the values are evenly spread; a result near 0
    means one value dominates.

    Parameters
    ----------
    series : pandas.Series
        Values to analyse. Missing values are not counted, because
        ``value_counts`` drops them by default.

    Returns
    -------
    float
        Normalised entropy, or ``0.0`` when the series has fewer than two
        distinct values (including an empty series).
    """
    probs = series.value_counts(normalize=True)
    n_categories = len(probs)
    # With zero or one category there is no uncertainty. Returning early also
    # avoids log2(0) (a RuntimeWarning) and a division by log2(1) == 0.
    if n_categories < 2:
        return 0.0
    entropy = -np.sum(probs * np.log2(probs))
    return float(entropy / np.log2(n_categories))

In [54]:
# NOTE(review): the initial missing-value report lists 212691 missing values
# (out of 221756 rows) for the columns it shows from this list, so the mode
# branch fills the vast majority of such a column with one value -- confirm
# that this is intended.
cat_cols = [
    "lithium","psych","goitre","hypopituitary",
    "tumor","on_thyroxine","on_antithyroid_meds",
    "i131_treatment","query_hyperthyroid",
    "query_hypothyroid","query_on_thyroxine"
]

for col in cat_cols:
    # Entropy is measured on the observed values only.
    ent = normalized_entropy(df[col].dropna())

    # Dominated distribution (< 0.5): fill with the mode.
    # Otherwise: keep the gap as an explicit "Unknown" category.
    fill_value = df[col].mode()[0] if ent < 0.5 else "Unknown"
    df[col] = df[col].fillna(fill_value)

In [56]:
# Persist the imputed frame; index=False leaves the row index out of the file.
df.to_csv("../data/Unified Dataset/imputed.csv", index=False)

In [62]:
import pandas as pd
# Remove the row limit on displayed output so long value_counts are not truncated.
pd.set_option('display.max_rows', None)

In [66]:
# Columns to inspect after imputation.
cols_to_check = [
    "lithium","psych","goitre","hypopituitary","tumor",
    "on_thyroxine","on_antithyroid_meds","i131_treatment",
    "tt4","t4u","fti",
    "query_hyperthyroid","query_hypothyroid","query_on_thyroxine"
]

for col in cols_to_check:
    print("\n==============================")
    print("Variable:", col)

    # Branch on "is this column numeric?" instead of `dtype == "object"`:
    # string or category dtypes are not "object", and would otherwise fall
    # into the numeric branch, where .mean() fails on text.
    if pd.api.types.is_numeric_dtype(df[col]):
        print("Type: Numeric")
        print("NaN remaining:", df[col].isnull().sum())
        print("Min:", df[col].min())
        print("Max:", df[col].max())
        print("Mean:", df[col].mean())
    else:
        print("Type: Categorical")
        # dropna=False so that any value still missing shows up as its own row.
        print(df[col].value_counts(dropna=False))
  


Variable: lithium
Type: Categorical
lithium
f    221663
t        93
Name: count, dtype: int64

Variable: psych
Type: Categorical
psych
f    221340
t       416
Name: count, dtype: int64

Variable: goitre
Type: Categorical
goitre
f    221676
t        80
Name: count, dtype: int64

Variable: hypopituitary
Type: Categorical
hypopituitary
f    221754
t         2
Name: count, dtype: int64

Variable: tumor
Type: Categorical
tumor
f    221534
t       222
Name: count, dtype: int64

Variable: on_thyroxine
Type: Categorical
on_thyroxine
Unknown    212691
f            7843
t            1222
Name: count, dtype: int64

Variable: on_antithyroid_meds
Type: Categorical
on_antithyroid_meds
f    221644
t       112
Name: count, dtype: int64

Variable: i131_treatment
Type: Categorical
i131_treatment
f    221587
t       169
Name: count, dtype: int64

Variable: tt4
Type: Numeric
NaN remaining: 0
Min: 16.0
Max: 225.7099999999992
Mean: 107.78403199629021

Variable: t4u
Type: Numeric
NaN remaining: 0
Min: 0.17


In [72]:
import pandas as pd

# 1) Target from class (official label)
# Hyperthyroid classes usually include A, B, C, D (adjust if your mapping is different)
hyper_codes = {"A", "B", "C", "D"}

# A label counts as hyperthyroid when any code appears anywhere in its text
# (substring test, not equality).
class_labels = df["class"].astype(str)
df["target_class"] = class_labels.map(
    lambda label: int(any(code in label for code in hyper_codes))
)

# 2) Target from scientific rule (biochemical rule)
# Example clinical thresholds (adjust if needed depending on your lab units)
TSH_LOW = 0.4
T3_HIGH = 2.5
T4_HIGH = 12

# NOTE(review): tsh, t3 and t4 were median-imputed in earlier cells, so the
# rule is evaluated on imputed values as well as on measured ones.
tsh_suppressed = df["tsh"] < TSH_LOW
hormone_elevated = (df["t3"] > T3_HIGH) | (df["t4"] > T4_HIGH)
df["target_rule"] = (tsh_suppressed & hormone_elevated).astype(int)

# 3) Hybrid target (use both at the same time)
# If either class OR rule indicates hyperthyroid -> hyper (1)
flagged_by_class = df["target_class"].eq(1)
flagged_by_rule = df["target_rule"].eq(1)
df["target_hybrid"] = (flagged_by_class | flagged_by_rule).astype(int)

# 4) Quick checks
print("target_class distribution:\n", df["target_class"].value_counts(), "\n")
print("target_rule distribution:\n", df["target_rule"].value_counts(), "\n")
print("target_hybrid distribution:\n", df["target_hybrid"].value_counts(), "\n")

print("Agreement table (class vs rule):")
agreement = pd.crosstab(df["target_class"], df["target_rule"])
print(agreement)

# Optional: drop the original label column to avoid leakage
# df = df.drop(columns=["class"])

target_class distribution:
 target_class
0    221525
1       231
Name: count, dtype: int64 

target_rule distribution:
 target_rule
0    219212
1      2544
Name: count, dtype: int64 

target_hybrid distribution:
 target_hybrid
0    219106
1      2650
Name: count, dtype: int64 

Agreement table (class vs rule):
target_rule        0     1
target_class              
0             219106  2419
1                106   125


In [73]:
# Write the frame, now including the target_* columns, without the row index.
df.to_csv("../data/Unified Dataset/final_dataset_ready.csv", index=False)