In [39]:
import pandas as pd
import numpy as np
from scipy.stats import entropy as scipy_entropy

# ============================================================
# Load the data
# ============================================================

# Read the unified dataset that is the input of the imputation steps below.
df = pd.read_csv(
    '../data/Unified Dataset/unified_dataset_ready_for_imputation.csv',
    low_memory=False,
)
print("Shape original:", df.shape)

# Null count per column, shown only for the columns that actually have gaps.
missing_per_column = df.isnull().sum()
print("\nMissing values avant imputation:")
print(missing_per_column[missing_per_column > 0])

Shape original: (221756, 35)

Missing values avant imputation:
age                         4
sex                       303
country                  9065
ethnicity                9065
family_history           9065
iodine_deficiency        9065
diabetes                 9065
obesity                  9065
lithium                212691
psych                  212691
goitre                 212691
hypopituitary          212691
tumor                  212691
nodule_size              9065
diagnosis                9065
on_thyroxine           212691
on_antithyroid_meds    212691
i131_treatment         212691
tsh                       832
t3                       2578
tt4                    213130
t4                       9065
t4u                    213497
fti                    213490
tsh_measured           212691
t3_measured            212691
tt4_measured           212691
t4u_measured           212691
fti_measured           212691
query_hyperthyroid     212691
query_hypothyroid      212691
query_o

In [42]:
# ============================================================
# Step 1: drop the columns that are not useful
# ============================================================
# errors='ignore' makes the drop a no-op when the column is already absent,
# so the cell can be re-run on the same frame.
columns_to_drop = ['on_antithyroid_meds']
df = df.drop(columns=columns_to_drop, errors='ignore')
print("✅ تم حذف on_antithyroid_meds")
print("Shape:", df.shape)

✅ تم حذف on_antithyroid_meds
Shape: (221756, 34)


In [43]:
# ============================================================
# Step 2: Rule-Based Imputation (Laboratory Indicators)
# ============================================================
# Maps each "<lab>_measured" flag column to the column holding the lab value.
pairs = {
    'tsh_measured': 'tsh',
    't3_measured':  't3',
    'tt4_measured': 'tt4',
    't4u_measured': 't4u',
    'fti_measured': 'fti'
}

for flag_col, lab_col in pairs.items():
    # Skip pairs that are not fully present in the frame.
    if flag_col not in df.columns or lab_col not in df.columns:
        continue

    flag_missing = df[flag_col].isnull()
    lab_present = df[lab_col].notnull()

    # A missing flag becomes 't' when the lab value exists and 'f' when it
    # does not; flags that already had a value are left untouched.
    df[flag_col] = (
        df[flag_col]
        .mask(flag_missing & lab_present, 't')
        .mask(flag_missing & ~lab_present, 'f')
    )

print("✅ Rule-Based Imputation تمت")
print("Missing في الـ indicators:", df[list(pairs)].isnull().sum().sum())

✅ Rule-Based Imputation تمت
Missing في الـ indicators: 0


In [44]:
# ============================================================
# Step 3: Numerical Imputation
# ============================================================

# Columns treated as symmetric (skewness ≈ 0): filled with the column mean.
num_mean = ['age', 'nodule_size', 'tsh', 't3', 't4']

# Columns treated as skewed (skewness > 0.5): filled with the column median.
num_median = ['tt4', 't4u', 'fti']

# Walk the mean-columns first and the median-columns second so the log lines
# come out in the same order as the two lists.
for col in num_mean + num_median:
    if col not in df.columns:
        continue
    strategy = 'mean' if col in num_mean else 'median'
    fill_value = df[col].mean() if strategy == 'mean' else df[col].median()
    df[col] = df[col].fillna(fill_value)
    print(f"  {col} => {strategy}")

print("✅ Numerical Imputation تمت")
print("Missing:", df[num_mean + num_median].isnull().sum().sum())

  age => mean
  nodule_size => mean
  tsh => mean
  t3 => mean
  t4 => mean
  tt4 => median
  t4u => median
  fti => median
✅ Numerical Imputation تمت
Missing: 0


In [45]:
# ============================================================
# Step 4: Categorical Imputation (Entropy-Based)
# ============================================================
# Columns whose value distribution has entropy below this threshold are filled
# with their mode; the others get an explicit 'Unknown' category.
ENTROPY_THRESHOLD = 0.5

cat_cols = [
    'sex', 'family_history', 'iodine_deficiency', 'diabetes', 'obesity',
    'lithium', 'psych', 'goitre', 'hypopituitary', 'tumor',
    'on_thyroxine', 'i131_treatment', 'tsh_measured', 't3_measured',
    'tt4_measured', 't4u_measured', 'fti_measured',
    'query_hyperthyroid', 'query_hypothyroid', 'query_on_thyroxine'
]

for col in cat_cols:
    # Only columns that exist and still contain nulls need any work.
    if col in df.columns and df[col].isnull().any():
        # Entropy of the observed (non-null) relative frequencies.
        frequencies = df[col].value_counts(normalize=True)
        col_entropy = scipy_entropy(frequencies)

        if col_entropy < ENTROPY_THRESHOLD:
            fill_value, label = df[col].mode()[0], "mode"
        else:
            fill_value, label = 'Unknown', "'Unknown'"

        df[col] = df[col].fillna(fill_value)
        print(f"  {col} => {label} (entropy={col_entropy:.3f})")

# country, ethnicity and diagnosis always get 'Unknown', whatever their entropy.
unknown_only_cols = ['country', 'ethnicity', 'diagnosis']
for col in unknown_only_cols:
    if col not in df.columns:
        continue
    df[col] = df[col].fillna('Unknown')
    print(f"  {col} => 'Unknown'")

print("✅ Categorical Imputation تمت")

  sex => 'Unknown' (entropy=0.672)
  family_history => 'Unknown' (entropy=0.611)
  iodine_deficiency => 'Unknown' (entropy=0.562)
  diabetes => 'Unknown' (entropy=0.501)
  obesity => 'Unknown' (entropy=0.611)
  lithium => mode (entropy=0.057)
  psych => mode (entropy=0.186)
  goitre => mode (entropy=0.051)
  hypopituitary => mode (entropy=0.002)
  tumor => mode (entropy=0.115)
  on_thyroxine => mode (entropy=0.395)
  i131_treatment => mode (entropy=0.093)
  query_hyperthyroid => mode (entropy=0.251)
  query_hypothyroid => mode (entropy=0.252)
  query_on_thyroxine => mode (entropy=0.084)
  country => 'Unknown'
  ethnicity => 'Unknown'
  diagnosis => 'Unknown'
✅ Categorical Imputation تمت


In [46]:
# ============================================================
# Final verification
# ============================================================
# Count every remaining null cell across the whole frame in one pass over the
# boolean null-mask.
total_missing = df.isna().to_numpy().sum()
print(f"\n✅ Total Missing Values بعد الـ Imputation: {total_missing}")
print("Shape final:", df.shape)


✅ Total Missing Values بعد الـ Imputation: 0
Shape final: (221756, 34)


In [48]:
# ============================================================
# Save the cleaned data
# ============================================================
# Write the imputed frame next to the input file, without the index column.
imputed_csv_path = '../data/Unified Dataset/unified_dataset_imputed.csv'
df.to_csv(imputed_csv_path, index=False)
print("✅ تم الحفظ في unified_dataset_imputed.csv")

✅ تم الحفظ في unified_dataset_imputed.csv


In [49]:
# Summary statistics of three lab columns as they stand in the current frame.
lab_columns = ["tsh", "t3", "tt4"]
df[lab_columns].describe()

Unnamed: 0,tsh,t3,tt4
count,221756.0,221756.0,221756.0
mean,4.942669,1.999037,104.147194
std,2.894838,0.855772,6.863783
min,0.12,0.52,16.0
25%,2.4,1.27,104.0
50%,4.92,1.999037,104.0
75%,7.45,2.73,104.0
max,9.93,3.47,225.71


In [50]:
# ============================================================
# Hybrid Rule-Based Target Definition
# Hyperthyroid (1) vs Non-Hyperthyroid (0)
# ============================================================

# Values of the `class` column that are labelled as hyperthyroid.
hyper_codes = ['A', 'B', 'C', 'D', 'AK', 'GK', 'MK', 'FK', 
               'KJ', 'GKJ', 'C|I', 'D|R']

def hybrid_target(row, tsh_low_below=0.4, t3_high_above=2.0,
                  tt4_high_above=120, fti_high_above=120):
    """Label one record as hyperthyroid (1) or non-hyperthyroid (0).

    The label combines the recorded diagnosis code with lab-based rules:

    1. If ``row['class']`` is one of ``hyper_codes`` the result is 1.
    2. Otherwise four lab flags are evaluated (a missing value never raises
       a flag) and summed into a lab score:
       ``tsh < tsh_low_below``, ``t3 > t3_high_above``,
       ``tt4 > tt4_high_above`` and ``fti > fti_high_above``.
    3. The result is 1 when any of the following holds, else 0:
       - ``query_hyperthyroid`` is 't' (case/whitespace-insensitive) and the
         lab score is at least 1;
       - the TSH flag is raised and the lab score is at least 2;
       - the lab score is at least 3.

    Parameters
    ----------
    row : mapping-like (e.g. one ``pandas.Series`` row from ``DataFrame.apply``)
        Must provide ``'class'``, ``'tsh'``, ``'t3'``, ``'tt4'`` and ``'fti'``
        via ``row[...]`` and support ``row.get``. ``'query_hyperthyroid'`` is
        optional and is treated as ``'f'`` when absent.
    tsh_low_below, t3_high_above, tt4_high_above, fti_high_above : number
        Cut-offs for the four lab flags (all comparisons are strict). The
        defaults are the values the rules were originally written with.

    Returns
    -------
    int
        1 for hyperthyroid, 0 otherwise.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``row``.
    """
    # Rule 1: the diagnosis code itself says hyperthyroid => 1 directly.
    if row['class'] in hyper_codes:
        return 1

    # Rule 2: query_hyperthyroid = t => suspected case.
    query_hyper = str(row.get('query_hyperthyroid', 'f')).strip().lower() == 't'

    # Rule 3: very low TSH (commented in the original as the strongest indicator).
    tsh_low = bool(pd.notna(row['tsh']) and row['tsh'] < tsh_low_below)

    # Rules 4-6: elevated T3 / TT4 / FTI.
    t3_high = bool(pd.notna(row['t3']) and row['t3'] > t3_high_above)
    tt4_high = bool(pd.notna(row['tt4']) and row['tt4'] > tt4_high_above)
    fti_high = bool(pd.notna(row['fti']) and row['fti'] > fti_high_above)

    # Lab score: number of lab flags raised (0-4).
    lab_score = tsh_low + t3_high + tt4_high + fti_high

    # Final decision: the weaker the supporting evidence, the more lab flags
    # are required.
    is_hyper = (
        (query_hyper and lab_score >= 1)
        or (tsh_low and lab_score >= 2)
        or lab_score >= 3
    )
    return int(is_hyper)

# Label every row, then report how the two classes are balanced.
df['target'] = df.apply(hybrid_target, axis=1)

target_counts = df['target'].value_counts()
n_hyper = df['target'].sum()
n_non_hyper = (df['target'] == 0).sum()
hyper_share = df['target'].mean() * 100

print("✅ توزيع الـ target:")
print(target_counts)
print(f"\nHyperthyroid  (1): {n_hyper:,}")
print(f"Non-Hyperthyroid (0): {n_non_hyper:,}")
print(f"نسبة الـ Hyperthyroid: {hyper_share:.2f}%")

✅ توزيع الـ target:
target
0    216521
1      5235
Name: count, dtype: int64

Hyperthyroid  (1): 5,235
Non-Hyperthyroid (0): 216,521
نسبة الـ Hyperthyroid: 2.36%


In [51]:
import pandas as pd

# NOTE(review): this rebinds `df` to the raw, un-imputed file, so the imputed
# frame (and the `target` column) built in the cells above is no longer
# reachable through `df` after this cell runs — confirm that is intended.
df = pd.read_csv(
    '../data/Unified Dataset/unified_dataset_ready_for_imputation.csv',
    low_memory=False,
)

# Selected variables
selected_cols = [
    'age', 'sex', 'tsh', 't3', 'tt4', 't4', 'fti',
    'on_thyroxine', 'i131_treatment', 'query_hyperthyroid',
    'family_history', 'iodine_deficiency', 'goitre', 'tumor', 'diabetes'
]

df_selected = df[selected_cols]

# Each report section is a heading followed by the object to print under it.
report_sections = [
    ("=== Variables & Types ===\n", df_selected.dtypes),
    ("\n=== Sample Values ===\n", df_selected.head(3).to_string()),
    ("\n=== Missing Values ===\n", df_selected.isnull().sum()),
]
for heading, payload in report_sections:
    print(heading)
    print(payload)

=== Variables & Types ===

age                   float64
sex                    object
tsh                   float64
t3                    float64
tt4                   float64
t4                    float64
fti                   float64
on_thyroxine           object
i131_treatment         object
query_hyperthyroid     object
family_history         object
iodine_deficiency      object
goitre                 object
tumor                  object
diabetes               object
dtype: object

=== Sample Values ===

    age sex  tsh   t3    tt4  t4  fti on_thyroxine i131_treatment query_hyperthyroid family_history iodine_deficiency goitre tumor diabetes
0  29.0   F  0.3  NaN    NaN NaN  NaN            f              f                  f            NaN               NaN      f     f      NaN
1  29.0   F  1.6  1.9  128.0 NaN  NaN            f              f                  f            NaN               NaN      f     f      NaN
2  41.0   F  NaN  NaN    NaN NaN  NaN            f              f