# YN Μέρος Β

## Β1

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import set_random_seed

  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [2]:
# Dataset location and the name of the label column.
CSV_PATH = "alzheimers_disease_data.csv"
TARGET = "Diagnosis"

df = pd.read_csv(CSV_PATH)

# Identifier-like columns are removed when present so they never reach the
# feature matrix; names that are absent from the file are simply skipped.
ID_LIKE_COLS = ["PatientID", "DoctorInCharge", "SubjectID", "MRI_ID", "ID"]
present_id_cols = [id_col for id_col in ID_LIKE_COLS if id_col in df.columns]
df = df.drop(columns=present_id_cols)

# Fail early with a clear message if the label column is missing.
if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found. Update TARGET to your dataset.")

# Labels are cast to int; every remaining column is a candidate feature.
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

# Object / category / bool columns are treated as categorical, the rest as numeric.
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
categorical_lookup = set(cat_cols)
num_cols = [column for column in X.columns if column not in categorical_lookup]

In [3]:
# Hold out 25% of the rows for validation. The split is stratified on the label
# and uses a fixed random_state so the same partition is produced on every run.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)


In [5]:
# Numeric columns are standardised; categorical columns are one-hot encoded
# into a dense array. The transformer names 'num' and 'cat' become the
# prefixes of the output feature names, which later cells rely on.
numeric_pipe = Pipeline(steps=[("scaler", StandardScaler())])
cat_pipe = Pipeline(
    steps=[
        ("ohe", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[("num", numeric_pipe, num_cols), ("cat", cat_pipe, cat_cols)],
    remainder="drop",
)

# Fit on TRAIN only (no leakage), then transform both
X_train_tr = np.asarray(preprocessor.fit_transform(X_train_raw))
X_val_tr = np.asarray(preprocessor.transform(X_val_raw))

# Width of the transformed feature matrix = number of network inputs.
input_dim = X_train_tr.shape[1]

In [6]:
def build_best_model(input_dim):
    """Build and compile the binary classifier used for chromosome evaluation.

    The network has two hidden ReLU layers, each `input_dim` units wide, and a
    single sigmoid output unit. It is compiled with SGD (learning rate 0.1,
    momentum 0.6), binary cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    input_dim : int
        Number of input features; also used as the width of both hidden layers.

    Returns
    -------
    The compiled, untrained Sequential model.
    """
    # A5 best was [I, I] with ReLU + SGD(lr=0.1, m=0.6)
    layer_stack = [
        Input(shape=(input_dim,)),
        Dense(input_dim, activation='relu'),
        Dense(input_dim, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)

    optimizer = SGD(learning_rate=0.1, momentum=0.6)
    network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Seed Python's `random`, NumPy's global RNG and TensorFlow in one call so that
# weight initialisation and batch shuffling -- and therefore every fitness
# value computed from this model below -- are repeatable across
# Restart & Run All. Some GPU kernels may still be non-deterministic.
set_random_seed(42)

model = build_best_model(input_dim)

# Stop once validation loss has not improved for 10 epochs and roll the model
# back to the weights of the best epoch.
early = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
_ = model.fit(
    X_train_tr, y_train,
    validation_data=(X_val_tr, y_val),
    epochs=200, verbose=0, callbacks=[early]
)

In [21]:
def build_column_groups(preprocessor, num_cols, cat_cols):
    """Map each original feature to the transformed column indices it produced.

    The mapping is derived from `preprocessor.get_feature_names_out()`, whose
    entries are expected to look like ``num__<feature>`` for numeric columns and
    ``cat__<feature>`` / ``cat__<feature>_<category>`` for categorical ones.

    Parameters
    ----------
    preprocessor : object
        A fitted transformer exposing `get_feature_names_out()`.
    num_cols : sequence
        Original numeric feature names.
    cat_cols : sequence
        Original categorical feature names.

    Returns
    -------
    column_groups : list of np.ndarray
        One integer index array per original feature (numeric features first,
        then categorical, each in the order given). An array is empty when no
        transformed column could be attributed to the feature.
    feature_names : list
        The original feature names, aligned with `column_groups`.

    Raises
    ------
    RuntimeError
        If `preprocessor` has no `get_feature_names_out` attribute; the
        original AttributeError is chained as the cause.
    """
    try:
        names = preprocessor.get_feature_names_out()
    except AttributeError as exc:
        raise RuntimeError(
            "preprocessor.get_feature_names_out() not available. "
            "Ensure scikit-learn >= 1.0 and that preprocessor is FIT."
        ) from exc
    names = [str(nm) for nm in names]

    column_groups = []
    feature_names = []

    def idx_where(pred):
        return np.array([i for i, nm in enumerate(names) if pred(nm)], dtype=int)

    # Transformed names that belong *exactly* to a listed numeric feature. The
    # loose suffix fallback below must never hand one of these to a different
    # feature (e.g. "BP" must not claim "num__SystolicBP").
    exact_numeric = {f"num__{f}" for f in num_cols}

    for feat in num_cols:
        idx = idx_where(lambda nm, f=feat: nm == f"num__{f}")
        if idx.size == 0:
            idx = idx_where(
                lambda nm, f=feat: nm.startswith("num__")
                and nm.endswith(str(f))
                and nm not in exact_numeric
            )
        column_groups.append(idx)
        feature_names.append(feat)

    # Each one-hot column is attributed to exactly ONE categorical feature: the
    # longest feature name that matches. Plain prefix matching would put
    # "cat__Diet_Quality_high" into both "Diet" and "Diet_Quality", so masking
    # one feature would silently zero the other's columns as well.
    cat_keys_longest_first = sorted({str(f) for f in cat_cols}, key=len, reverse=True)

    def cat_owner(nm):
        for key in cat_keys_longest_first:
            if nm == f"cat__{key}" or nm.startswith(f"cat__{key}_"):
                return key
        return None

    owners = [cat_owner(nm) for nm in names]

    for feat in cat_cols:
        key = str(feat)
        idx = np.array([i for i, owner in enumerate(owners) if owner == key], dtype=int)
        column_groups.append(idx)
        feature_names.append(feat)

    return column_groups, feature_names

In [22]:
# Build the feature -> transformed-column mapping used by the GA and list it,
# so it is visible which transformed columns each gene controls.
column_groups, original_feature_names = build_column_groups(preprocessor, num_cols, cat_cols)
n_features = len(column_groups)
print(f"Original features for GA (after A1 cleaning): {n_features}")

for feature_idx in range(n_features):
    print(f"{original_feature_names[feature_idx]}: transformed cols {column_groups[feature_idx]}")

Original features for GA (after A1 cleaning): 32
Age: transformed cols [0]
Gender: transformed cols [1]
Ethnicity: transformed cols [2]
EducationLevel: transformed cols [3]
BMI: transformed cols [4]
Smoking: transformed cols [5]
AlcoholConsumption: transformed cols [6]
PhysicalActivity: transformed cols [7]
DietQuality: transformed cols [8]
SleepQuality: transformed cols [9]
FamilyHistoryAlzheimers: transformed cols [10]
CardiovascularDisease: transformed cols [11]
Diabetes: transformed cols [12]
Depression: transformed cols [13]
HeadInjury: transformed cols [14]
Hypertension: transformed cols [15]
SystolicBP: transformed cols [16]
DiastolicBP: transformed cols [17]
CholesterolTotal: transformed cols [18]
CholesterolLDL: transformed cols [19]
CholesterolHDL: transformed cols [20]
CholesterolTriglycerides: transformed cols [21]
MMSE: transformed cols [22]
FunctionalAssessment: transformed cols [23]
MemoryComplaints: transformed cols [24]
BehavioralProblems: transformed cols [25]
ADL: tr

### Β1α, Β1β

In [23]:
# Dedicated NumPy generator for the genetic algorithm; the fixed seed makes
# the random chromosomes repeatable.
rng = np.random.default_rng(seed=42)

def create_chromosome(n_features):
    """Draw a random binary chromosome that selects at least one feature.

    Parameters
    ----------
    n_features : int
        Chromosome length (one gene per original feature); must be >= 1.

    Returns
    -------
    np.ndarray
        Integer vector of 0/1 genes of length `n_features` with at least one 1.

    Raises
    ------
    ValueError
        If `n_features` is smaller than 1. Without this guard a length of 0
        would loop forever, because an empty draw always sums to 0.
    """
    if n_features < 1:
        raise ValueError(f"n_features must be >= 1, got {n_features}")

    # binary vector with at least one '1' -- redraw until that holds
    while True:
        ch = rng.integers(0, 2, size=n_features, dtype=int)
        if ch.sum() > 0:
            return ch

def initialize_population(pop_size, n_features):
    """Create the initial GA population.

    Draws `pop_size` chromosomes with `create_chromosome(n_features)` and
    stacks them row-wise, so the result has one row per individual and one
    column per gene.
    """
    individuals = []
    for _ in range(pop_size):
        individuals.append(create_chromosome(n_features))
    return np.vstack(individuals)

### Β1γ

In [24]:
def apply_feature_mask(X_tr, chromosome, column_groups):
    """Return a copy of `X_tr` with the columns of deselected features zeroed.

    Genes and column groups are paired positionally. For every gene equal to 0
    whose group is non-empty, all transformed columns in that group are set to
    0.0. The input array itself is left untouched.
    """
    masked = X_tr.copy()

    # Gather the column indices of every switched-off feature, then zero them
    # with a single indexed assignment.
    dropped = [
        group
        for gene, group in zip(chromosome, column_groups)
        if gene == 0 and group.size > 0
    ]
    if dropped:
        masked[:, np.concatenate(dropped)] = 0.0
    return masked

def evaluate_chromosome(model, X_val_tr, y_val, chromosome, column_groups):
    """Score a feature subset with the already-trained model.

    The columns of deselected features are zeroed in `X_val_tr`, the model
    predicts on the masked data, and the predictions are compared with `y_val`.

    Returns
    -------
    (ce, acc) : tuple
        Cross-entropy (log loss) and accuracy at a 0.5 threshold. A chromosome
        that selects nothing returns `(np.inf, 0.0)` without calling the model.
    """
    # An all-zero chromosome selects no features: report the worst score.
    if chromosome.sum() == 0:
        return np.inf, 0.0

    masked_inputs = apply_feature_mask(X_val_tr, chromosome, column_groups)
    probabilities = model.predict(masked_inputs, verbose=0).ravel()

    cross_entropy = log_loss(y_val, probabilities, labels=[0, 1])
    predicted_labels = (probabilities >= 0.5).astype(int)
    accuracy = accuracy_score(y_val, predicted_labels)
    return cross_entropy, accuracy

def chromosome_fitness(chromosome, lam=0.05):
    """Compute the penalised fitness of a chromosome.

    The fitness is the cross-entropy from `evaluate_chromosome` plus `lam`
    times the fraction of selected features (k / n). The evaluation uses the
    notebook-level `model`, `X_val_tr`, `y_val` and `column_groups`.

    Returns
    -------
    (fitness, info) : tuple
        The fitness value and a dict with keys "CE", "Acc" and "k".
    """
    cross_entropy, accuracy = evaluate_chromosome(
        model, X_val_tr, y_val, chromosome, column_groups
    )
    n_selected = int(chromosome.sum())
    selected_fraction = n_selected / len(chromosome)
    penalised = cross_entropy + lam * selected_fraction
    return penalised, {"CE": cross_entropy, "Acc": accuracy, "k": n_selected}

In [25]:
# Sanity check: score one random chromosome and print its rounded metrics.
demo_ch = create_chromosome(n_features)
fit_val, meta = chromosome_fitness(demo_ch, lam=0.05)

demo_summary = {
    "fitness": round(fit_val, 4),
    "CE": round(meta["CE"], 4),
    "Acc": round(meta["Acc"], 4),
    "k": meta["k"],
}
print("Demo chromosome:", demo_summary)

Demo chromosome: {'fitness': 0.6436, 'CE': 0.6155, 'Acc': 0.7268, 'k': 18}
