In [49]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location of the cleaned dataset. Defined here for later use; this cell does
# not read from it.
data_set_path = '../data/health_lifestyle_dataset_cleaned.csv'

# The exploration frame is loaded from the CSV without the "_cleaned" suffix.
raw_csv_path = '../data/health_lifestyle_dataset.csv'
df = pd.read_csv(raw_csv_path)



In [22]:
# Print the frame's column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 100000 non-null  int64  
 1   age                100000 non-null  int64  
 2   gender             100000 non-null  object 
 3   bmi                100000 non-null  float64
 4   daily_steps        100000 non-null  int64  
 5   sleep_hours        100000 non-null  float64
 6   water_intake_l     100000 non-null  float64
 7   calories_consumed  100000 non-null  int64  
 8   smoker             100000 non-null  int64  
 9   alcohol            100000 non-null  int64  
 10  resting_hr         100000 non-null  int64  
 11  systolic_bp        100000 non-null  int64  
 12  diastolic_bp       100000 non-null  int64  
 13  cholesterol        100000 non-null  int64  
 14  family_history     100000 non-null  int64  
 15  disease_risk       100000 non-null  int64  
dtypes: 

In [23]:
# A notebook cell only renders its last expression, so the first two previews
# are sent to display() explicitly; as bare expressions their results would be
# computed and silently discarded.
display(df.head())
display(df.describe())

# Per-column count of missing values (rendered as the cell's output).
df.isnull().sum()

id                   0
age                  0
gender               0
bmi                  0
daily_steps          0
sleep_hours          0
water_intake_l       0
calories_consumed    0
smoker               0
alcohol              0
resting_hr           0
systolic_bp          0
diastolic_bp         0
cholesterol          0
family_history       0
disease_risk         0
dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [43]:


def preprocess_data(df, target):
    """
    Separate a frame into a feature table and a target column.

    Parameters
    ----------
    df : DataFrame holding both the features and the target.
    target : name of the column to predict.

    Returns
    -------
    (X, y) where X is `df` without the target column and y is that column.

    Raises
    ------
    ValueError
        If `target` is not one of the columns of `df`.
    """
    if target in df.columns:
        features = df.drop(columns=[target])
        labels = df[target]
        return features, labels

    raise ValueError(f"Target column '{target}' not found in dataset.")


In [None]:

def preprocess(X: pd.DataFrame, y: pd.Series):
    """
    Make a stratified 80/20 train/test split and standardize the features.

    The scaler is fitted on the training portion only and then applied to
    both portions, so no statistics from the test rows reach the scaler.

    Returns
    -------
    X_train_scaled, X_test_scaled, y_train, y_test, scaler
    """
    split_options = dict(
        test_size=0.2,
        stratify=y,        # keep the class proportions equal in both splits
        random_state=42,
    )
    features_train, features_test, y_train, y_test = train_test_split(
        X, y, **split_options
    )

    # Fit once on the training rows, then reuse the fitted scaler for both sets.
    scaler = StandardScaler().fit(features_train)
    X_train_scaled = scaler.transform(features_train)
    X_test_scaled = scaler.transform(features_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler


In [50]:
def train_mlp(
    X_train: np.ndarray,
    y_train: np.ndarray,
    hidden_layer_sizes=(64, 32),
    learning_rate_init=1e-3,
    batch_size=128,
    max_iter=80
):
    """
    Train a Multilayer Perceptron using Scikit-learn and time the fit.

    Parameters
    ----------
    X_train : training features passed to ``MLPClassifier.fit``.
    y_train : training labels passed to ``MLPClassifier.fit``.
    hidden_layer_sizes : sizes of the hidden layers, default (64, 32).
    learning_rate_init : initial learning rate, default 1e-3.
    batch_size : minibatch size, default 128.
    max_iter : upper bound on training iterations, default 80.

    Returns
    -------
    model, duration
        The fitted classifier and the fit duration in seconds.

    Justifications (for defense):
    - Hidden layers : (64, 32) → ≤ 100 neurons each.
    - ReLU activation → mitigates vanishing gradients.
    - Adam solver → robust for medium/large tabular datasets.
    - Early stopping → avoids overfitting.
    - Note: ``learning_rate="adaptive"`` is passed but scikit-learn only uses
      that schedule when ``solver="sgd"``; with Adam the step size is driven
      by ``learning_rate_init``.
    """

    model = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation="relu",
        solver="adam",
        batch_size=batch_size,
        learning_rate_init=learning_rate_init,
        learning_rate="adaptive",  # no effect with solver="adam" (sgd-only option)
        early_stopping=True,
        max_iter=max_iter,
        random_state=42,
        verbose=True
    )

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    duration = time.perf_counter() - start

    print(f"\n Training time: {duration:.2f} seconds")

    return model, duration



In [51]:
def evaluate(model, X_train, y_train, X_test, y_test):
    """
    Print evaluation metrics for a fitted classifier.

    Accuracy is reported for both the training and the test split; the
    classification report and confusion matrix are reported for the test
    split only. Nothing is returned.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Accuracy for each split, printed under its own banner.
    accuracy_sections = (
        ("\n========== TRAIN METRICS ==========", y_train, train_predictions),
        ("\n========== TEST METRICS ==========", y_test, test_predictions),
    )
    for banner, truth, predicted in accuracy_sections:
        print(banner)
        print("Accuracy:", accuracy_score(truth, predicted))

    # Detailed test-set diagnostics.
    test_diagnostics = (
        ("\nClassification Report:\n", classification_report),
        ("\nConfusion Matrix:\n", confusion_matrix),
    )
    for title, metric in test_diagnostics:
        print(title)
        print(metric(y_test, test_predictions))



In [52]:
def main():
    """
    Run the whole pipeline: load the cleaned CSV, split and scale it, train
    the MLP and print the evaluation metrics.

    The dataset is read from the notebook-level ``data_set_path`` and the
    target is the ``disease_risk`` column. Returns nothing; results are printed.

    NOTE(review): the scaling step needs numeric features. The raw frame shown
    by ``df.info()`` contains a string ``gender`` column and an ``id`` column;
    this assumes the cleaned CSV has already encoded/removed them — confirm.
    """
    target_column = "disease_risk"

    # (1) Load dataset and split it into features / target
    cleaned_df = pd.read_csv(data_set_path)
    X, y = preprocess_data(cleaned_df, target_column)

    print(f"Dataset loaded: {X.shape[0]} samples, {X.shape[1]} features.")

    # (2) Preprocess (the fitted scaler is not needed further down)
    X_train, X_test, y_train, y_test, _ = preprocess(X, y)

    # (3) Train Scikit-learn MLP (the training time is already printed by train_mlp)
    model, _ = train_mlp(
        X_train,
        y_train,
        hidden_layer_sizes=(64, 32),
        learning_rate_init=1e-3,
        batch_size=256,
        max_iter=100
    )

    # (4) Evaluate
    evaluate(model, X_train, y_train, X_test, y_test)


