In [1]:
# pip install lightgbm imbalanced-learn
# !pip install xgboost catboost

## **NOTE:** install lightgbm, imbalanced-learn, and xgboost python library into your ipynb notebook before running this. Else, you will encounter error.
*(You can uncomment the above py code block ONLY if you haven't installed it)*

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Loading Data
folder_path = os.getcwd()
train = pd.read_csv(os.path.join(folder_path, 'Train_Data.csv'))
test = pd.read_csv(os.path.join(folder_path, 'Test_Data.csv'))
sample_sub = pd.read_csv(os.path.join(folder_path, 'Sample_Submission.csv'))

# Encoding Target
train.dropna(subset=['age_group'], inplace=True)
le = LabelEncoder()
train['age_group_enc'] = le.fit_transform(train['age_group'])  # Adult=0, Senior=1

def engineer_features(df):
    df = df.copy()
    if 'BMXWT' in df.columns and 'BMXHT' in df.columns:
        df['BMI'] = df['BMXWT'] / ((df['BMXHT'] / 100) ** 2 + 1e-5)
    if 'BPXSY1' in df.columns and 'BPXDI1' in df.columns:
        df['PulsePressure'] = df['BPXSY1'] - df['BPXDI1']
    if 'LBXTC' in df.columns and 'LBDHDL' in df.columns:
        df['CholesterolRatio'] = df['LBXTC'] / (df['LBDHDL'] + 1e-5)
    return df

train = engineer_features(train)
test = engineer_features(test)

# Defining X and y
X = train.drop(['SEQN', 'age_group', 'age_group_enc'], axis=1)
y = train['age_group_enc']
X_test = test.drop(['SEQN'], axis=1)

# Drop low-variance columns
X = X.loc[:, X.nunique() > 1]
X_test = X_test[X.columns]

# Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Pipeline with Imbalance Handling
pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('under', RandomUnderSampler(sampling_strategy=0.8, random_state=42)),
    ('smote', SMOTE(sampling_strategy=1.0, random_state=42)),
    ('clf', LGBMClassifier(random_state=42, class_weight='balanced'))
])

# Hyperparameter Space
param_dist = {
    'clf__n_estimators': [100, 200, 300],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__max_depth': [-1, 10, 20],
    'clf__num_leaves': [31, 50, 70],
    'clf__subsample': [0.7, 1.0],
    'clf__colsample_bytree': [0.7, 1.0]
}

# Random Search
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=25,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Train
search.fit(X_train, y_train)

# Evaluate
y_pred_val = search.predict(X_val)
val_acc = accuracy_score(y_val, y_pred_val)
val_f1 = f1_score(y_val, y_pred_val)

print("\n✅ Best Hyperparameters:", search.best_params_)
print("📊 Validation Accuracy:", round(val_acc * 100, 2), "%")
print("📊 Validation F1 Score:", round(val_f1 * 100, 2), "%")
print("\n📋 Classification Report:\n", classification_report(y_val, y_pred_val))

# Train on All Data
final_model = search.best_estimator_
final_model.fit(X, y)

# Predict on Test
test_preds = final_model.predict(X_test)
submission = sample_sub.copy()
submission['age_group'] = test_preds

# Save
output_path = os.path.join(folder_path, f'submission.csv')
submission.to_csv(output_path, index=False)
print(f"✅ Submission saved at: {output_path}")


Fitting 5 folds for each of 25 candidates, totalling 125 fits


[WinError 2] The system cannot find the file specified
  File "D:\Softwares\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "D:\Softwares\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Softwares\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "D:\Softwares\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


[LightGBM] [Info] Number of positive: 313, number of negative: 313
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 548
[LightGBM] [Info] Number of data points in the train set: 626, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

✅ Best Hyperparameters: {'clf__subsample': 0.7, 'clf__num_leaves': 31, 'clf__n_estimators': 100, 'clf__max_depth': 10, 'clf__learning_rate': 0.01, 'clf__colsample_bytree': 1.0}
📊 Validation Accuracy: 70.59 %
📊 Validation F1 Score: 35.75 %

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.74      0.81       328
           1       0.28      0.51      0.36        63

    accuracy                           0.71       391
   macro avg       0