- make this a py file
- merge with other files and incorporate into entire framework


In [None]:
print("HELLO ALL")

THIS IS A TEST


In [1]:
# Set global random seed variable for reproducibility
SEED = 42
import random
import numpy as np
random.seed(SEED)
np.random.seed(SEED)

In [2]:
import pandas as pd

# Scikit-learn utilities
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

# Base models
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC

# Additional diverse models for tabular data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# For randomness in hyperparameter distributions and for ensemble uncertainty
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from scipy.stats import mode

In [3]:
# ------------------------------------------------------------------------------
# 1. Load pre-split datasets
# ------------------------------------------------------------------------------
# We assume you have three CSV files with the final splits:
#   - train_models_final.csv
#   - val_models_final.csv
#   - test_models_final.csv
#
# Each CSV contains many columns including text fields, metadata, etc.
# We will use only the following features (see below) and the target column "label_label".

train_df = pd.read_csv("train_models_final.csv")
val_df   = pd.read_csv("val_models_final.csv")
test_df  = pd.read_csv("test_models_final.csv")

In [6]:
# ------------------------------------------------------------------------------
# 2. Define feature lists and target
# ------------------------------------------------------------------------------
# Provided lists of features
categorical_features = [
    "gender", "country", "country_code", "nationality", "country_of_domicile",
    "marital_status", "inheritance_details_relationship", "investment_risk_profile",
    "investment_horizon", "investment_experience", "type_of_mandate", "currency"
]

numerical_features = [
    "aum_savings", "aum_inheritance", "aum_real_estate_value"
]

# Define the target column (assumed to be in "label_label")
target_col = "label_label"

# Convert target to binary: 1 if "Reject", 0 if "Accept" (adjust as needed)
train_df[target_col] = (train_df[target_col] == "Reject").astype(int)
val_df[target_col]   = (val_df[target_col]   == "Reject").astype(int)
test_df[target_col]  = (test_df[target_col]  == "Reject").astype(int)

# For modeling, select only the features we want
feature_columns = numerical_features + categorical_features

X_train = train_df[feature_columns]
y_train = train_df[target_col]

X_val = val_df[feature_columns]
y_val = val_df[target_col]

X_test = test_df[feature_columns]
y_test = test_df[target_col]

In [7]:
# ------------------------------------------------------------------------------
# 3. Build preprocessing pipelines for numerical and categorical features
# ------------------------------------------------------------------------------
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot",  OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

In [8]:
# ------------------------------------------------------------------------------
# 4. Define candidate models (diverse and high-performing on tabular data)
# ------------------------------------------------------------------------------
models = {
    "RandomForest":    RandomForestClassifier(random_state=SEED),
    "LightGBM":        LGBMClassifier(random_state=SEED),
    "Lasso":           LogisticRegression(penalty='l1', solver='liblinear', random_state=SEED),
    "Ridge":           RidgeClassifier(random_state=SEED),
    "SVM":             SVC(probability=True, random_state=SEED),
    "KNN":             KNeighborsClassifier(),
    "GradientBoosting":GradientBoostingClassifier(random_state=SEED),
    "AdaBoost":        AdaBoostClassifier(random_state=SEED),
    "ExtraTrees":      ExtraTreesClassifier(random_state=SEED),
    "XGBoost":         XGBClassifier(random_state=SEED, use_label_encoder=False, eval_metric='logloss')
}

In [10]:
# ------------------------------------------------------------------------------
# 5. Define hyperparameter distributions for RandomizedSearchCV for each model
# ------------------------------------------------------------------------------
param_distributions = {
    "RandomForest": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__max_depth":    sp_randint(2, 15),
        "classifier__min_samples_split": sp_randint(2, 10),
    },
    "LightGBM": {
        "classifier__num_leaves": sp_randint(20, 50),
        "classifier__learning_rate": uniform(0.01, 0.3),
        "classifier__max_depth": sp_randint(2, 15),
    },
    "Lasso": {
        "classifier__C": uniform(0.001, 10.0),
    },
    "Ridge": {
        "classifier__alpha": uniform(0.001, 10.0),
    },
    "SVM": {
        "classifier__C": uniform(0.001, 10.0),
        "classifier__kernel": ["linear", "rbf"],
    },
    "KNN": {
        "classifier__n_neighbors": sp_randint(3, 20),
        "classifier__weights": ["uniform", "distance"],
        "classifier__p": [1, 2]
    },
    "GradientBoosting": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__learning_rate": uniform(0.01, 0.3),
        "classifier__max_depth": sp_randint(2, 10),
    },
    "AdaBoost": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__learning_rate": uniform(0.01, 1.0),
    },
    "ExtraTrees": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__max_depth": sp_randint(2, 15),
        "classifier__min_samples_split": sp_randint(2, 10),
    },
    "XGBoost": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__learning_rate": uniform(0.01, 0.3),
        "classifier__max_depth": sp_randint(2, 10),
    }
}

# Use K-fold cross-validation on the training set
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [11]:
# ------------------------------------------------------------------------------
# 6. Train models with hyperparameter search and evaluate on validation set
# ------------------------------------------------------------------------------
results = []

for model_name, model in models.items():
    print(f"\n\n=== Model: {model_name} ===")
    
    # Build pipeline: preprocessing + model
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    
    # Get hyperparameter distribution for this model
    search_params = param_distributions.get(model_name, {})
    
    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=search_params,
        n_iter=10,  # number of parameter settings sampled
        scoring="accuracy",
        cv=kfold,
        random_state=SEED,
        verbose=1,
        n_jobs=-1  # use all available cores
    )
    
    random_search.fit(X_train, y_train)
    
    best_estimator = random_search.best_estimator_
    print(f"Best params: {random_search.best_params_}")
    print(f"CV best score (train set): {random_search.best_score_:.4f}")
    
    # Evaluate on validation set
    y_val_pred = best_estimator.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    cm_val = confusion_matrix(y_val, y_val_pred)
    
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print("Validation Confusion Matrix:")
    print(cm_val)
    
    results.append({
        "model": model_name,
        "best_params": random_search.best_params_,
        "cv_best_score": random_search.best_score_,
        "val_accuracy": val_accuracy,
        "val_confusion_matrix": cm_val
    })



=== Model: RandomForest ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__max_depth': 8, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 142}
CV best score (train set): 0.5674
Validation Accuracy: 0.5680
Validation Confusion Matrix:
[[246  14]
 [202  38]]


=== Model: LightGBM ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




[LightGBM] [Info] Number of positive: 3615, number of negative: 3585[LightGBM] [Info] Number of positive: 3560, number of negative: 3640

[LightGBM] [Info] Number of positive: 3553, number of negative: 3647
[LightGBM] [Info] Number of positive: 3600, number of negative: 3600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Total Bins 903[LightGBM] [Info] Total Bins 905

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `fo



[LightGBM] [Info] Number of positive: 3560, number of negative: 3640




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73

[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Number of positive: 3560, number of negative: 3640

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Number of positive: 3615, number of negative: 3585

[LightGBM] [Info] Number of positive: 3553, number of negative: 3647
[LightGBM] [Info] Auto-choos



[LightGBM] [Info] Number of positive: 3600, number of negative: 3600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 899
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3560, number of negative: 3640




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223

[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Number of positive: 3553, number of negative: 3647
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0



[LightGBM] [Info] Number of positive: 3615, number of negative: 3585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502083 -> initscore=0.008333
[LightGBM] [Info] Start training from score 0.008333
[LightGBM] [Info] Number of positive: 3600, number of negative: 3600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 899
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 70
[LightGBM] [Info] [binary:Bo



[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223

[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.055078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM







[LightGBM] [Info] Number of positive: 3600, number of negative: 3600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 899
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3560, number of negative: 3640




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.077487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Inf



[LightGBM] [Info] Number of positive: 3553, number of negative: 3647
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003024 seconds.
You can set `force_row_wise=true` to remove the overhead.

[LightGBM] [Info] Number of positive: 3615, number of negative: 3585
[LightGBM] [Info] Total Bins 903
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 72
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493472 -> initscore=-0.026113
[LightGBM] [Info] Start training from score -0.026113

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004935 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 71

[LightGBM] [Info] Start training from score 0.008333
[LightGBM] [Info] Number of positi




[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223




[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Number of positive: 3553, number of negative: 3647
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018737 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 903
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 72
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493472 -> initscore=-0.026113
[LightGBM]



[LightGBM] [Info] Number of positive: 3600, number of negative: 3600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005335 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 899
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Number of positive: 3615, number of negative: 3585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.052955 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502083 -> initscore=0.008333
[LightGBM] [Info



[LightGBM] [Info] Number of positive: 3600, number of negative: 3600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 899
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001935 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] Nu



[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493472 -> initscore=-0.026113
[LightGBM] [Info] Start training from score -0.026113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037937 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 901
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502083 -> initscore=0.008333
[LightGBM] [Info] Start training from score 0.008333
[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Number of positive: 3600, number of negative: 3600




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 905
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 73
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494444 -> initscore=-0.022223
[LightGBM] [Info] Total Bins 899
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 70
[LightGBM] [Info] Start training from score -0.022223
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 3560, number of negative: 3640
[LightGBM] [Info] Number of positive: 3615, number of negative: 3585
[LightGBM] 








[LightGBM] [Info] Number of positive: 4472, number of negative: 4528
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 909
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496889 -> initscore=-0.012445
[LightGBM] [Info] Start training from score -0.012445




Best params: {'classifier__learning_rate': np.float64(0.1012726728878613), 'classifier__max_depth': 7, 'classifier__num_leaves': 48}
CV best score (train set): 0.5650
Validation Accuracy: 0.5720
Validation Confusion Matrix:
[[171  89]
 [125 115]]


=== Model: Lasso ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__C': np.float64(3.746401188473625)}
CV best score (train set): 0.5828
Validation Accuracy: 0.5780
Validation Confusion Matrix:
[[234  26]
 [185  55]]


=== Model: Ridge ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__alpha': np.float64(0.5818361216819946)}
CV best score (train set): 0.5834
Validation Accuracy: 0.5840
Validation Confusion Matrix:
[[240  20]
 [188  52]]


=== Model: SVM ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__C': np.float64(5.987584841970366), 'classifier__kernel': 'linear'}
CV best score (train set): 0.5873




Validation Accuracy: 0.5900
Validation Confusion Matrix:
[[260   0]
 [205  35]]


=== Model: KNN ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__n_neighbors': 14, 'classifier__p': 1, 'classifier__weights': 'uniform'}
CV best score (train set): 0.5148
Validation Accuracy: 0.5440
Validation Confusion Matrix:
[[191  69]
 [159  81]]


=== Model: GradientBoosting ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__learning_rate': np.float64(0.14777466758976016), 'classifier__max_depth': 6, 'classifier__n_estimators': 149}
CV best score (train set): 0.5836
Validation Accuracy: 0.5820
Validation Confusion Matrix:
[[202  58]
 [151  89]]


=== Model: AdaBoost ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__learning_rate': np.float64(0.6086584841970366), 'classifier__n_estimators': 152}
CV best score (train set): 0.5167
Validation Accuracy: 0.5260
Validation Confusion Matrix:
[[260   0]
 [237   3]]


=== Model: ExtraTrees ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'classifier__max_depth': 12, 'classifier__min_samples_split': 9, 'classifier__n_estimators': 166}
CV best score (train set): 0.5716
Validation Accuracy: 0.5780
Validation Confusion Matrix:
[[258   2]
 [209  31]]


=== Model: XGBoost ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best params: {'classifier__learning_rate': np.float64(0.12236203565420874), 'classifier__max_depth': 6, 'classifier__n_estimators': 64}
CV best score (train set): 0.5790
Validation Accuracy: 0.5520
Validation Confusion Matrix:
[[221  39]
 [185  55]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:
# ------------------------------------------------------------------------------
# 7. Summarize validation results
# ------------------------------------------------------------------------------
results_df = pd.DataFrame(results)
print("\n=== Validation Results Summary ===")
print(results_df[["model", "cv_best_score", "val_accuracy", "best_params"]])


=== Validation Results Summary ===
              model  cv_best_score  val_accuracy  \
0      RandomForest       0.567444         0.568   
1          LightGBM       0.565000         0.572   
2             Lasso       0.582778         0.578   
3             Ridge       0.583444         0.584   
4               SVM       0.587333         0.590   
5               KNN       0.514778         0.544   
6  GradientBoosting       0.583556         0.582   
7          AdaBoost       0.516667         0.526   
8        ExtraTrees       0.571556         0.578   
9           XGBoost       0.579000         0.552   

                                         best_params  
0  {'classifier__max_depth': 8, 'classifier__min_...  
1  {'classifier__learning_rate': 0.10127267288786...  
2               {'classifier__C': 3.746401188473625}  
3          {'classifier__alpha': 0.5818361216819946}  
4  {'classifier__C': 5.987584841970366, 'classifi...  
5  {'classifier__n_neighbors': 14, 'classifier__p...  
6  {'c

In [13]:
# ------------------------------------------------------------------------------
# 8. Final evaluation on the test set for each model
# ------------------------------------------------------------------------------
print("\n=== Test Evaluation ===")

# Store final pipelines in a dictionary for ensembling later
final_models = {}

for model_name, model in models.items():
    # Rebuild pipeline with best hyperparameters for the model
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    
    best_params = next((res["best_params"] for res in results if res["model"] == model_name), {})
    pipeline.set_params(**best_params)
    
    pipeline.fit(X_train, y_train)
    final_models[model_name] = pipeline
    
    y_test_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_cm = confusion_matrix(y_test, y_test_pred)
    
    print(f"\nModel: {model_name}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print("Test Confusion Matrix:")
    print(test_cm)


=== Test Evaluation ===





Model: RandomForest
Test Accuracy: 0.4960
Test Confusion Matrix:
[[208  12]
 [240  40]]
[LightGBM] [Info] Number of positive: 4472, number of negative: 4528
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000507 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 909
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496889 -> initscore=-0.012445
[LightGBM] [Info] Start training from score -0.012445

Model: LightGBM
Test Accuracy: 0.5700
Test Confusion Matrix:
[[147  73]
 [142 138]]





Model: Lasso
Test Accuracy: 0.5040
Test Confusion Matrix:
[[197  23]
 [225  55]]

Model: Ridge
Test Accuracy: 0.5120
Test Confusion Matrix:
[[201  19]
 [225  55]]





Model: SVM
Test Accuracy: 0.5200
Test Confusion Matrix:
[[220   0]
 [240  40]]

Model: KNN
Test Accuracy: 0.5140
Test Confusion Matrix:
[[163  57]
 [186  94]]





Model: GradientBoosting
Test Accuracy: 0.5260
Test Confusion Matrix:
[[168  52]
 [185  95]]





Model: AdaBoost
Test Accuracy: 0.4460
Test Confusion Matrix:
[[220   0]
 [277   3]]

Model: ExtraTrees
Test Accuracy: 0.5120
Test Confusion Matrix:
[[220   0]
 [244  36]]

Model: XGBoost
Test Accuracy: 0.5120
Test Confusion Matrix:
[[189  31]
 [213  67]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
# ------------------------------------------------------------------------------
# 9. Ensemble Majority Vote with Uncertainty Quantification
# ------------------------------------------------------------------------------
# For each model we have a trained pipeline in final_models.
# The ensemble prediction is computed by taking the mode (majority vote) of
# all model predictions. Uncertainty is quantified as the fraction of models
# that agree on the majority vote (e.g., if 3 out of 5 models agree, certainty is 60%).

# Ensemble on Validation Data
val_preds = []
for model_name, pipeline in final_models.items():
    preds = pipeline.predict(X_val)
    val_preds.append(preds)
val_preds = np.array(val_preds)  # shape: (n_models, n_samples)
ensemble_val_pred, count_val = mode(val_preds, axis=0)
ensemble_val_pred = ensemble_val_pred.flatten()
agreement_val = count_val.flatten() / len(final_models)  # fraction in agreement

ensemble_val_accuracy = accuracy_score(y_val, ensemble_val_pred)
ensemble_val_cm = confusion_matrix(y_val, ensemble_val_pred)

print("\n=== Ensemble (Majority Vote) on Validation Data ===")
print("Validation Accuracy:", ensemble_val_accuracy)
print("Validation Confusion Matrix:")
print(ensemble_val_cm)
print("Average Certainty (Validation):", np.mean(agreement_val))

# Ensemble on Test Data
test_preds = []
for model_name, pipeline in final_models.items():
    preds = pipeline.predict(X_test)
    test_preds.append(preds)
test_preds = np.array(test_preds)
ensemble_test_pred, count_test = mode(test_preds, axis=0)
ensemble_test_pred = ensemble_test_pred.flatten()
agreement_test = count_test.flatten() / len(final_models)  # fraction in agreement

ensemble_test_accuracy = accuracy_score(y_test, ensemble_test_pred)
ensemble_test_cm = confusion_matrix(y_test, ensemble_test_pred)

print("\n=== Ensemble (Majority Vote) on Test Data ===")
print("Test Accuracy:", ensemble_test_accuracy)
print("Test Confusion Matrix:")
print(ensemble_test_cm)
print("Average Certainty (Test):", np.mean(agreement_test))




=== Ensemble (Majority Vote) on Validation Data ===
Validation Accuracy: 0.578
Validation Confusion Matrix:
[[255   5]
 [206  34]]
Average Certainty (Validation): 0.8686

=== Ensemble (Majority Vote) on Test Data ===
Test Accuracy: 0.508
Test Confusion Matrix:
[[217   3]
 [243  37]]
Average Certainty (Test): 0.8716




In [None]:
# Set global random seed variable for reproducibility
SEED = 42
import random
import numpy as np
random.seed(SEED)
np.random.seed(SEED)

import pandas as pd

# Scikit-learn utilities
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

# Base models
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC

# Additional diverse models for tabular data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# For randomness in hyperparameter distributions and for ensemble uncertainty
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from scipy.stats import mode

# ------------------------------------------------------------------------------
# 1. Load pre-split datasets
# ------------------------------------------------------------------------------
# We assume you have three CSV files with the final splits:
#   - train_models_final.csv
#   - val_models_final.csv
#   - test_models_final.csv
#
# Each CSV contains many columns including text fields, metadata, etc.
# We will use only the following features (see below) and the target column "label_label".

train_df = pd.read_csv("train_models_final.csv")
val_df   = pd.read_csv("val_models_final.csv")
test_df  = pd.read_csv("test_models_final.csv")

# ------------------------------------------------------------------------------
# 2. Define feature lists and target
# ------------------------------------------------------------------------------
# Provided lists of features
categorical_features = [
    "gender", "country", "country_code", "nationality", "country_of_domicile",
    "marital_status", "inheritance_details_relationship", "investment_risk_profile",
    "investment_horizon", "investment_experience", "type_of_mandate", "currency"
]

numerical_features = [
    "aum_savings", "aum_inheritance", "aum_real_estate_value"
]

# Define the target column (assumed to be in "label_label")
target_col = "label_label"

# Convert target to binary: 1 if "Reject", 0 if "Accept" (adjust as needed)
train_df[target_col] = (train_df[target_col] == "Reject").astype(int)
val_df[target_col]   = (val_df[target_col]   == "Reject").astype(int)
test_df[target_col]  = (test_df[target_col]  == "Reject").astype(int)

# For modeling, select only the features we want
feature_columns = numerical_features + categorical_features

X_train = train_df[feature_columns]
y_train = train_df[target_col]

X_val = val_df[feature_columns]
y_val = val_df[target_col]

X_test = test_df[feature_columns]
y_test = test_df[target_col]

# ------------------------------------------------------------------------------
# 3. Build preprocessing pipelines for numerical and categorical features
# ------------------------------------------------------------------------------
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot",  OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

# ------------------------------------------------------------------------------
# 4. Define candidate models (diverse and high-performing on tabular data)
# ------------------------------------------------------------------------------
models = {
    "RandomForest":    RandomForestClassifier(random_state=SEED),
    "LightGBM":        LGBMClassifier(random_state=SEED),
    "Lasso":           LogisticRegression(penalty='l1', solver='liblinear', random_state=SEED),
    "Ridge":           RidgeClassifier(random_state=SEED),
    "SVM":             SVC(probability=True, random_state=SEED),
    "KNN":             KNeighborsClassifier(),
    "GradientBoosting":GradientBoostingClassifier(random_state=SEED),
    "AdaBoost":        AdaBoostClassifier(random_state=SEED),
    "ExtraTrees":      ExtraTreesClassifier(random_state=SEED),
    "XGBoost":         XGBClassifier(random_state=SEED, use_label_encoder=False, eval_metric='logloss')
}

# ------------------------------------------------------------------------------
# 5. Define hyperparameter distributions for RandomizedSearchCV for each model
# ------------------------------------------------------------------------------
param_distributions = {
    "RandomForest": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__max_depth":    sp_randint(2, 15),
        "classifier__min_samples_split": sp_randint(2, 10),
    },
    "LightGBM": {
        "classifier__num_leaves": sp_randint(20, 50),
        "classifier__learning_rate": uniform(0.01, 0.3),
        "classifier__max_depth": sp_randint(2, 15),
    },
    "Lasso": {
        "classifier__C": uniform(0.001, 10.0),
    },
    "Ridge": {
        "classifier__alpha": uniform(0.001, 10.0),
    },
    "SVM": {
        "classifier__C": uniform(0.001, 10.0),
        "classifier__kernel": ["linear", "rbf"],
    },
    "KNN": {
        "classifier__n_neighbors": sp_randint(3, 20),
        "classifier__weights": ["uniform", "distance"],
        "classifier__p": [1, 2]
    },
    "GradientBoosting": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__learning_rate": uniform(0.01, 0.3),
        "classifier__max_depth": sp_randint(2, 10),
    },
    "AdaBoost": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__learning_rate": uniform(0.01, 1.0),
    },
    "ExtraTrees": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__max_depth": sp_randint(2, 15),
        "classifier__min_samples_split": sp_randint(2, 10),
    },
    "XGBoost": {
        "classifier__n_estimators": sp_randint(50, 200),
        "classifier__learning_rate": uniform(0.01, 0.3),
        "classifier__max_depth": sp_randint(2, 10),
    }
}

# Use K-fold cross-validation on the training set
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

# ------------------------------------------------------------------------------
# 6. Train models with hyperparameter search and evaluate on validation set
# ------------------------------------------------------------------------------
results = []

for model_name, model in models.items():
    print(f"\n\n=== Model: {model_name} ===")
    
    # Build pipeline: preprocessing + model
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    
    # Get hyperparameter distribution for this model
    search_params = param_distributions.get(model_name, {})
    
    # Perform RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=search_params,
        n_iter=10,  # number of parameter settings sampled
        scoring="accuracy",
        cv=kfold,
        random_state=SEED,
        verbose=1,
        n_jobs=-1  # use all available cores
    )
    
    random_search.fit(X_train, y_train)
    
    best_estimator = random_search.best_estimator_
    print(f"Best params: {random_search.best_params_}")
    print(f"CV best score (train set): {random_search.best_score_:.4f}")
    
    # Evaluate on validation set
    y_val_pred = best_estimator.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    cm_val = confusion_matrix(y_val, y_val_pred)
    
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print("Validation Confusion Matrix:")
    print(cm_val)
    
    results.append({
        "model": model_name,
        "best_params": random_search.best_params_,
        "cv_best_score": random_search.best_score_,
        "val_accuracy": val_accuracy,
        "val_confusion_matrix": cm_val
    })

# ------------------------------------------------------------------------------
# 7. Summarize validation results
# ------------------------------------------------------------------------------
results_df = pd.DataFrame(results)
print("\n=== Validation Results Summary ===")
print(results_df[["model", "cv_best_score", "val_accuracy", "best_params"]])

# ------------------------------------------------------------------------------
# 8. Final evaluation on the test set for each model
# ------------------------------------------------------------------------------
print("\n=== Test Evaluation ===")

# Store final pipelines in a dictionary for ensembling later
final_models = {}

for model_name, model in models.items():
    # Rebuild pipeline with best hyperparameters for the model
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])
    
    best_params = next((res["best_params"] for res in results if res["model"] == model_name), {})
    pipeline.set_params(**best_params)
    
    pipeline.fit(X_train, y_train)
    final_models[model_name] = pipeline
    
    y_test_pred = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_cm = confusion_matrix(y_test, y_test_pred)
    
    print(f"\nModel: {model_name}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print("Test Confusion Matrix:")
    print(test_cm)

# ------------------------------------------------------------------------------
# 9. Ensemble Majority Vote with Uncertainty Quantification
# ------------------------------------------------------------------------------
# For each model we have a trained pipeline in final_models.
# The ensemble prediction is computed by taking the mode (majority vote) of
# all model predictions. Uncertainty is quantified as the fraction of models
# that agree on the majority vote (e.g., if 3 out of 5 models agree, certainty is 60%).

# Ensemble on Validation Data
val_preds = []
for model_name, pipeline in final_models.items():
    preds = pipeline.predict(X_val)
    val_preds.append(preds)
val_preds = np.array(val_preds)  # shape: (n_models, n_samples)
ensemble_val_pred, count_val = mode(val_preds, axis=0)
ensemble_val_pred = ensemble_val_pred.flatten()
agreement_val = count_val.flatten() / len(final_models)  # fraction in agreement

ensemble_val_accuracy = accuracy_score(y_val, ensemble_val_pred)
ensemble_val_cm = confusion_matrix(y_val, ensemble_val_pred)

print("\n=== Ensemble (Majority Vote) on Validation Data ===")
print("Validation Accuracy:", ensemble_val_accuracy)
print("Validation Confusion Matrix:")
print(ensemble_val_cm)
print("Average Certainty (Validation):", np.mean(agreement_val))

# Ensemble on Test Data
test_preds = []
for model_name, pipeline in final_models.items():
    preds = pipeline.predict(X_test)
    test_preds.append(preds)
test_preds = np.array(test_preds)
ensemble_test_pred, count_test = mode(test_preds, axis=0)
ensemble_test_pred = ensemble_test_pred.flatten()
agreement_test = count_test.flatten() / len(final_models)  # fraction in agreement

ensemble_test_accuracy = accuracy_score(y_test, ensemble_test_pred)
ensemble_test_cm = confusion_matrix(y_test, ensemble_test_pred)

print("\n=== Ensemble (Majority Vote) on Test Data ===")
print("Test Accuracy:", ensemble_test_accuracy)
print("Test Confusion Matrix:")
print(ensemble_test_cm)
print("Average Certainty (Test):", np.mean(agreement_test))
