In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer

# Load the modelling table; index_col=False stops pandas from using the first
# column as the index.
df = pd.read_csv("../data/df.csv", index_col=False)

# --- Feature groups consumed by the preprocessors --------------------------
numerical_features_full = [
    "n_items",
    "total_price",
    "total_freight",
    "avg_price",
    "payment_value",
    "seller_avg_score_past",
    "purchase_to_estimated_days",
]

# Same columns, same order, minus the seller history score.
numerical_features_reduced = [
    col for col in numerical_features_full if col != "seller_avg_score_past"
]

ordinal_linear = ["payment_installments"]
ordinal_cyclic = ["purchase_month", "purchase_dow"]
categorical_features = [
    "main_payment_type",
    "same_state",
    "customer_state",
    "seller_state",
]

# --- Target / feature separation --------------------------------------------
y = df["is_late"]
X = df.drop("is_late", axis=1)
# NOTE(review): `groups` is not passed to any splitter in the visible cells;
# kept because other cells may read it.
groups = df["customer_unique_id"]

# Stratified 80/20 hold-out split on the target.
X_other, X_test, y_other, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Train+Val size: {X_other.shape}, Test size: {X_test.shape}")

def encode_cyclic_features(df_sub):
    """Encode purchase month and day-of-week as sine/cosine pairs.

    Each value is turned into an angle ``2 * pi * value / period`` (period 12
    for ``purchase_month``, 7 for ``purchase_dow``) and replaced by the sine
    and cosine of that angle, so values one full period apart map to the same
    point.

    Parameters
    ----------
    df_sub : pandas.DataFrame
        Must expose the columns ``purchase_month`` and ``purchase_dow``.
        Other columns are ignored. The input is only read, never modified,
        so no defensive copy is taken.
        NOTE(review): the periods 12 and 7 are hard-coded; presumably months
        are 1-12 and weekdays 0-6 -- confirm against the data.

    Returns
    -------
    pandas.DataFrame
        Columns ``month_sin``, ``month_cos``, ``dow_sin``, ``dow_cos`` (in that
        order), indexed like the input columns.
    """
    # Compute each angle once; the operation order matches (2 * pi * x) / period.
    month_angle = 2 * np.pi * df_sub["purchase_month"] / 12
    dow_angle = 2 * np.pi * df_sub["purchase_dow"] / 7

    return pd.DataFrame({
        "month_sin": np.sin(month_angle),
        "month_cos": np.cos(month_angle),
        "dow_sin": np.sin(dow_angle),
        "dow_cos": np.cos(dow_angle),
    })

def _make_preprocessor(numeric_columns):
    """Build an unfitted ColumnTransformer for the given numeric column list.

    Three branches are assembled:
    - "num": StandardScaler over ``numeric_columns`` plus ``ordinal_linear``
    - "cat": dense one-hot encoding of ``categorical_features``; categories
      unseen at fit time are ignored at transform time
    - "cyclic": sine/cosine encoding of ``ordinal_cyclic`` via
      ``encode_cyclic_features``
    Columns not listed in any branch are not passed to a transformer.
    """
    scaled_columns = numeric_columns + ordinal_linear
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    cyclic = FunctionTransformer(encode_cyclic_features)
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), scaled_columns),
            ("cat", one_hot, categorical_features),
            ("cyclic", cyclic, ordinal_cyclic),
        ]
    )


# The two preprocessors differ only in their numeric column list.
preprocessor_full = _make_preprocessor(numerical_features_full)
preprocessor_reduced = _make_preprocessor(numerical_features_reduced)

print("Two preprocessors defined: preprocessor_full & preprocessor_reduced")

Train+Val size: (78254, 15), Test size: (19564, 15)
Two preprocessors defined: preprocessor_full & preprocessor_reduced


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, classification_report, roc_auc_score
)
from sklearn.base import clone
from tqdm import tqdm
import numpy as np
import joblib

# ========= FULL FEATURE KNN =========
# Repeated hold-out evaluation of KNN restricted to rows where
# `seller_avg_score_past` is present. Per run:
#   1. stratified 80/20 split of (X, y) with a run-specific seed,
#   2. drop rows with a missing seller score from both parts,
#   3. stratified 75/25 train/validation split of the remaining part,
#   4. fit a fresh clone of `preprocessor_full` on the training rows only,
#   5. grid-search KNN, scored by validation F1,
#   6. score the validation winner once on the test rows.

n_runs = 3
base_seed = 42

# The grid does not depend on the run, so it is built once up front.
param_grid_full = {
    "n_neighbors": [3, 5, 11, 21],
    "weights": ["uniform", "distance"],
    "p": [1, 2],  # Manhattan / Euclidean
}
param_list_full = list(ParameterGrid(param_grid_full))

test_f1_list = []
# One dict per run holding every test metric (previously computed, then dropped).
test_metrics_full = []
best_global_test_f1 = -1.0
best_run_idx = None

best_preprocessor_full = None
best_model_full = None
best_params_full = None
best_val_f1_full = None
best_X_test_full_pre = None
best_y_test_full = None

for i in range(1, n_runs + 1):
    print(f"\n KNN FULL Run {i} / {n_runs}")
    seed = base_seed * i  # distinct, reproducible seed per run

    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # Keep only rows where the seller history score is available.
    mask_full_other = X_other["seller_avg_score_past"].notna()
    mask_full_test = X_test["seller_avg_score_past"].notna()

    X_other_full = X_other[mask_full_other].copy()
    y_other_full = y_other[mask_full_other].copy()
    X_test_full = X_test[mask_full_test].copy()
    y_test_full = y_test[mask_full_test].copy()

    print(f"[Run {i}] Train+Val rows (no missing): {X_other_full.shape[0]}")
    print(f"[Run {i}] Test rows (no missing):      {X_test_full.shape[0]}")

    X_train_full_raw, X_val_full_raw, y_train_full, y_val_full = train_test_split(
        X_other_full, y_other_full, test_size=0.25,
        random_state=seed, stratify=y_other_full
    )

    # Fit the preprocessing on the training rows only so that validation and
    # test statistics never influence scaling / encoding.
    preproc_full_run = clone(preprocessor_full)
    preproc_full_run.fit(X_train_full_raw)

    X_train_full = preproc_full_run.transform(X_train_full_raw)
    X_val_full = preproc_full_run.transform(X_val_full_raw)
    X_test_full_pre = preproc_full_run.transform(X_test_full)

    print(f"[Run {i}] Train shape: {X_train_full.shape}")
    print(f"[Run {i}] Val shape  : {X_val_full.shape}")
    print(f"[Run {i}] Test shape : {X_test_full_pre.shape}")

    print(f"[Run {i}] Total param combinations: {len(param_list_full)}")

    best_full_model_run = None
    best_full_params_run = None
    best_full_f1_run = -1.0

    for params in tqdm(param_list_full, desc=f"Searching KNN FULL - run {i}"):
        model = KNeighborsClassifier(**params)
        model.fit(X_train_full, y_train_full)

        y_val_pred = model.predict(X_val_full)
        # zero_division=0 yields the same 0.0 score as the default when no
        # positives are predicted, but without the warning; this also matches
        # the precision/recall calls below.
        val_f1 = f1_score(y_val_full, y_val_pred, zero_division=0)

        # Strict ">" keeps the first combination on ties.
        if val_f1 > best_full_f1_run:
            best_full_f1_run = val_f1
            best_full_params_run = params
            best_full_model_run = model

    print(f"\n[Run {i}] Best params on val:", best_full_params_run)
    print(f"[Run {i}] Best Val F1: {best_full_f1_run:.4f}")

    # Evaluate the validation winner on the held-out test rows.
    y_test_full_pred = best_full_model_run.predict(X_test_full_pre)
    test_f1 = f1_score(y_test_full, y_test_full_pred)
    test_acc = accuracy_score(y_test_full, y_test_full_pred)
    test_prec = precision_score(y_test_full, y_test_full_pred, zero_division=0)
    test_rec = recall_score(y_test_full, y_test_full_pred, zero_division=0)
    cm = confusion_matrix(y_test_full, y_test_full_pred)
    test_proba = best_full_model_run.predict_proba(X_test_full_pre)[:, 1]
    test_auc = roc_auc_score(y_test_full, test_proba)

    print(f"\n[Run {i}] Test F1: {test_f1:.4f}")
    print(
        f"[Run {i}] Test Acc: {test_acc:.4f}, Precision: {test_prec:.4f}, "
        f"Recall: {test_rec:.4f}, ROC-AUC: {test_auc:.4f}"
    )
    test_f1_list.append(test_f1)
    test_metrics_full.append({
        "run": i,
        "seed": seed,
        "params": best_full_params_run,
        "val_f1": best_full_f1_run,
        "f1": test_f1,
        "accuracy": test_acc,
        "precision": test_prec,
        "recall": test_rec,
        "roc_auc": test_auc,
        "confusion_matrix": cm,
    })

    # NOTE(review): the run whose artifacts are kept is chosen by *test* F1.
    # That is selection on the test set, so `best_global_test_f1` is an
    # optimistic figure; the mean/std over runs below is the unbiased summary.
    # Left unchanged so the saved artifact stays the same -- consider
    # selecting by validation F1 instead.
    if test_f1 > best_global_test_f1:
        best_global_test_f1 = test_f1
        best_run_idx = i
        best_preprocessor_full = preproc_full_run
        best_model_full = best_full_model_run
        best_params_full = best_full_params_run
        best_val_f1_full = best_full_f1_run
        best_X_test_full_pre = X_test_full_pre
        best_y_test_full = y_test_full

        print("\n--> NEW GLOBAL BEST (FULL)!")
        print("Confusion matrix [[TN, FP], [FN, TP]]:")
        print(cm)
        print(classification_report(y_test_full, y_test_full_pred))

test_f1_arr = np.array(test_f1_list)
print("\n================== FULL Summary ==================")
print("Test F1 per run:", test_f1_list)
print(f"Mean: {test_f1_arr.mean():.4f}, Std: {test_f1_arr.std():.4f}")
print(f"Best run index: {best_run_idx}, Best F1: {best_global_test_f1:.4f}")
# Aggregate the remaining test metrics across runs as well.
for metric_name in ("accuracy", "precision", "recall", "roc_auc"):
    metric_values = np.array([m[metric_name] for m in test_metrics_full], dtype=float)
    print(f"Test {metric_name}: mean {metric_values.mean():.4f}, std {metric_values.std():.4f}")

knn_full_artifacts = {
    "preprocessor_full": best_preprocessor_full,
    "model_full": best_model_full,
    "best_params_full": best_params_full,
    "best_val_f1_full": best_val_f1_full,
    "best_test_f1_full": best_global_test_f1,
    "test_f1_all_runs": test_f1_list,
    "test_metrics_all_runs": test_metrics_full,  # additive: full per-run metrics
}

# NOTE: the preprocessor wraps `encode_cyclic_features` in a FunctionTransformer.
# Pickle stores functions by reference, so that function must be defined under
# the same name in the loading session before this file can be loaded.
joblib.dump(knn_full_artifacts, "best_knn_full_model_3_run.pkl")
print("\nSaved FULL-feature KNN model → best_knn_full_model_3_run.pkl")


[Run 1] Train+Val rows (no missing): 73486
[Run 1] Test rows (no missing):      18394
[Run 1] Train shape: (55114, 66)
[Run 1] Val shape  : (18372, 66)
[Run 1] Test shape : (18394, 66)
[Run 1] Total param combinations: 16


Searching KNN FULL - run 1: 100%|██████████| 16/16 [00:39<00:00,  2.47s/it]



[Run 1] Best params on val: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
[Run 1] Best Val F1: 0.1709

[Run 1] Test F1: 0.1617

--> NEW GLOBAL BEST (FULL)!
Confusion matrix [[TN, FP], [FN, TP]]:
[[16371   551]
 [ 1294   178]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     16922
           1       0.24      0.12      0.16      1472

    accuracy                           0.90     18394
   macro avg       0.59      0.54      0.55     18394
weighted avg       0.87      0.90      0.88     18394


[Run 2] Train+Val rows (no missing): 73587
[Run 2] Test rows (no missing):      18293
[Run 2] Train shape: (55190, 66)
[Run 2] Val shape  : (18397, 66)
[Run 2] Test shape : (18293, 66)
[Run 2] Total param combinations: 16


Searching KNN FULL - run 2: 100%|██████████| 16/16 [00:39<00:00,  2.46s/it]



[Run 2] Best params on val: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
[Run 2] Best Val F1: 0.1613

[Run 2] Test F1: 0.1680

--> NEW GLOBAL BEST (FULL)!
Confusion matrix [[TN, FP], [FN, TP]]:
[[16385   457]
 [ 1276   175]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     16842
           1       0.28      0.12      0.17      1451

    accuracy                           0.91     18293
   macro avg       0.60      0.55      0.56     18293
weighted avg       0.88      0.91      0.89     18293


[Run 3] Train+Val rows (no missing): 73482
[Run 3] Test rows (no missing):      18398
[Run 3] Train shape: (55111, 66)
[Run 3] Val shape  : (18371, 66)
[Run 3] Test shape : (18398, 66)
[Run 3] Total param combinations: 16


Searching KNN FULL - run 3: 100%|██████████| 16/16 [00:41<00:00,  2.57s/it]



[Run 3] Best params on val: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
[Run 3] Best Val F1: 0.1698

[Run 3] Test F1: 0.1455

Test F1 per run: [0.16174466151749206, 0.16802688430148824, 0.14552417424605074]
Mean: 0.1584, Std: 0.0095
Best run index: 2, Best F1: 0.1680

Saved FULL-feature KNN model → best_knn_full_model_3_run.pkl


In [4]:
# ========= REDUCED FEATURE KNN =========
# Same protocol as the FULL experiment, but on the complementary rows: those
# where `seller_avg_score_past` is missing, using `preprocessor_reduced`
# (which does not include that column).
# `base_seed` is reused from the FULL cell on purpose: with the same seeds the
# outer 80/20 split is identical, so both experiments work on the two halves
# of the same train+val / test partition.

n_runs_red = 3

# The grid does not depend on the run, so it is built once up front.
param_grid_red = {
    "n_neighbors": [1, 2, 3, 5, 11, 21],
    "weights": ["uniform", "distance"],
    "p": [1, 2],  # Manhattan / Euclidean
}
param_list_red = list(ParameterGrid(param_grid_red))

test_f1_list_red = []
# One dict per completed run holding every test metric (previously dropped).
test_metrics_red = []
best_global_test_f1_red = -1.0
best_run_idx_red = None

best_preprocessor_red = None
best_model_red = None
best_params_red = None
best_val_f1_red = None
best_X_test_red_pre = None
best_y_test_red = None

for i in range(1, n_runs_red + 1):
    print(f"\n================== KNN REDUCED Run {i} / {n_runs_red} ==================")
    seed = base_seed * i  # distinct, reproducible seed per run

    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )

    # Keep only rows where the seller history score is missing.
    mask_red_other = X_other["seller_avg_score_past"].isna()
    mask_red_test = X_test["seller_avg_score_past"].isna()

    X_other_red = X_other[mask_red_other].copy()
    y_other_red = y_other[mask_red_other].copy()
    X_test_red = X_test[mask_red_test].copy()
    y_test_red = y_test[mask_red_test].copy()

    print(f"[Run {i}] Train+Val rows (missing): {X_other_red.shape[0]}")
    print(f"[Run {i}] Test rows (missing):      {X_test_red.shape[0]}")

    # A stratified split and a meaningful F1 need both classes and a minimum
    # number of rows; otherwise this run is skipped entirely.
    if X_other_red.shape[0] < 50 or y_other_red.nunique() < 2:
        print("Too few samples → skip this run.")
        continue

    X_train_red_raw, X_val_red_raw, y_train_red, y_val_red = train_test_split(
        X_other_red, y_other_red, test_size=0.25,
        random_state=seed, stratify=y_other_red
    )

    # Fit the preprocessing on the training rows only so that validation and
    # test statistics never influence scaling / encoding.
    preproc_red_run = clone(preprocessor_reduced)
    preproc_red_run.fit(X_train_red_raw)

    X_train_red = preproc_red_run.transform(X_train_red_raw)
    X_val_red = preproc_red_run.transform(X_val_red_raw)
    X_test_red_pre = preproc_red_run.transform(X_test_red)

    print(f"[Run {i}] Train shape: {X_train_red.shape}")
    print(f"[Run {i}] Val shape  : {X_val_red.shape}")
    print(f"[Run {i}] Test shape : {X_test_red_pre.shape}")

    print(f"[Run {i}] Total param combinations: {len(param_list_red)}")

    best_red_model_run = None
    best_red_params_run = None
    best_red_f1_run = -1.0

    for params in tqdm(param_list_red, desc=f"Searching KNN REDUCED - run {i}"):
        model = KNeighborsClassifier(**params)
        model.fit(X_train_red, y_train_red)

        y_val_pred_red = model.predict(X_val_red)
        # zero_division=0 yields the same 0.0 score as the default when no
        # positives are predicted, but without the warning; this also matches
        # the precision/recall calls below.
        f1_red = f1_score(y_val_red, y_val_pred_red, zero_division=0)

        # Strict ">" keeps the first combination on ties.
        if f1_red > best_red_f1_run:
            best_red_f1_run = f1_red
            best_red_params_run = params
            best_red_model_run = model

    print(f"\n[Run {i}] Best params:", best_red_params_run)
    print(f"[Run {i}] Best Val F1: {best_red_f1_run:.4f}")

    # Evaluate the validation winner on the held-out test rows.
    y_test_red_pred = best_red_model_run.predict(X_test_red_pre)
    test_f1_red = f1_score(y_test_red, y_test_red_pred)
    test_acc_red = accuracy_score(y_test_red, y_test_red_pred)
    test_prec_red = precision_score(y_test_red, y_test_red_pred, zero_division=0)
    test_rec_red = recall_score(y_test_red, y_test_red_pred, zero_division=0)
    cm_red = confusion_matrix(y_test_red, y_test_red_pred)
    test_proba_red = best_red_model_run.predict_proba(X_test_red_pre)[:, 1]
    # ROC-AUC is undefined with a single class in y_true. The skip guard above
    # only inspects the train+val part, so protect the small test subset here.
    if y_test_red.nunique() > 1:
        test_auc_red = roc_auc_score(y_test_red, test_proba_red)
    else:
        test_auc_red = float("nan")

    print(f"[Run {i}] Test F1: {test_f1_red:.4f}")
    print(
        f"[Run {i}] Test Acc: {test_acc_red:.4f}, Precision: {test_prec_red:.4f}, "
        f"Recall: {test_rec_red:.4f}, ROC-AUC: {test_auc_red:.4f}"
    )
    test_f1_list_red.append(test_f1_red)
    test_metrics_red.append({
        "run": i,
        "seed": seed,
        "params": best_red_params_run,
        "val_f1": best_red_f1_run,
        "f1": test_f1_red,
        "accuracy": test_acc_red,
        "precision": test_prec_red,
        "recall": test_rec_red,
        "roc_auc": test_auc_red,
        "confusion_matrix": cm_red,
    })

    # NOTE(review): the run whose artifacts are kept is chosen by *test* F1.
    # That is selection on the test set, so `best_global_test_f1_red` is an
    # optimistic figure; the mean/std over runs below is the unbiased summary.
    # Left unchanged so the saved artifact stays the same -- consider
    # selecting by validation F1 instead.
    if test_f1_red > best_global_test_f1_red:
        best_global_test_f1_red = test_f1_red
        best_run_idx_red = i
        best_preprocessor_red = preproc_red_run
        best_model_red = best_red_model_run
        best_params_red = best_red_params_run
        best_val_f1_red = best_red_f1_run
        best_X_test_red_pre = X_test_red_pre
        best_y_test_red = y_test_red

        print("\n--> NEW GLOBAL BEST REDUCED!")
        print("Confusion matrix [[TN, FP], [FN, TP]]:")
        print(cm_red)
        print(classification_report(y_test_red, y_test_red_pred))

test_f1_arr_red = np.array(test_f1_list_red)
print("\n================== REDUCED Summary ==================")
print("Test F1 per run:", test_f1_list_red)
if len(test_f1_arr_red) > 0:
    print(f"Mean: {test_f1_arr_red.mean():.4f}, Std: {test_f1_arr_red.std():.4f}")
    print(f"Best run index: {best_run_idx_red}, Best F1: {best_global_test_f1_red:.4f}")
    # Aggregate the remaining test metrics; nan-aware because ROC-AUC may be
    # undefined for an individual run.
    for metric_name in ("accuracy", "precision", "recall", "roc_auc"):
        metric_values = np.array([m[metric_name] for m in test_metrics_red], dtype=float)
        print(
            f"Test {metric_name}: mean {np.nanmean(metric_values):.4f}, "
            f"std {np.nanstd(metric_values):.4f}"
        )
else:
    # Every run hit the "too few samples" guard: the artifact below will hold
    # None for the model/preprocessor and -1.0 as best test F1.
    print("WARNING: no REDUCED run completed; the saved artifact contains no model.")

knn_reduced_artifacts = {
    "preprocessor_reduced": best_preprocessor_red,
    "model_reduced": best_model_red,
    "best_params_reduced": best_params_red,
    "best_val_f1_reduced": best_val_f1_red,
    "best_test_f1_reduced": best_global_test_f1_red,
    "test_f1_all_runs_reduced": test_f1_list_red,
    "test_metrics_all_runs_reduced": test_metrics_red,  # additive: full per-run metrics
}

# NOTE: the preprocessor wraps `encode_cyclic_features` in a FunctionTransformer.
# Pickle stores functions by reference, so that function must be defined under
# the same name in the loading session before this file can be loaded.
joblib.dump(knn_reduced_artifacts, "best_knn_reduced_model_3_run.pkl")
print("\nSaved REDUCED-feature KNN model → best_knn_reduced_model_3_run.pkl")


[Run 1] Train+Val rows (missing): 4768
[Run 1] Test rows (missing):      1170
[Run 1] Train shape: (3576, 64)
[Run 1] Val shape  : (1192, 64)
[Run 1] Test shape : (1170, 64)
[Run 1] Total param combinations: 24


Searching KNN REDUCED - run 1: 100%|██████████| 24/24 [00:00<00:00, 39.53it/s]



[Run 1] Best params: {'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
[Run 1] Best Val F1: 0.1268
[Run 1] Test F1: 0.1451

--> NEW GLOBAL BEST REDUCED!
Confusion matrix [[TN, FP], [FN, TP]]:
[[991  82]
 [ 83  14]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1073
           1       0.15      0.14      0.15        97

    accuracy                           0.86      1170
   macro avg       0.53      0.53      0.53      1170
weighted avg       0.86      0.86      0.86      1170


[Run 2] Train+Val rows (missing): 4667
[Run 2] Test rows (missing):      1271
[Run 2] Train shape: (3500, 64)
[Run 2] Val shape  : (1167, 64)
[Run 2] Test shape : (1271, 64)
[Run 2] Total param combinations: 24


Searching KNN REDUCED - run 2: 100%|██████████| 24/24 [00:00<00:00, 43.53it/s]



[Run 2] Best params: {'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
[Run 2] Best Val F1: 0.1461
[Run 2] Test F1: 0.1590

--> NEW GLOBAL BEST REDUCED!
Confusion matrix [[TN, FP], [FN, TP]]:
[[1051  102]
 [  99   19]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1153
           1       0.16      0.16      0.16       118

    accuracy                           0.84      1271
   macro avg       0.54      0.54      0.54      1271
weighted avg       0.84      0.84      0.84      1271


[Run 3] Train+Val rows (missing): 4772
[Run 3] Test rows (missing):      1166
[Run 3] Train shape: (3579, 64)
[Run 3] Val shape  : (1193, 64)
[Run 3] Test shape : (1166, 64)
[Run 3] Total param combinations: 24


Searching KNN REDUCED - run 3: 100%|██████████| 24/24 [00:00<00:00, 42.31it/s]


[Run 3] Best params: {'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
[Run 3] Best Val F1: 0.1649
[Run 3] Test F1: 0.1747

--> NEW GLOBAL BEST REDUCED!
Confusion matrix [[TN, FP], [FN, TP]]:
[[957  81]
 [108  20]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1038
           1       0.20      0.16      0.17       128

    accuracy                           0.84      1166
   macro avg       0.55      0.54      0.54      1166
weighted avg       0.82      0.84      0.83      1166


Test F1 per run: [0.14507772020725387, 0.1589958158995816, 0.17467248908296942]
Mean: 0.1596, Std: 0.0121

Saved REDUCED-feature KNN model → best_knn_reduced_model_3_run.pkl



