In [0]:
# Install LightGBM into the notebook environment, then restart the Python
# process before the import cells below run.
# NOTE(review): the version is not pinned, so reruns may pick up a different LightGBM release.
!pip install lightgbm
%restart_python

In [0]:
# Cell 1: Imports and Utilities
# Purpose: Import essential libraries for Spark, LightGBM, sklearn, matplotlib, and seaborn.
# Note: The one_hot_encode function is commented out and not used in the current pipeline. It was removed for simplicity and performance.
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# def one_hot_encode(df, input_col, output_col_prefix):
#     indexer = StringIndexer(inputCol=input_col, outputCol=f"{input_col}_index")
#     encoder = OneHotEncoder(inputCol=f"{input_col}_index", outputCol=f"{output_col_prefix}_vec")
#     pipeline = Pipeline(stages=[indexer, encoder])
#     model = pipeline.fit(df)
#     encoded_df = model.transform(df)
#     return encoded_df

# Example usage:
# encoded_df = one_hot_encode(df, "category_column", "category")
# display(encoded_df)


In [0]:
# Cell 2: Timer Context Manager
# Purpose: Defines a timer context manager for profiling code execution, useful for measuring the runtime of pipeline steps.
from contextlib import contextmanager
import time
import gc

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Args:
        title: Label printed in front of the elapsed time.

    Yields:
        None. The body of the ``with`` statement runs at the yield point.

    The elapsed time is measured with ``time.perf_counter()``, a monotonic
    clock, so system clock adjustments cannot distort the reported duration.
    If the wrapped block raises, the exception propagates and nothing is
    printed.
    """
    started = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - started))


In [0]:
# Cell 3: Application Data Feature Engineering
# Purpose: Loads training and test data from Spark tables, processes missing values, creates new features, and merges train/test sets for modeling.
# Note: Spark one-hot encoding for binary features is commented out and not used. Numeric-only pipeline for simplicity and performance.
def application_train_test(num_rows=None, nan_as_category=True):
    """Load the application train/test tables and return them as one DataFrame.

    Args:
        num_rows: Optional row cap applied to each table (via ``limit``).
        nan_as_category: Accepted for signature compatibility; not used here.

    Returns:
        Spark DataFrame with train and test rows stacked by column name. Test
        rows carry a null ``TARGET``. Four ratio features are appended.
    """
    train = spark.table("creditrisk_catalog.silver_creditrisk.app_train")
    test = spark.table("creditrisk_catalog.silver_creditrisk.app_test")
    if num_rows:
        train, test = train.limit(num_rows), test.limit(num_rows)
    print("Train samples: {}, test samples: {}".format(train.count(), test.count()))

    # Test rows get a null TARGET so they can be told apart after the union.
    combined = train.unionByName(test.withColumn("TARGET", F.lit(None)))

    # 365243 in DAYS_EMPLOYED is replaced by null before any ratio is computed.
    days_employed = F.col('DAYS_EMPLOYED')
    combined = combined.withColumn(
        'DAYS_EMPLOYED',
        F.when(days_employed == 365243, None).otherwise(days_employed),
    )

    # (new column, numerator, denominator); try_divide yields null on division by zero.
    ratio_specs = [
        ('DAYS_EMPLOYED_PERC', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
        ('INCOME_CREDIT_PERC', 'AMT_INCOME_TOTAL', 'AMT_CREDIT'),
        ('INCOME_PER_PERSON', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
        ('ANNUITY_INCOME_PERC', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    ]
    for feature_name, numerator, denominator in ratio_specs:
        combined = combined.withColumn(
            feature_name, F.try_divide(F.col(numerator), F.col(denominator))
        )
    return combined


In [0]:
# Cell 4: Bureau and Bureau Balance Feature Engineering
# Purpose: Aggregates bureau and bureau_balance tables, engineers features via groupBy and aggregation, and joins results for downstream use.
# Note: Spark one-hot encoding and categorical feature aggregation are commented out and not used. Numeric-only pipeline for simplicity and performance.
def bureau_and_balance(num_rows=None, nan_as_category=True):
    """Aggregate bureau and bureau_balance into one row per ``SK_ID_CURR``.

    Note: a later cell defines ``bureau_and_balance`` again with the same
    active code; when the notebook runs top to bottom that later definition
    is the one that stays bound to the name.

    Args:
        num_rows: Optional row cap applied to each source table (via ``limit``).
        nan_as_category: Accepted for signature compatibility; not used here.

    Returns:
        Spark DataFrame keyed by ``SK_ID_CURR`` with ``BURO_<column>_<FUNC>``
        numeric aggregates. Categorical (one-hot) aggregates and the
        active/closed credit splits are intentionally not computed.
    """
    bureau_df = spark.table("creditrisk_catalog.silver_creditrisk.bureau")
    balance_df = spark.table("creditrisk_catalog.silver_creditrisk.bureau_balance")
    if num_rows:
        bureau_df, balance_df = bureau_df.limit(num_rows), balance_df.limit(num_rows)

    # Collapse the monthly balance history to one row per bureau credit.
    balance_per_credit = balance_df.groupBy("SK_ID_BUREAU").agg(
        F.min("MONTHS_BALANCE").alias("MONTHS_BALANCE_MIN"),
        F.max("MONTHS_BALANCE").alias("MONTHS_BALANCE_MAX"),
        F.count("MONTHS_BALANCE").alias("MONTHS_BALANCE_SIZE"),
    )
    bureau_df = bureau_df.join(balance_per_credit, on="SK_ID_BUREAU", how="left")

    # Column -> aggregate functions applied per customer.
    stats_by_column = {
        'DAYS_CREDIT': [F.min, F.max, F.mean, F.variance],
        'CREDIT_DAY_OVERDUE': [F.max, F.mean],
        'DAYS_CREDIT_ENDDATE': [F.min, F.max, F.mean],
        'AMT_CREDIT_MAX_OVERDUE': [F.mean],
        'CNT_CREDIT_PROLONG': [F.sum],
        'AMT_CREDIT_SUM': [F.max, F.mean, F.sum],
        'AMT_CREDIT_SUM_DEBT': [F.max, F.mean, F.sum],
        'AMT_CREDIT_SUM_OVERDUE': [F.mean],
        'AMT_CREDIT_SUM_LIMIT': [F.mean, F.sum],
        'DAYS_CREDIT_UPDATE': [F.min, F.max, F.mean],
        'AMT_ANNUITY': [F.max, F.mean],
        'MONTHS_BALANCE_MIN': [F.min],
        'MONTHS_BALANCE_MAX': [F.max],
        'MONTHS_BALANCE_SIZE': [F.mean, F.sum]
    }
    # The output name embeds the upper-cased function name, e.g. BURO_DAYS_CREDIT_MIN.
    customer_aggregates = [
        stat(column).alias(f"BURO_{column}_{stat.__name__.upper()}")
        for column, stats in stats_by_column.items()
        for stat in stats
    ]
    return bureau_df.groupBy("SK_ID_CURR").agg(*customer_aggregates)


In [0]:
# Cell 5: Previous Applications Feature Engineering
# Purpose: Processes previous loan applications, handles missing values, engineers features, and aggregates by customer ID.
# Note: Spark one-hot encoding and categorical aggregations are commented out and not used. Numeric-only pipeline for simplicity and performance.
def previous_applications(num_rows=None, nan_as_category=True):
    """Aggregate previous loan applications into one row per ``SK_ID_CURR``.

    Args:
        num_rows: Optional row cap applied to the source table (via ``limit``).
        nan_as_category: Accepted for signature compatibility; not used here.

    Returns:
        Spark DataFrame keyed by ``SK_ID_CURR`` with ``PREV_<column>_<FUNC>``
        numeric aggregates. Categorical (one-hot) aggregates and the
        approved/refused splits are intentionally not computed.
    """
    applications = spark.table("creditrisk_catalog.silver_creditrisk.previous_application")
    if num_rows:
        applications = applications.limit(num_rows)

    # 365243 is a placeholder for missing values in these day-offset columns.
    placeholder_columns = (
        'DAYS_FIRST_DRAWING',
        'DAYS_FIRST_DUE',
        'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE',
        'DAYS_TERMINATION',
    )
    for day_column in placeholder_columns:
        value = F.col(day_column)
        applications = applications.withColumn(
            day_column, F.when(value == 365243, None).otherwise(value)
        )

    # Requested amount relative to granted amount; try_divide yields null on division by zero.
    applications = applications.withColumn(
        'APP_CREDIT_PERC',
        F.try_divide(F.col('AMT_APPLICATION'), F.col('AMT_CREDIT')),
    )

    stats_by_column = {
        'AMT_ANNUITY': [F.min, F.max, F.mean],
        'AMT_APPLICATION': [F.min, F.max, F.mean],
        'AMT_CREDIT': [F.min, F.max, F.mean],
        'APP_CREDIT_PERC': [F.min, F.max, F.mean, F.variance],
        'AMT_DOWN_PAYMENT': [F.min, F.max, F.mean],
        'AMT_GOODS_PRICE': [F.min, F.max, F.mean],
        'HOUR_APPR_PROCESS_START': [F.min, F.max, F.mean],
        'RATE_DOWN_PAYMENT': [F.min, F.max, F.mean],
        'DAYS_DECISION': [F.min, F.max, F.mean],
        'CNT_PAYMENT': [F.mean, F.sum],
    }
    customer_aggregates = [
        stat(column).alias(f"PREV_{column}_{stat.__name__.upper()}")
        for column, stats in stats_by_column.items()
        for stat in stats
    ]
    return applications.groupBy("SK_ID_CURR").agg(*customer_aggregates)


In [0]:
# Cell 6: POS Cash Balance Feature Engineering
# Purpose: Processes POS cash balance data, aggregates time and delinquency features, and prepares them for joining.
# Note: Spark one-hot encoding and categorical aggregations are commented out and not used. Numeric-only pipeline for simplicity and performance.
def pos_cash(num_rows=None, nan_as_category=True):
    """Aggregate POS cash balance records into one row per ``SK_ID_CURR``.

    Args:
        num_rows: Optional row cap applied to the source table (via ``limit``).
        nan_as_category: Accepted for signature compatibility; not used here.

    Returns:
        Spark DataFrame keyed by ``SK_ID_CURR`` with ``POS_*`` aggregates and
        ``POS_COUNT``, the number of POS cash rows for that customer.
    """
    pos = spark.table("creditrisk_catalog.silver_creditrisk.pos_cash_balance")
    if num_rows:
        pos = pos.limit(num_rows)
    agg_exprs = [
        F.max("MONTHS_BALANCE").alias("POS_MONTHS_BALANCE_MAX"),
        F.mean("MONTHS_BALANCE").alias("POS_MONTHS_BALANCE_MEAN"),
        F.count("MONTHS_BALANCE").alias("POS_MONTHS_BALANCE_SIZE"),
        F.max("SK_DPD").alias("POS_SK_DPD_MAX"),
        F.mean("SK_DPD").alias("POS_SK_DPD_MEAN"),
        F.max("SK_DPD_DEF").alias("POS_SK_DPD_DEF_MAX"),
        F.mean("SK_DPD_DEF").alias("POS_SK_DPD_DEF_MEAN"),
        # Row count per customer. Previously this column was F.lit(None),
        # i.e. an all-null, untyped column that carried no information.
        # Counting a literal includes rows whose other columns are null.
        F.count(F.lit(1)).alias("POS_COUNT"),
    ]
    # Categorical (one-hot) aggregates are intentionally not computed.
    return pos.groupBy("SK_ID_CURR").agg(*agg_exprs)


In [0]:
# Cell 7: Bureau Balance (Alternate Version)
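# Purpose: Second definition of bureau_and_balance. It has the same name as the function in Cell 4, so this definition overrides it when the notebook runs top to bottom. The active code is identical to Cell 4; the two cells differ only in the commented-out categorical handling.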
# Note: Spark one-hot encoding and categorical feature aggregation are commented out and not used. Numeric-only pipeline for simplicity and performance.
def bureau_and_balance(num_rows=None, nan_as_category=True):
    """Build per-customer bureau features from bureau and bureau_balance.

    This definition shares its name with the one in the earlier bureau cell
    and therefore replaces it when the notebook is executed in order.

    Args:
        num_rows: Optional row cap applied to each source table (via ``limit``).
        nan_as_category: Accepted for signature compatibility; not used here.

    Returns:
        Spark DataFrame keyed by ``SK_ID_CURR`` whose remaining columns are
        named ``BURO_<column>_<FUNC>``. One-hot/categorical aggregates and the
        active/closed credit breakdowns are intentionally left out.
    """
    credits = spark.table("creditrisk_catalog.silver_creditrisk.bureau")
    monthly = spark.table("creditrisk_catalog.silver_creditrisk.bureau_balance")
    if num_rows:
        credits = credits.limit(num_rows)
        monthly = monthly.limit(num_rows)

    # Step 1: summarise the monthly history for every bureau credit.
    monthly_summary = [
        F.min("MONTHS_BALANCE").alias("MONTHS_BALANCE_MIN"),
        F.max("MONTHS_BALANCE").alias("MONTHS_BALANCE_MAX"),
        F.count("MONTHS_BALANCE").alias("MONTHS_BALANCE_SIZE"),
    ]
    per_credit = monthly.groupBy("SK_ID_BUREAU").agg(*monthly_summary)

    # Step 2: attach that summary to the credits (credits without history keep nulls).
    credits = credits.join(per_credit, on="SK_ID_BUREAU", how="left")

    # Step 3: roll everything up to the customer level.
    rollup_plan = {
        'DAYS_CREDIT': [F.min, F.max, F.mean, F.variance],
        'CREDIT_DAY_OVERDUE': [F.max, F.mean],
        'DAYS_CREDIT_ENDDATE': [F.min, F.max, F.mean],
        'AMT_CREDIT_MAX_OVERDUE': [F.mean],
        'CNT_CREDIT_PROLONG': [F.sum],
        'AMT_CREDIT_SUM': [F.max, F.mean, F.sum],
        'AMT_CREDIT_SUM_DEBT': [F.max, F.mean, F.sum],
        'AMT_CREDIT_SUM_OVERDUE': [F.mean],
        'AMT_CREDIT_SUM_LIMIT': [F.mean, F.sum],
        'DAYS_CREDIT_UPDATE': [F.min, F.max, F.mean],
        'AMT_ANNUITY': [F.max, F.mean],
        'MONTHS_BALANCE_MIN': [F.min],
        'MONTHS_BALANCE_MAX': [F.max],
        'MONTHS_BALANCE_SIZE': [F.mean, F.sum]
    }
    rollup_exprs = []
    for source_column, aggregators in rollup_plan.items():
        rollup_exprs.extend(
            aggregate(source_column).alias(f"BURO_{source_column}_{aggregate.__name__.upper()}")
            for aggregate in aggregators
        )
    return credits.groupBy("SK_ID_CURR").agg(*rollup_exprs)


In [0]:
# Cell 8: Installments Payments Feature Engineering
# Purpose: Processes installment payment data, engineers payment and delinquency features, and aggregates by customer ID.
# Note: Spark one-hot encoding and categorical aggregations are commented out and not used. Numeric-only pipeline for simplicity and performance.
def installments_payments(num_rows=None, nan_as_category=True):
    """Aggregate installment payments into one row per ``SK_ID_CURR``.

    Args:
        num_rows: Optional row cap applied to the source table (via ``limit``).
        nan_as_category: Accepted for signature compatibility; not used here.

    Returns:
        Spark DataFrame keyed by ``SK_ID_CURR`` with ``INS_*`` aggregates and
        ``INS_COUNT``, the number of installment rows for that customer.
    """
    ins = spark.table("creditrisk_catalog.silver_creditrisk.installments_payments")
    if num_rows:
        ins = ins.limit(num_rows)

    # Share and shortfall of each installment actually paid.
    ins = ins.withColumn("PAYMENT_PERC", F.try_divide(F.col("AMT_PAYMENT"), F.col("AMT_INSTALMENT")))
    ins = ins.withColumn("PAYMENT_DIFF", F.col("AMT_INSTALMENT") - F.col("AMT_PAYMENT"))

    # DPD = days past due, DBD = days before due; each is floored at 0.
    # A null difference fails the `> 0` test and therefore also becomes 0.
    days_late = F.col("DAYS_ENTRY_PAYMENT") - F.col("DAYS_INSTALMENT")
    days_early = F.col("DAYS_INSTALMENT") - F.col("DAYS_ENTRY_PAYMENT")
    ins = ins.withColumn("DPD", F.when(days_late > 0, days_late).otherwise(0))
    ins = ins.withColumn("DBD", F.when(days_early > 0, days_early).otherwise(0))

    agg_exprs = [
        F.countDistinct("NUM_INSTALMENT_VERSION").alias("INS_NUM_INSTALMENT_VERSION_NUNIQUE"),
        F.max("DPD").alias("INS_DPD_MAX"),
        F.mean("DPD").alias("INS_DPD_MEAN"),
        F.sum("DPD").alias("INS_DPD_SUM"),
        F.max("DBD").alias("INS_DBD_MAX"),
        F.mean("DBD").alias("INS_DBD_MEAN"),
        F.sum("DBD").alias("INS_DBD_SUM"),
        F.max("PAYMENT_PERC").alias("INS_PAYMENT_PERC_MAX"),
        F.mean("PAYMENT_PERC").alias("INS_PAYMENT_PERC_MEAN"),
        F.sum("PAYMENT_PERC").alias("INS_PAYMENT_PERC_SUM"),
        F.variance("PAYMENT_PERC").alias("INS_PAYMENT_PERC_VAR"),
        F.max("PAYMENT_DIFF").alias("INS_PAYMENT_DIFF_MAX"),
        F.mean("PAYMENT_DIFF").alias("INS_PAYMENT_DIFF_MEAN"),
        F.sum("PAYMENT_DIFF").alias("INS_PAYMENT_DIFF_SUM"),
        F.variance("PAYMENT_DIFF").alias("INS_PAYMENT_DIFF_VAR"),
        F.max("AMT_INSTALMENT").alias("INS_AMT_INSTALMENT_MAX"),
        F.mean("AMT_INSTALMENT").alias("INS_AMT_INSTALMENT_MEAN"),
        F.sum("AMT_INSTALMENT").alias("INS_AMT_INSTALMENT_SUM"),
        F.min("AMT_PAYMENT").alias("INS_AMT_PAYMENT_MIN"),
        F.max("AMT_PAYMENT").alias("INS_AMT_PAYMENT_MAX"),
        F.mean("AMT_PAYMENT").alias("INS_AMT_PAYMENT_MEAN"),
        F.sum("AMT_PAYMENT").alias("INS_AMT_PAYMENT_SUM"),
        F.max("DAYS_ENTRY_PAYMENT").alias("INS_DAYS_ENTRY_PAYMENT_MAX"),
        F.mean("DAYS_ENTRY_PAYMENT").alias("INS_DAYS_ENTRY_PAYMENT_MEAN"),
        F.sum("DAYS_ENTRY_PAYMENT").alias("INS_DAYS_ENTRY_PAYMENT_SUM"),
        # Row count per customer. Previously this column was F.lit(None),
        # i.e. an all-null, untyped column that carried no information.
        F.count(F.lit(1)).alias("INS_COUNT"),
    ]
    # Categorical (one-hot) aggregates are intentionally not computed.
    return ins.groupBy("SK_ID_CURR").agg(*agg_exprs)

In [0]:
def credit_card_balance(num_rows=None, nan_as_category=True):
    """Aggregate credit card balance records into one row per ``SK_ID_CURR``.

    Every numeric column except the key is summarised with min, max, mean,
    sum and variance; ``CC_COUNT`` holds the number of rows per customer.

    Args:
        num_rows: Optional row cap applied to the source table (via ``limit``).
        nan_as_category: Accepted for signature compatibility; not used here.

    Returns:
        Spark DataFrame keyed by ``SK_ID_CURR`` with ``CC_<column>_<STAT>``
        columns followed by ``CC_COUNT``.
    """
    cc = spark.table("creditrisk_catalog.silver_creditrisk.credit_card_balance")
    if num_rows:
        cc = cc.limit(num_rows)
    cc = cc.drop("SK_ID_PREV")

    # DataFrame.dtypes reports decimals as "decimal(p,s)", so an exact match on
    # "decimal" never succeeds; decimals are matched by prefix instead. The
    # simple types are matched exactly so e.g. "interval ..." is not picked up.
    simple_numeric_types = {"tinyint", "smallint", "int", "bigint", "float", "double"}
    num_cols = [
        name for name, dtype in cc.dtypes
        if name != "SK_ID_CURR"
        and (dtype in simple_numeric_types or dtype.startswith("decimal"))
    ]

    agg_exprs = []
    for col in num_cols:
        agg_exprs.extend([
            F.min(col).alias(f"CC_{col}_MIN"),
            F.max(col).alias(f"CC_{col}_MAX"),
            F.mean(col).alias(f"CC_{col}_MEAN"),
            F.sum(col).alias(f"CC_{col}_SUM"),
            F.variance(col).alias(f"CC_{col}_VAR")
        ])
    # Count rows in the same aggregation instead of a second groupBy plus join.
    # This also keeps agg() non-empty when the table has no numeric columns.
    agg_exprs.append(F.count(F.lit(1)).alias("CC_COUNT"))
    return cc.groupBy("SK_ID_CURR").agg(*agg_exprs)

In [0]:
# Cell 10: LightGBM Modeling with PySpark and MLflow Integration (signature only, no input_example)
# Purpose: Trains LightGBM model with KFold cross-validation, logs parameters, metrics, and model artifacts to MLflow for experiment tracking and reproducibility.
# MLflow integration added for tracking and model management. Additional metrics (accuracy, precision, recall, F1, log loss) are logged for each fold and overall.
import mlflow
import mlflow.lightgbm
from mlflow.models import infer_signature

def _binary_classification_metrics(y_true, proba, threshold=0.5):
    """Score positive-class probabilities against binary labels.

    Hard predictions are ``proba > threshold``. The returned dict is ordered
    auc, accuracy, precision, recall, f1, logloss; the keys double as the
    metric-name suffixes logged to MLflow.
    """
    from sklearn.metrics import (
        accuracy_score, f1_score, log_loss, precision_score, recall_score, roc_auc_score,
    )

    preds_binary = (proba > threshold).astype(int)
    return {
        'auc': roc_auc_score(y_true, proba),
        'accuracy': accuracy_score(y_true, preds_binary),
        'precision': precision_score(y_true, preds_binary),
        'recall': recall_score(y_true, preds_binary),
        'f1': f1_score(y_true, preds_binary),
        'logloss': log_loss(y_true, proba),
    }


def _log_oof_diagnostic_figures(y_true, oof_preds):
    """Log ROC, confusion-matrix, precision-recall and histogram figures to MLflow.

    Each figure is closed right after logging so it does not stay open in
    pyplot. The confusion matrix uses a 0.5 probability threshold.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve

    # ROC curve
    fpr, tpr, _ = roc_curve(y_true, oof_preds)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    mlflow.log_figure(plt.gcf(), "roc_curve.png")
    plt.close()

    # Confusion matrix
    cm = confusion_matrix(y_true, (oof_preds > 0.5).astype(int))
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    mlflow.log_figure(plt.gcf(), "confusion_matrix.png")
    plt.close()

    # Precision-Recall curve
    precision, recall, _ = precision_recall_curve(y_true, oof_preds)
    plt.figure()
    plt.plot(recall, precision, label='Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    mlflow.log_figure(plt.gcf(), "precision_recall_curve.png")
    plt.close()

    # Histogram of predicted probabilities
    plt.figure()
    plt.hist(oof_preds, bins=50, color='skyblue', edgecolor='black')
    plt.title('Histogram of Predicted Probabilities')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Frequency')
    mlflow.log_figure(plt.gcf(), "predicted_prob_histogram.png")
    plt.close()


def kfold_lightgbm_pyspark(df, num_folds, stratified=True, explain=False, submission_file_name='submission.csv'):
    """Train LightGBM with K-fold cross-validation and track the run in MLflow.

    Rows with a non-null ``TARGET`` are the training set; rows with a null
    ``TARGET`` are the test set. Folds are assigned in Spark, then both sets
    are collected to pandas for training.

    Args:
        df: Spark DataFrame holding train and test rows with all features.
        num_folds: Number of folds; must be at least 2.
        stratified: If True, folds are assigned within each ``TARGET`` value;
            otherwise rows are split into ``num_folds`` tiles of a random order.
        explain: If True, draw a SHAP summary plot for each fold's validation data.
        submission_file_name: CSV path for the averaged test predictions.

    Returns:
        pandas DataFrame with columns ``feature``, ``importance`` and ``fold``
        (one block of rows per fold).

    Raises:
        ValueError: If ``num_folds`` is smaller than 2.

    Side effects:
        Writes the submission CSV, starts an MLflow run and logs parameters,
        per-fold and overall metrics, the submission file, the model from the
        LAST fold (not an ensemble), and diagnostic figures. Calls
        ``display_importances`` with the collected importances.
    """
    import gc
    import numpy as np
    import pandas as pd
    import lightgbm as lgb
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    # With a single fold the training split (fold != 0) would be empty.
    if num_folds < 2:
        raise ValueError(f"num_folds must be at least 2, got {num_folds}")

    # Split train and test using PySpark
    train_df = df.filter(F.col('TARGET').isNotNull())
    test_df = df.filter(F.col('TARGET').isNull())

    train_count = train_df.count()
    test_count = test_df.count()
    print(f"Starting LightGBM. Train count: {train_count}, test count: {test_count}")

    # Assign folds in Spark so the split is seeded (1001) before collection.
    if stratified:
        # Number rows randomly within each TARGET value, then deal them round-robin.
        window_spec = Window.partitionBy('TARGET').orderBy(F.rand(seed=1001))
        train_df = train_df.withColumn('row_num', F.row_number().over(window_spec))
        train_df = train_df.withColumn('fold', (F.col('row_num') % num_folds).cast('int'))
    else:
        train_df = train_df.withColumn('rand', F.rand(seed=1001))
        train_df = train_df.withColumn('fold', (F.ntile(num_folds).over(Window.orderBy('rand')) - 1).cast('int'))

    train_pd = train_df.toPandas()
    test_pd = test_df.toPandas()

    # Object columns are converted to pandas categoricals before training.
    for col in train_pd.select_dtypes(include=['object']).columns:
        if col not in ['fold']:
            train_pd[col] = train_pd[col].astype('category')
    for col in test_pd.select_dtypes(include=['object']).columns:
        test_pd[col] = test_pd[col].astype('category')

    del train_df, test_df, df
    gc.collect()

    oof_preds = np.zeros(train_pd.shape[0])
    sub_preds = np.zeros(test_pd.shape[0])
    # Collected per fold and concatenated once after the loop.
    fold_importances = []
    # Identifier columns and fold-assignment helper columns are not features.
    non_feature_columns = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'fold', 'row_num', 'rand']
    feats = [f for f in train_pd.columns if f not in non_feature_columns]

    params = {
        'nthread': 4,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'verbose': -1
    }

    # Start MLflow run for experiment tracking
    with mlflow.start_run():
        mlflow.log_param("num_folds", num_folds)
        mlflow.log_param("stratified", stratified)
        for k, v in params.items():
            mlflow.log_param(k, v)

        model = None  # stays bound after the loop so the last fold's model can be logged
        for fold_num in range(num_folds):
            print(f'\n--- Fold {fold_num + 1} / {num_folds} ---')
            valid_mask = (train_pd['fold'] == fold_num).to_numpy()
            train_mask = ~valid_mask
            train_x = train_pd.loc[train_mask, feats]
            train_y = train_pd.loc[train_mask, 'TARGET']
            valid_x = train_pd.loc[valid_mask, feats]
            valid_y = train_pd.loc[valid_mask, 'TARGET']

            model = lgb.LGBMClassifier(**params)
            model.fit(
                train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                callbacks=[lgb.early_stopping(100, verbose=False)]
            )
            oof_preds[valid_mask] = model.predict_proba(valid_x, num_iteration=model.best_iteration_)[:, 1]
            # Test predictions are the plain average over the fold models.
            sub_preds += model.predict_proba(test_pd[feats], num_iteration=model.best_iteration_)[:, 1] / num_folds

            fold_importances.append(pd.DataFrame({
                'feature': feats,
                'importance': model.feature_importances_,
                'fold': fold_num + 1
            }))

            fold_metrics = _binary_classification_metrics(valid_y, oof_preds[valid_mask])
            print(f"Fold {fold_num + 1} AUC : {fold_metrics['auc']:.6f}, Accuracy: {fold_metrics['accuracy']:.4f}, Precision: {fold_metrics['precision']:.4f}, Recall: {fold_metrics['recall']:.4f}, F1: {fold_metrics['f1']:.4f}, LogLoss: {fold_metrics['logloss']:.4f}")
            for metric_name, metric_value in fold_metrics.items():
                mlflow.log_metric(f"fold_{fold_num+1}_{metric_name}", metric_value)

            if explain:
                import shap
                explainer = shap.TreeExplainer(model)
                shap_values = explainer.shap_values(valid_x)
                shap.summary_plot(shap_values, valid_x, feature_names=feats, show=False)
            del train_x, train_y, valid_x, valid_y
            gc.collect()

        feature_importance_df = pd.concat(fold_importances, axis=0)

        # Overall metrics are computed on the out-of-fold predictions.
        overall_metrics = _binary_classification_metrics(train_pd['TARGET'], oof_preds)
        print(f"\nFull AUC score: {overall_metrics['auc']:.6f}, Accuracy: {overall_metrics['accuracy']:.4f}, Precision: {overall_metrics['precision']:.4f}, Recall: {overall_metrics['recall']:.4f},            F1: {overall_metrics['f1']:.4f}, LogLoss: {overall_metrics['logloss']:.4f}")
        for metric_name, metric_value in overall_metrics.items():
            mlflow.log_metric(f"overall_{metric_name}", metric_value)

        submission_df = test_pd[['SK_ID_CURR']].copy()
        submission_df['TARGET'] = sub_preds
        submission_df.to_csv(submission_file_name, index=False)
        print(f'Submission saved to {submission_file_name}')
        mlflow.log_artifact(submission_file_name)

        # Log the last trained model together with an inferred signature.
        input_example = train_pd[feats].head(5)
        output_example = model.predict_proba(input_example)
        signature = infer_signature(input_example, output_example)
        mlflow.lightgbm.log_model(model, "model", signature=signature)

        _log_oof_diagnostic_figures(train_pd['TARGET'], oof_preds)
        display_importances(feature_importance_df)
    return feature_importance_df

def display_importances(feature_importance_df):
    """Plot the 40 features with the highest mean importance and log the plot.

    Note: a later cell defines ``display_importances`` again; when the
    notebook runs top to bottom that later definition replaces this one.

    Args:
        feature_importance_df: pandas DataFrame with at least the columns
            ``feature`` and ``importance`` (one row per feature per fold).

    Returns:
        Index of the selected feature names, ordered by descending mean importance.

    Side effects:
        Saves ``lgbm_importances.png``, shows the figure, and logs the file
        to MLflow as an artifact.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    mean_importance = (
        feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    cols = mean_importance[:40].index

    # Keep every per-fold row of the selected features for the bar plot.
    is_top_feature = feature_importance_df.feature.isin(cols)
    best_features = feature_importance_df.loc[is_top_feature]
    plot_data = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data, palette="viridis")
    plt.title('LightGBM Feature Importance (Top 40)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')
    plt.show()
    # Log feature importance plot to MLflow
    mlflow.log_artifact('lgbm_importances.png')
    return cols

In [0]:
# NOTE: The kfold_lightgbm function below is NOT used in the main pipeline.
# Reason: Superseded by kfold_lightgbm_pyspark (cell 10), which is optimized for PySpark and serverless compute.
# This function is kept for reference only and is commented out to avoid confusion.
# If you need a Pandas-based implementation, you may refer to this code.

# def kfold_lightgbm(df, num_folds, stratified=True, explain=False):
#     import gc
#     import numpy as np
#     import pandas as pd
#     from sklearn.model_selection import KFold, StratifiedKFold
#     from sklearn.metrics import roc_auc_score
#     import lightgbm as lgb
#
#     train_df = df.filter(F.col('TARGET').isNotNull())
#     test_df = df.filter(F.col('TARGET').isNull())
#     print("Starting LightGBM. Train count: {}, test count: {}".format(train_df.count(), test_df.count()))
#     train_pd = train_df.toPandas()
#     test_pd = test_df.toPandas()
#     # Minimal fix: convert object columns to category dtype
#     for col in train_pd.select_dtypes(include=['object']).columns:
#         train_pd[col] = train_pd[col].astype('category')
#     for col in test_pd.select_dtypes(include=['object']).columns:
#         test_pd[col] = test_pd[col].astype('category')
#     del train_df, test_df, df
#     gc.collect()
#
#     if stratified:
#         folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
#     else:
#         folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
#
#     oof_preds = np.zeros(train_pd.shape[0])
#     sub_preds = np.zeros(test_pd.shape[0])
#     feature_importance_df = pd.DataFrame()
#     feats = [f for f in train_pd.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV']]
#
#     params = {
#         'nthread': 4,
#         'n_estimators': 10000,
#         'learning_rate': 0.02,
#         'num_leaves': 34,
#         'colsample_bytree': 0.9497036,
#         'subsample': 0.8715623,
#         'max_depth': 8,
#         'reg_alpha': 0.041545473,
#         'reg_lambda': 0.0735294,
#         'min_split_gain': 0.0222415,
#         'min_child_weight': 39.3259775,
#         'silent': -1
#     }
#
#     for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_pd[feats], train_pd['TARGET'])):
#         train_x, train_y = train_pd[feats].iloc[train_idx], train_pd['TARGET'].iloc[train_idx]
#         valid_x, valid_y = train_pd[feats].iloc[valid_idx], train_pd['TARGET'].iloc[valid_idx]
#
#         model = lgb.LGBMClassifier(**params)
#         model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
#                   eval_metric='auc',  callbacks=[lgb.early_stopping(100)]
#         )
#
#         oof_preds[valid_idx] = model.predict_proba(valid_x, num_iteration=model.best_iteration_)[:, 1]
#         sub_preds += model.predict_proba(test_pd[feats], num_iteration=model.best_iteration_)[:, 1] / folds.n_splits
#
#         fold_importance_df = pd.DataFrame()
#         fold_importance_df["feature"] = feats
#         fold_importance_df["importance"] = model.feature_importances_
#         fold_importance_df["fold"] = n_fold + 1
#         feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#         print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
#
#         if explain:
#             import shap
#             explainer = shap.TreeExplainer(model)
#             shap_values = explainer.shap_values(valid_x)
#             shap.summary_plot(shap_values, valid_x, feature_names=feats, show=False)
#
#         del model
#         del train_x, train_y, valid_x, valid_y
#         gc.collect()
#
#     print('Full AUC score %.6f' % roc_auc_score(train_pd['TARGET'], oof_preds))
#     test_pd['TARGET'] = sub_preds
#     test_pd[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
#     display_importances(feature_importance_df)
#     return feature_importance_df


In [0]:
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance across folds.

    This definition shares its name with the one next to
    ``kfold_lightgbm_pyspark`` and replaces it when the notebook runs in
    order. To stay consistent with that version it returns the selected
    features and logs the plot to MLflow, but only when a run is active, so
    calling it on its own never starts a new run.

    Args:
        feature_importance_df_: pandas DataFrame with at least the columns
            ``feature`` and ``importance`` (one row per feature per fold).

    Returns:
        Index of the selected feature names, ordered by descending mean importance.

    Side effects:
        Saves ``lgbm_importances-01.png`` and, inside an active MLflow run,
        logs that file as an artifact.
    """
    import matplotlib.pyplot as plt
    import mlflow
    import seaborn as sns

    cols = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:40]
        .index
    )
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    # Without this the importance plot never reaches the training run,
    # because this definition shadows the one that logged it.
    if mlflow.active_run() is not None:
        mlflow.log_artifact('lgbm_importances-01.png')
    return cols


In [0]:
def main(debug=False, submission_file_name='submission.csv'):
    """Run the full pipeline: build features, join them, train LightGBM.

    Args:
        debug: If True, every source table is capped at 100000 rows.
        submission_file_name: CSV path for the test predictions, forwarded to
            ``kfold_lightgbm_pyspark``. Defaults to the previously hard-coded
            ``'submission.csv'``.

    Returns:
        The feature-importance DataFrame returned by ``kfold_lightgbm_pyspark``.
    """
    num_rows = 100000 if debug else None
    df = application_train_test(num_rows)

    # (timer title, feature builder, label printed before the row count)
    feature_stages = [
        ("Process bureau and bureau_balance", bureau_and_balance, "Bureau df count:"),
        ("Process previous_applications", previous_applications, "Previous applications df count:"),
        ("Process POS-CASH balance", pos_cash, "Pos-cash balance df count:"),
        ("Process installments payments", installments_payments, "Installments payments df count:"),
        ("Process credit card balance", credit_card_balance, "Credit card balance df count:"),
    ]
    for title, build_features, count_label in feature_stages:
        with timer(title):
            features = build_features(num_rows)
            print(count_label, features.count())
            # Left join keeps every application row even without history in this table.
            df = df.join(features, how='left', on='SK_ID_CURR')
            del features
            gc.collect()

    with timer("Run LightGBM with kfold"):
        feature_importance = kfold_lightgbm_pyspark(
            df=df,
            num_folds=5,
            stratified=True,
            explain=False,
            submission_file_name=submission_file_name
        )
    return feature_importance


if __name__ == "__main__":
    submission_file_name = "submission_kernel26.csv"
    with timer("Full model run"):
        # Pass the configured name through; it was previously assigned but never used.
        main(debug=False, submission_file_name=submission_file_name)