In [197]:
from datetime import timedelta

import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

### Split dataset into training and test sets

In [None]:
# Read the source CSV into a DataFrame
df_original = pd.read_csv('../datasets/loan_with_features.csv')
# Sanity check: the DataFrame must contain more than one row
assert len(df_original) > 1

In [None]:
# Shuffle all rows reproducibly (fixed seed) so the positional split below is random
df = df_original.sample(frac=1, random_state=42)
# Assert that shuffling only reordered the rows: putting both frames back into
# index order must yield identical content (no row lost, duplicated or altered).
# Comparing just the first row is not a reliable check, because a perfectly
# valid shuffle may leave the first row where it was.
assert df.sort_index().equals(df_original.sort_index())

In [None]:
# Number of rows in the shuffled dataset
m = len(df)
# A split needs at least two rows
assert m > 1

split_ratio = 0.7
# Row position where the training part ends and the test part begins
split_at = round(m * split_ratio)
# Rows before the boundary (first 70%) form the training dataset,
# the remaining rows (last 30%) form the test dataset
train_df, test_df = df.iloc[:split_at], df.iloc[split_at:]
# Together the two parts must account for every row of the dataset
assert len(train_df) + len(test_df) == m

In [None]:
# Write the training and test datasets to CSV (row index is not saved)
export_targets = {
    '../datasets/train.csv': train_df,
    '../datasets/test.csv': test_df,
}
for csv_path, frame in export_targets.items():
    frame.to_csv(csv_path, index=False)

### Training

In [None]:
# Load the exported training CSV through AutoGluon's TabularDataset
train_data = TabularDataset('../datasets/train.csv')
# Preview the first five rows
train_data.head(5)

Loaded data from: ../datasets/train.csv | Columns = 14 / 14 | Rows = 202 -> 202


Unnamed: 0,loan_duration,loan_amount,loan_payments,days_between,account_frequency,avg_order_amount,avg_trans_amount,avg_trans_balance,n_trans,card_type,avg_salary,gender,age,loan_status
0,36,299088,8308.0,419 days,POPLATEK MESICNE,4842.6,12746.619048,51992.944444,126,,8110,F,28,B
1,12,30276,2523.0,498 days,POPLATEK MESICNE,2653.55,12061.10303,61382.915152,165,,9893,M,35,A
2,48,154416,3217.0,176 days,POPLATEK TYDNE,3216.7,9740.15625,46716.15625,32,,8968,F,24,A
3,12,92400,7700.0,459 days,POPLATEK MESICNE,4137.933333,9193.298077,42291.298077,104,classic,10177,F,23,A
4,12,42900,3575.0,646 days,POPLATEK MESICNE,5264.666667,11403.582857,50590.622857,350,,8541,F,37,A


In [None]:
MODEL_SAVE_PATH = 'AutogluonModels/final_deployment_optimized'


def _criterion_config(criterion, name_suffix, problem_types):
    """Build one tree-ensemble config for the given split criterion.

    'ag_args' carries AutoGluon-specific arguments:
      'name_suffix'   - appended to the model name for easier identification
      'problem_types' - restricts this configuration to the listed problem types
    """
    return {
        'criterion': criterion,
        'ag_args': {'name_suffix': name_suffix, 'problem_types': list(problem_types)},
    }


def _tree_ensemble_configs():
    """Return the criterion variants used for both Random Forest and Extra Trees.

    A fresh list is built on every call so the two model families never share
    the same config objects.
    """
    return [
        _criterion_config('gini', 'Gini', ('binary', 'multiclass')),
        _criterion_config('entropy', 'Entr', ('binary', 'multiclass')),
        # 'squared_error' is a regression criterion; its 'problem_types' entry limits
        # it to regression/quantile problems, so it is not meant for a class label.
        _criterion_config('squared_error', 'MSE', ('regression', 'quantile')),
    ]


# Model families (and their configurations) that AutoGluon is allowed to train.
# An empty dict means "use that model's default configuration".
hyperparameters = {
    'NN_TORCH': [{}],   # Neural network (Torch backend)
    'CAT': [{}],        # CatBoost
    'FASTAI': [{}],     # FastAI neural network
    'RF': _tree_ensemble_configs(),   # Random Forest
    'XT': _tree_ensemble_configs(),   # Extra Trees (same variants as Random Forest)
    # K-Nearest Neighbors: one model per neighbour-weighting scheme
    'KNN': [
        {'weights': weighting, 'ag_args': {'name_suffix': suffix}}
        for weighting, suffix in (('uniform', 'Unif'), ('distance', 'Dist'))
    ],
}

# Create the predictor: 'loan_status' is the target column and trained models
# are saved under MODEL_SAVE_PATH.
untrained_predictor = TabularPredictor(label='loan_status', path=MODEL_SAVE_PATH)

# Train on the training data with the model families defined above.
# NOTE(review): per the AutoGluon presets docs, 'optimize_for_deployment' keeps only
# the best model and trims saved artifacts - confirm for the installed version.
# NOTE(review): neither problem_type nor eval_metric is passed, so AutoGluon picks
# both itself; confirm the chosen metric suits this label's class balance.
predictor = untrained_predictor.fit(
    train_data,
    hyperparameters=hyperparameters,
    presets='optimize_for_deployment',
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.0
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 24.3.0: Thu Jan  2 20:22:00 PST 2025; root:xnu-11215.81.4~3/RELEASE_X86_64
CPU Count:          8
Memory Avail:       5.31 GB / 16.00 GB (33.2%)
Disk Space Avail:   48.90 GB / 233.47 GB (20.9%)
Presets specified: ['optimize_for_deployment']
Beginning AutoGluon training ...
AutoGluon will save models to "/Users/vincentcheng/Documents/data_science/loan-prediction-app-aws/jupyter-nb/AutogluonModels/final"
Train Data Rows:    202
Train Data Columns: 13
Label Column:       loan_status
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['B', 'A']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression',

### Prediction

In [None]:
# Load the exported test CSV through AutoGluon's TabularDataset
test_data = TabularDataset('../datasets/test.csv')

# Remove the label column so the model only sees the features,
# then predict a label for every test row
test_features = test_data.drop(columns=['loan_status'])
y_pred = predictor.predict(test_features)
# Show the first few predictions
y_pred.head()

Loaded data from: ../datasets/test.csv | Columns = 14 / 14 | Rows = 87 -> 87
If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


0    A
1    A
2    A
3    A
4    A
Name: loan_status, dtype: object

In [224]:
# Score the trained predictor on the test set (test_data still contains the
# 'loan_status' label column); the returned metrics dict is the cell's output.
# NOTE(review): silent=True presumably suppresses the printed report - confirm.
predictor.evaluate(test_data, silent=True)

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


{'accuracy': 0.8850574712643678,
 'balanced_accuracy': 0.5435064935064935,
 'mcc': 0.185184647595632,
 'roc_auc': 0.5701298701298704,
 'f1': 0.16666666666666666,
 'precision': 0.5,
 'recall': 0.1}

In [225]:
# Show the per-model leaderboard computed on the test set, which lists each
# model's test score (score_test) next to its validation score (score_val)
predictor.leaderboard(test_data)

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI,0.885057,0.902439,accuracy,0.126467,0.011165,0.93446,0.126467,0.011165,0.93446,1,True,1
1,WeightedEnsemble_L2,0.885057,0.902439,accuracy,0.128661,0.011901,1.004319,0.002194,0.000736,0.069859,2,True,2


In [226]:
# Keep only the test rows whose predicted label is "B"
y_pred.loc[y_pred.eq("B")]

37    B
44    B
Name: loan_status, dtype: object

In [232]:
# Sample test case: the row at position 37 of the test set, one of the rows
# predicted as "B" in the previous cell (the author treats status "B" as a loan
# that will not be paid off)
test_data.iloc[37]

loan_duration                      24
loan_amount                    189696
loan_payments                  7904.0
days_between                 134 days
account_frequency    POPLATEK MESICNE
avg_order_amount               3861.6
avg_trans_amount               4500.0
avg_trans_balance             14900.0
n_trans                             5
card_type                         NaN
avg_salary                       8441
gender                              F
age                                24
loan_status                         B
Name: 37, dtype: object

POPLATEK MESICNE: Monthly Issuance

In [228]:
# Find the calendar date that lies DAYS_BETWEEN days before a reference date.
# 134 is the `days_between` value of the sample row inspected above (test_data.iloc[37]).
# NOTE(review): 2025-04-26 looks like an arbitrary "as of" date - confirm.
REFERENCE_DATE = pd.to_datetime('2025-04-26')
DAYS_BETWEEN = 134

# `timedelta` comes from the notebook's top imports cell; `days=` is spelled out
# so the unit of the offset is explicit.
date_before_reference = REFERENCE_DATE - timedelta(days=DAYS_BETWEEN)
date_before_reference

Timestamp('2024-12-13 00:00:00')