In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Split dataset into train and test sets

In [2]:
# Load the loan table (with its feature columns) that every later cell works from.
df_original = pd.read_csv('../datasets/loan_with_features.csv')
# Guard: the shuffle and split below are meaningless with fewer than two rows.
assert len(df_original) > 1

In [3]:
# Shuffle the rows so the positional train/test split taken later is random.
# frac=1 draws every row exactly once; the fixed random_state makes the
# permutation reproducible from run to run.
df = df_original.sample(frac=1, random_state=42)

# The shuffle must be a pure permutation: same shape, same set of row labels,
# nothing dropped or duplicated.
assert df.shape == df_original.shape
assert df.index.sort_values().equals(df_original.index.sort_values())
# The row order as a whole must have changed. Comparing only the first row is
# fragile: it fails whenever row 0 happens to stay in place or has an
# identical duplicate, even though the shuffle itself is fine.
assert not df.index.equals(df_original.index)

In [4]:
# Positional 80/20 split of the shuffled frame: the leading 80% of rows become
# the training set, everything after that is held out for testing.
m = len(df)
assert m > 1

split_at = round(m * 0.8)
train_df, test_df = df.iloc[:split_at], df.iloc[split_at:]
# The two slices must cover every row exactly once.
assert len(train_df) + len(test_df) == m

In [5]:
# Persist both splits; index=False keeps the row labels out of the CSV files.
for split_df, csv_path in (
    (train_df, '../datasets/train.csv'),
    (test_df, '../datasets/test.csv'),
):
    split_df.to_csv(csv_path, index=False)

### Training

In [6]:
# Read the training split back from the file written above and preview its
# first five rows as the cell output.
train_data = TabularDataset('../datasets/train.csv')
train_data.head(n=5)

Unnamed: 0,loan_amount,loan_duration,loan_payments,loan_status,account_frequency,disp_type,card_type,gender,age,district_name,...,avg_monthly_amount_p12m,avg_monthly_balance_p12m,trans_type_prijem_count,trans_type_vyber_count,trans_type_vydaj_count,trans_operation_prevod_na_ucet_count,trans_operation_prevod_z_uctu_count,trans_operation_vklad_count,trans_operation_vyber_count,trans_operation_vyber_kartou
0,417060,60,6951.0,C,POPLATEK MESICNE,DISPONENT,,M,37,Rokycany,...,82443.894737,455256.947368,82.0,,66.0,16.0,,44.0,50.0,
1,64860,60,1081.0,C,POPLATEK MESICNE,OWNER,,F,19,Louny,...,25879.25,298932.45,39.0,,94.0,43.0,,21.0,51.0,
2,128988,36,3583.0,D,POPLATEK TYDNE,OWNER,,M,57,Strakonice,...,34452.222222,158263.611111,35.0,3.0,44.0,,,17.0,47.0,
3,74124,36,2059.0,C,POPLATEK TYDNE,OWNER,,F,57,Pribram,...,61160.166667,368516.722222,39.0,,83.0,33.0,,24.0,50.0,
4,168984,24,7041.0,B,POPLATEK MESICNE,OWNER,,M,19,Hl.m. Praha,...,90859.7,448431.15,74.0,8.0,78.0,,,36.0,86.0,


In [7]:
def _criterion_variants():
    """Build the per-criterion configs used by both the 'RF' and 'XT' entries.

    Every call returns a brand-new list of brand-new dicts, so the two model
    families never share mutable configuration objects.
    """
    return [
        {
            'criterion': 'gini',
            'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']},
        },
        {
            'criterion': 'entropy',
            'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']},
        },
        {
            # Restricted to regression/quantile problems via problem_types.
            'criterion': 'squared_error',
            'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']},
        },
    ]


# Model families to train, keyed by AutoGluon model identifier. Each value is
# a list of configurations; `{}` passes no overrides for that family.
# 'GBM' and 'XGB' are deliberately left out of this run.
hyperparameters = {
    'NN_TORCH': [{}],
    'CAT': [{}],
    'FASTAI': [{}],
    'RF': _criterion_variants(),
    'XT': _criterion_variants(),
    'KNN': [
        {'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}},
        {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}},
    ],
}

# Train a predictor for the 'loan_status' column. No output path is passed,
# so AutoGluon chooses the directory the models are saved in.
predictor = TabularPredictor(label='loan_status').fit(
    train_data,
    hyperparameters=hyperparameters,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250419_082259"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.0
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 24.3.0: Thu Jan  2 20:22:00 PST 2025; root:xnu-11215.81.4~3/RELEASE_X86_64
CPU Count:          8
Memory Avail:       5.80 GB / 16.00 GB (36.2%)
Disk Space Avail:   66.02 GB / 233.47 GB (28.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benc

### Prediction

In [8]:
# Load the held-out split written earlier.
test_data = TabularDataset('../datasets/test.csv')

# Predict from the feature columns only: the label column is removed first.
test_features = test_data.drop(columns=['loan_status'])
y_pred = predictor.predict(test_features)
y_pred.head()

Loaded data from: ../datasets/test.csv | Columns = 63 / 63 | Rows = 165 -> 165
If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


0    A
1    C
2    A
3    C
4    C
Name: loan_status, dtype: object

In [9]:
# Score the fitted predictor on the held-out test split (label column included);
# the metrics dict it returns is displayed as the cell output.
# NOTE(review): silent=True presumably stops evaluate() from printing the
# scores itself — confirm against the AutoGluon version in use.
predictor.evaluate(test_data, silent=True)

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


{'accuracy': 0.8909090909090909,
 'balanced_accuracy': 0.5453396422089707,
 'mcc': 0.7952546567341218}

In [10]:
# Show one row per trained model, scored on the held-out test split; the
# resulting table (with test-side columns such as score_test) is the cell output.
predictor.leaderboard(test_data)

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.915152,0.909774,accuracy,0.02818,0.00675,10.021138,0.02818,0.00675,10.021138,1,True,6
1,NeuralNetFastAI,0.890909,0.909774,accuracy,0.019128,0.014175,1.492366,0.019128,0.014175,1.492366,1,True,3
2,WeightedEnsemble_L2,0.890909,0.909774,accuracy,0.020787,0.015856,1.566555,0.001659,0.001681,0.074189,2,True,10
3,RandomForestEntr,0.884848,0.849624,accuracy,0.074504,0.054095,0.521017,0.074504,0.054095,0.521017,1,True,5
4,RandomForestGini,0.884848,0.87218,accuracy,0.079131,0.04898,0.621214,0.079131,0.04898,0.621214,1,True,4
5,ExtraTreesGini,0.872727,0.834586,accuracy,0.067207,0.057429,0.520666,0.067207,0.057429,0.520666,1,True,7
6,NeuralNetTorch,0.866667,0.879699,accuracy,0.031766,0.02711,6.468797,0.031766,0.02711,6.468797,1,True,9
7,ExtraTreesEntr,0.860606,0.827068,accuracy,0.065172,0.055493,0.515787,0.065172,0.055493,0.515787,1,True,8
8,KNeighborsDist,0.733333,0.661654,accuracy,0.002586,0.002605,0.016699,0.002586,0.002605,0.016699,1,True,2
9,KNeighborsUnif,0.690909,0.639098,accuracy,0.00348,0.077675,1.670568,0.00348,0.077675,1.670568,1,True,1
