# Load Dataset

In [1]:
from ucimlrepo import fetch_ucirepo

# fetch dataset 
df_taiwan_credit = fetch_ucirepo(id=350) 

df_credit = df_taiwan_credit.data.original.drop(columns='ID').copy()
df_credit.columns = df_taiwan_credit.variables['description'].values[1:]

In [2]:
df_credit.rename(columns={'default payment next month': 'IsDefault'}, inplace=True)
df_credit

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,IsDefault
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


# Train Simple Model

In [3]:
# 1. define model pipeline

from src.model_pipeline import ModelPipeline

pipeline = ModelPipeline(
    model_type='random_forest',  # choose among lr/rf/gbm/xgb/svm
    random_state=42
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 2. Prepare dataset

X, y, num_cols, cat_cols = pipeline.prepare_data(
    df=df_credit,
    target_column='IsDefault',
    exclude_columns=None,      # e.g. drop IDs or timestamps if present
    categorical_columns=None   # auto-detect cats if None
)

In [5]:
# 3. Split data train and test

splits = pipeline.split_data(X, y, test_size=0.3)

In [6]:
# 4. Build preprocessing + model pipeline

pipeline.build_pipeline(
    numerical_columns=num_cols,
    categorical_columns=cat_cols
)

In [7]:
# 5. Train on the training fold

pipeline.train(splits['X_train'], splits['y_train'])

In [8]:
# 6. Evaluate on the test fold
metrics = pipeline.evaluate(splits['X_test'], splits['y_test'])
print(
    f"Test set metrics -> Accuracy: {metrics['accuracy']:.4f}, "
    f"AUC: {metrics['auc']:.4f}"
)

Accuracy: 0.8149
AUC: 0.7593
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      7009
           1       0.64      0.37      0.47      1991

    accuracy                           0.81      9000
   macro avg       0.74      0.66      0.68      9000
weighted avg       0.80      0.81      0.80      9000

Test set metrics -> Accuracy: 0.8149, AUC: 0.7593


In [9]:
# 7. Perform 5-fold stratified cross-validation
cv_scores = pipeline.cross_validate(X, y, cv=5)
print(f"CV ROC AUC scores: {cv_scores}") 
print(f"Mean CV AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# 8. Inspect the top 10 features driving model predictions
fi = pipeline.get_feature_importance(X, y)
print(fi.head(10))

CV folds:  20%|██        | 1/5 [00:03<00:14,  3.66s/it]

Fold 1/5 ROC AUC: 0.7681


CV folds:  40%|████      | 2/5 [00:07<00:10,  3.63s/it]

Fold 2/5 ROC AUC: 0.7611


CV folds:  60%|██████    | 3/5 [00:10<00:07,  3.67s/it]

Fold 3/5 ROC AUC: 0.7661


CV folds:  80%|████████  | 4/5 [00:14<00:03,  3.66s/it]

Fold 4/5 ROC AUC: 0.7703


CV folds: 100%|██████████| 5/5 [00:18<00:00,  3.64s/it]

Fold 5/5 ROC AUC: 0.7594
CV ROC AUC scores: [0.76805263 0.76113376 0.76606944 0.77030573 0.75943212]
Mean CV AUC: 0.7650 ± 0.0041
           feature  importance
2       num__PAY_0    0.099956
1         num__AGE    0.064758
0   num__LIMIT_BAL    0.059330
8   num__BILL_AMT1    0.057924
9   num__BILL_AMT2    0.053264
10  num__BILL_AMT3    0.050530
14   num__PAY_AMT1    0.050205
11  num__BILL_AMT4    0.049206
13  num__BILL_AMT6    0.048536
12  num__BILL_AMT5    0.048023





In [10]:
# 9. Get the trained model

clf = pipeline.get_model()

# Model Diagnostic

In [11]:
from src.model_diagnostic import TestSuite

ts = TestSuite()

In [12]:
splits

{'X_train':        LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
 11018     160000    2          2         2   32      0      0      0      0   
 1710      100000    2          1         2   30      0      0      0      0   
 4618       80000    1          1         2   29      0      0      0      0   
 5482       20000    2          2         1   27      0      0     -1     -1   
 26187      50000    2          1         2   30     -2     -2     -2     -2   
 ...          ...  ...        ...       ...  ...    ...    ...    ...    ...   
 25780     200000    2          2         1   32      0      0     -2     -2   
 13921     120000    2          2         2   24      0      0      0      0   
 3794      120000    2          1         2   24      0      0      0      0   
 27565     360000    1          1         1   57      1     -2     -1     -1   
 27126     300000    1          1         1   35     -1     -1     -1     -1   
 
        PAY_5  ...  BILL_AM

In [13]:
ts.set_data(
    data=splits,
    target_col='IsDefault'
)

ValueError: Data dictionary must contain 'train' and 'test' keys

In [14]:
ts.set_model(model=clf)

In [15]:
result = ts.diagnose_residual_analysis(
    features='EDUCATION',
    use_prediction=True,
    dataset='test',
    random_state=123
)



ValueError: X has 23 features, but RandomForestClassifier is expecting 33 features as input.