# Credit Risk Score


- Binary classification

$$g(x_{i}) \approx y_{i}$$

$$y_{i} \in \{0, 1\}$$

- $1$: Default
- $0$: No default

**Dataset**:

[kaggle-credit-scoring](https://www.kaggle.com/datasets/nightcrawler101/creditscoring-csv?select=CreditScoring.csv) or [github-credit-scoring](https://github.com/gastonstat/CreditScoring)


Install packages


In [1]:
# Install the notebook's dependencies at exactly pinned versions (`-q` keeps the output quiet).
!uv pip install -q \
    python-dotenv==1.2.1 \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    seaborn==0.13.2 \
    scikit-learn==1.7.1 \
    tqdm==4.67.1

Append the directory that contains the `notebooks` package to `sys.path`


In [None]:
import sys

# The directory three levels up is put on the import path; the
# `notebooks.python.utils...` import in the next cell relies on it.
# The membership check keeps re-runs of this cell from appending the same
# entry to `sys.path` over and over.
if "../../.." not in sys.path:
    sys.path.append("../../..")

In [3]:
import os
import pathlib
import random
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
from typing import Tuple, Union
import numpy as np
from numpy.typing import NDArray
import seaborn as sns
import datetime
import pickle
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from notebooks.python.utils.data_extraction.data_extraction import (
    KaggleDataExtractor,
    KaggleExtractionConfig,
)

pd.set_option("display.max_columns", None)

sns.set_style("darkgrid")
sns.set_theme(style="darkgrid")

%matplotlib inline

load_dotenv()  # Root directory .env file

True

## Utility scripts:

**KaggleDataExtractor**:

```py
--8<-- "docs/notebooks/python/utils/data_extraction/data_extraction.py"
```


Create data directory


In [None]:
# Relative directory that holds the dataset files used by this notebook.
DATA_DIR = pathlib.Path("data") / "credit-risk-score"

# Create the directory together with any missing parents; nothing happens
# when it already exists.
DATA_DIR.mkdir(parents=True, exist_ok=True)

Download dataset from Kaggle


In [None]:
# Kaggle credentials are read from the environment; `os.getenv` yields `None`
# for a variable that is not set.
username = os.getenv("KAGGLE_USERNAME")
api_token = os.getenv("KAGGLE_API_TOKEN")
file_name = "CreditScoring.csv"

# Local name of the downloaded file, defined once so that the extraction
# config and the existence check below cannot drift apart.
output_file_name = "credit-scoring.csv"

extractor = KaggleDataExtractor(username=username, api_token=api_token)

config = KaggleExtractionConfig(
    dataset_slug="nightcrawler101/creditscoring-csv",
    file_name=file_name,
    destination_path=DATA_DIR,
    output_file_name=output_file_name,
)

# Skip the download when the file is already present in DATA_DIR.
if not (DATA_DIR / output_file_name).is_file():
    extractor.download_dataset(config)

Pass notebook variables to a **shell** command


In [6]:
!head $DATA_DIR/credit-scoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


## Data Preparation


Load dataset


In [None]:
# Read the downloaded CSV into a DataFrame.
csv_path = DATA_DIR / "credit-scoring.csv"
df = pd.read_csv(csv_path)

# Preview the first two rows.
df.head(2)

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658


Inspect all columns at once


In [None]:
# Transpose the first three rows so every column is listed vertically.
df.iloc[:3].transpose()

Unnamed: 0,0,1,2
Status,1,1,2
Seniority,9,17,10
Home,1,1,2
Time,60,60,36
Age,30,58,46
Marital,2,3,2
Records,1,1,2
Job,3,1,3
Expenses,73,48,90
Income,129,131,200


Data summary


In [None]:
# One row per column of `df`: its dtype, up to six of its distinct values
# (in order of first appearance) and the number of distinct values.
df_summary = pd.DataFrame(
    {
        "column": df.columns,
        "dtype": df.dtypes.to_list(),
        "sample_unique": [values.unique()[:6] for _, values in df.items()],
        "n_unique": df.nunique().to_list(),
    }
)
df_summary

Unnamed: 0,column,dtype,sample_unique,n_unique
0,Status,int64,"[1, 2, 0]",3
1,Seniority,int64,"[9, 17, 10, 0, 1, 29]",47
2,Home,int64,"[1, 2, 5, 3, 6, 4]",7
3,Time,int64,"[60, 36, 12, 48, 18, 24]",11
4,Age,int64,"[30, 58, 46, 24, 26, 36]",50
5,Marital,int64,"[2, 3, 1, 4, 5, 0]",6
6,Records,int64,"[1, 2]",2
7,Job,int64,"[3, 1, 2, 0, 4]",5
8,Expenses,int64,"[73, 48, 90, 63, 46, 75]",94
9,Income,int64,"[129, 131, 200, 182, 107, 214]",353


Clean column names


In [None]:
# Normalise the headers: lower-case, with spaces turned into underscores.
df.columns = [label.lower().replace(" ", "_") for label in df.columns]

df.head(2)

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658


Decode the numerically encoded categorical variables


In [None]:
# Lookup tables that translate the integer codes found in the raw CSV into
# readable labels, one table per categorical column.
status_values = {
    1: "ok",
    2: "default",
    0: "unknown",
}
home_values = {
    1: "rent",
    2: "owner",
    3: "private",
    4: "ignore",
    5: "parents",
    6: "other",
    0: "unknown",
}
marital_values = {
    1: "single",
    2: "married",
    3: "widow",
    4: "separated",
    5: "divorced",
    0: "unknown",
}
records_values = {
    1: "no",
    2: "yes",
    0: "unknown",
}
job_values = {
    1: "fixed",
    2: "partime",
    3: "freelance",
    4: "others",
    0: "unknown",
}

# Column name -> lookup table, so every column is decoded by the same loop.
decoders = {
    "status": status_values,
    "home": home_values,
    "marital": marital_values,
    "job": job_values,
    "records": records_values,
}

for column, mapping in decoders.items():
    # Decode only while the column still holds numeric codes. Once it contains
    # labels, `map` would find no matching keys and replace every value with
    # NaN, so re-running this cell has to leave decoded columns alone.
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(mapping)

Inspect decoding results


In [None]:
# Show the first five rows to check that the codes were replaced by labels.
df.head(5)

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


Inspect value ranges


In [None]:
# Summary statistics of the numeric columns, rounded to whole numbers.
df.describe().round(0)

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0
