# 1. Setup & Metadata

In [2]:
# phase: 02_data_understanding
# subphase: exploratory_analysis
# author: Franck Ngaha
# date: 2025-11-05

from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 2. Load dataset

In [3]:
# Fetch version 1 of the "credit-g" dataset from OpenML. as_frame=True asks
# scikit-learn to return pandas objects instead of NumPy arrays.
data = fetch_openml(name="credit-g", version=1, as_frame=True)
# `frame` is one DataFrame holding every column, including the `class` label.
df = data.frame

# Report the dimensions, then preview the first rows as the cell's rich output.
print(f"Shape: {df.shape}")
df.head()

Shape: (1000, 21)


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


# 3. Dataset overview

In [4]:
# Schema check: prints each column's dtype and non-null count to stdout.
df.info()

# Summary statistics for numeric and categorical columns alike, transposed so
# that each dataset column becomes one row of the displayed table.
df.describe(include="all").transpose()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         1000 non-null   category
 1   duration                1000 non-null   int64   
 2   credit_history          1000 non-null   category
 3   purpose                 1000 non-null   category
 4   credit_amount           1000 non-null   int64   
 5   savings_status          1000 non-null   category
 6   employment              1000 non-null   category
 7   installment_commitment  1000 non-null   int64   
 8   personal_status         1000 non-null   category
 9   other_parties           1000 non-null   category
 10  residence_since         1000 non-null   int64   
 11  property_magnitude      1000 non-null   category
 12  age                     1000 non-null   int64   
 13  other_payment_plans     1000 non-null   category
 14  housing                 1

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
checking_status,1000.0,4.0,no checking,394.0,,,,,,,
duration,1000.0,,,,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
credit_history,1000.0,5.0,existing paid,530.0,,,,,,,
purpose,1000.0,10.0,radio/tv,280.0,,,,,,,
credit_amount,1000.0,,,,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
savings_status,1000.0,5.0,<100,603.0,,,,,,,
employment,1000.0,5.0,1<=X<4,339.0,,,,,,,
installment_commitment,1000.0,,,,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
personal_status,1000.0,4.0,male single,548.0,,,,,,,
other_parties,1000.0,3.0,none,907.0,,,,,,,


# 4. Check missing values

In [5]:
# Count null entries per column. Only real nulls (NaN/None) are counted here;
# placeholder categories such as "no known savings" are ordinary values and
# are therefore not reported by this check.
missing = df.isna().sum()

# Keep only the columns that have at least one null, worst offenders first.
# An empty Series means no column contains nulls.
missing.loc[missing.gt(0)].sort_values(ascending=False)

Series([], dtype: int64)