In [17]:
# Notebook-wide imports.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppresses every Python warning from this point on (no category or module
# filter is given), so deprecation notices will not be shown either.
warnings.filterwarnings('ignore')
# fetch_ucirepo is used below to download a UCI Machine Learning Repository dataset by id.
from ucimlrepo import fetch_ucirepo
from IPython.display import display

In [18]:
# Statlog (German Credit Data), UCI Machine Learning Repository dataset id 144:
# https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data
statlog_german_credit_data = fetch_ucirepo(id=144)

# Feature and target tables (pandas DataFrames) taken from the fetched bundle.
X, y = (
    statlog_german_credit_data.data.features,
    statlog_german_credit_data.data.targets,
)

# Show the dataset-level metadata first, then leave the per-variable table as
# the cell's final expression so it is rendered as the cell result.
display(statlog_german_credit_data.metadata)
statlog_german_credit_data.variables

{'uci_id': 144,
 'name': 'Statlog (German Credit Data)',
 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data',
 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv',
 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix',
 'area': 'Social Science',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 1000,
 'num_features': 20,
 'feature_types': ['Categorical', 'Integer'],
 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'],
 'target_col': ['class'],
 'index_col': None,
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': 1994,
 'last_updated': 'Thu Aug 10 2023',
 'dataset_doi': '10.24432/C5NC77',
 'creators': ['Hans Hofmann'],
 'intro_paper': None,
 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Attribute1,Feature,Categorical,,Status of existing checking account,,no
1,Attribute2,Feature,Integer,,Duration,months,no
2,Attribute3,Feature,Categorical,,Credit history,,no
3,Attribute4,Feature,Categorical,,Purpose,,no
4,Attribute5,Feature,Integer,,Credit amount,,no
5,Attribute6,Feature,Categorical,,Savings account/bonds,,no
6,Attribute7,Feature,Categorical,Other,Present employment since,,no
7,Attribute8,Feature,Integer,,Installment rate in percentage of disposable i...,,no
8,Attribute9,Feature,Categorical,Marital Status,Personal status and sex,,no
9,Attribute10,Feature,Categorical,,Other debtors / guarantors,,no


In [19]:
# Join the feature columns (X) and the target column (y) side by side so that
# every row carries its `class` label.
df = pd.concat([X, y], axis=1)

# Preview ten random rows. The fixed seed keeps the preview identical across
# "Restart Kernel -> Run All" executions; without it a different sample is
# drawn on every run.
df.sample(10, random_state=42)

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,...,Attribute12,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20,class
923,A12,12,A32,A40,2002,A61,A74,3,A93,A101,...,A122,30,A143,A151,1,A173,2,A192,A201,1
314,A13,4,A32,A40,1494,A65,A72,1,A93,A101,...,A121,29,A143,A152,1,A172,2,A191,A202,1
390,A14,18,A32,A40,1820,A61,A73,2,A94,A101,...,A122,30,A143,A152,1,A174,1,A192,A201,1
33,A14,12,A34,A49,1264,A65,A75,4,A93,A101,...,A124,57,A143,A151,1,A172,1,A191,A201,1
757,A13,15,A34,A43,1271,A65,A73,3,A93,A101,...,A124,39,A143,A153,2,A173,1,A192,A201,2
699,A13,15,A32,A46,1905,A61,A75,4,A93,A101,...,A123,40,A143,A151,1,A174,1,A192,A201,1
967,A14,15,A32,A43,3568,A61,A75,4,A92,A101,...,A123,54,A141,A151,1,A174,1,A192,A201,1
772,A14,21,A34,A41,3275,A61,A75,1,A93,A101,...,A123,36,A143,A152,1,A174,1,A192,A201,1
537,A12,18,A34,A42,3612,A61,A75,3,A92,A101,...,A122,37,A143,A152,1,A173,1,A192,A201,1
369,A12,18,A32,A42,3001,A61,A74,2,A92,A101,...,A121,40,A143,A151,1,A173,1,A191,A201,1


In [13]:
# Build a ydata-profiling report object over the combined features + target frame.
from ydata_profiling import ProfileReport

profile = ProfileReport(
    df,
    title="German Credit Data Profiling Report",
    explorative=True,
)

In [14]:
# Write the profiling report to an HTML file; the path is relative, so it
# resolves against the current working directory.
profile.to_file("german_credit_data_profile_report.html")

100%|██████████| 21/21 [00:00<00:00, 43952.29it/s]0:00,  8.36it/s, Describe variable: class]     
Summarize dataset: 100%|██████████| 39/39 [00:04<00:00,  8.60it/s, Completed]                       
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.58s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 66.50it/s]


In [20]:
# Show the feature table on its own (the target column is not part of X).
display(X)

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,Attribute11,Attribute12,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,4,A121,31,A143,A152,1,A172,1,A191,A201
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,4,A122,40,A143,A152,1,A174,1,A192,A201
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,4,A123,38,A143,A152,1,A173,1,A191,A201
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,4,A124,23,A143,A153,1,A173,1,A192,A201


In [21]:
# Show the target table (a single `class` column, per the dataset metadata).
display(y)

Unnamed: 0,class
0,1
1,2
2,1
3,1
4,2
...,...
995,1
996,1
997,1
998,2


In [32]:
# Re-import the local `utils` module so that edits made to it after it was
# first imported are picked up without restarting the kernel.
import importlib
import utils
importlib.reload(utils)
# The wildcard import has to run after the reload so the names it binds come
# from the freshly reloaded module.
# NOTE(review): the wildcard hides where `describe_dataset` comes from —
# presumably it is defined in utils; consider importing it by name once the
# set of names the rest of the notebook needs from utils is confirmed.
from utils import *

# Name of the first column of the target frame `y`.
target_col_name = y.columns[0]
# Helper not defined in this notebook; judging by the output it produces, it
# reports shape, dtypes, missing values, duplicate rows, summary statistics
# and the target distribution for `df`.
describe_dataset(df, target_col_name)

📊 Shape of Dataset:
Rows: 1000, Columns: 21

🔎 Data types of columns:


Attribute1     object
Attribute2      int64
Attribute3     object
Attribute4     object
Attribute5      int64
Attribute6     object
Attribute7     object
Attribute8      int64
Attribute9     object
Attribute10    object
Attribute11     int64
Attribute12    object
Attribute13     int64
Attribute14    object
Attribute15    object
Attribute16     int64
Attribute17    object
Attribute18     int64
Attribute19    object
Attribute20    object
class           int64
dtype: object


⚠️ Missing values per column:


Unnamed: 0,missing_count,missing_percent



📑 Duplicate rows count:
0

📈 Dataset summary:


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Attribute1,1000.0,4.0,A14,394.0,,,,,,,
Attribute2,1000.0,,,,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
Attribute3,1000.0,5.0,A32,530.0,,,,,,,
Attribute4,1000.0,10.0,A43,280.0,,,,,,,
Attribute5,1000.0,,,,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
Attribute6,1000.0,5.0,A61,603.0,,,,,,,
Attribute7,1000.0,5.0,A73,339.0,,,,,,,
Attribute8,1000.0,,,,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
Attribute9,1000.0,4.0,A93,548.0,,,,,,,
Attribute10,1000.0,3.0,A101,907.0,,,,,,,



🎯 Target variable info: class


count    1000.000000
mean        1.300000
std         0.458487
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         2.000000
Name: class, dtype: float64

Unnamed: 0_level_0,proportion
class,Unnamed: 1_level_1
1,0.7
2,0.3
