# German Credit Data Set

https://www.openml.org/search?type=data&sort=runs&status=active&id=31

## Data Dictionary

| Column Name              | Non-Null Count | Dtype   |
|--------------------------|----------------|---------|
| checking_status          | 1000 non-null  |  object |
| duration                 | 1000 non-null  |  float64|
| credit_history           | 1000 non-null  |  object |
| purpose                  | 1000 non-null  |  object |
| credit_amount            | 1000 non-null  |  float64|
| savings_status           | 1000 non-null  |  object |
| employment               | 1000 non-null  |  object |
| installment_commitment   | 1000 non-null  |  float64|
| personal_status          | 1000 non-null  |  object |
| other_parties            | 1000 non-null  |  object |
| residence_since          | 1000 non-null  |  float64|
| property_magnitude       | 1000 non-null  |  object |
| age                      | 1000 non-null  |  float64|
| other_payment_plans      | 1000 non-null  |  object |
| housing                  | 1000 non-null  |  object |
| existing_credits         | 1000 non-null  |  float64|
| job                      | 1000 non-null  |  object |
| num_dependents           | 1000 non-null  |  float64|
| own_telephone            | 1000 non-null  |  object |
| foreign_worker           | 1000 non-null  |  object |
| class                    | 1000 non-null  |  object |

The column `class` is the target variable.

## Exploratory Data Analysis

In [1]:
import pandas as pd 
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff

In [None]:
# Notebook configuration: data/plot locations and pandas display options
import os

DATA_PATH = "./data/dataset_31_credit-g.arff"
PLOT_PATH = "./plots/"

# plt.savefig() does not create missing directories, so make sure the
# output folder exists before any plotting cell tries to write into it.
os.makedirs(PLOT_PATH, exist_ok=True)

# Disable truncation so DataFrames are displayed with every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

: 

In [None]:
# Read the ARFF data file at DATA_PATH into the DataFrame `df`
try:
    arff_file_data, meta = arff.loadarff(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise with an actionable message; the original error stays chained.
    raise FileNotFoundError(
        f"Could not find the ARFF file at '{DATA_PATH}'. "
        "Check DATA_PATH and that the dataset has been downloaded."
    ) from err
else:
    df = pd.DataFrame(arff_file_data)

    # Decode byte strings to regular strings. Only values that really are
    # bytes are decoded; anything else in an object column is left untouched.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].map(
            lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
        )

    # Report success only on the success path: a `finally` clause would also
    # run after a failed load, where `df` does not exist yet.
    print(f"Data loaded successfully. \nDataFrame shape: {df.shape[0]:,} rows and {df.shape[1]:,} columns.")

: 

In [None]:
# Preview the first ten records of the dataset
display(df.iloc[:10])

: 

In [None]:
# Checking dataset types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
df.info()

: 

In [None]:
# Count rows that are exact duplicates of an earlier row
duplicate_row_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_row_count}")

: 

In [None]:
# Summarise missing data per column: absolute count and share in percent
null_mask = df.isnull()
missing_counts = null_mask.sum()
missing_percent = null_mask.mean() * 100

if missing_counts.sum() == 0:
    # Nothing is missing anywhere in the dataset
    print('There are no missing values in the dataset.')
else:
    # Keep only the columns that actually contain missing values
    missing_columns = missing_counts[missing_counts > 0].index.tolist()

    # One line per affected column: name, count and percentage
    print("Columns with missing data:\n")
    for col in missing_columns:
        n_missing = missing_counts[col]
        pct_missing = missing_percent[col]
        print(f"{col:<15} : {n_missing:>6,} missing ({pct_missing:>5.2f}%)")

: 

In [None]:
# Descriptive statistics restricted to the numerical columns
numeric_summary = df.describe(include=[np.number])
display(numeric_summary)

: 

In [None]:
# Summary (count, unique, top, freq) of the object/category columns
categorical_frame = df.select_dtypes(include=['object', 'category'])
categorical_cols = categorical_frame.columns
display(categorical_frame.describe())

: 

In [None]:
# Names of all object-dtype columns
object_cols = list(df.select_dtypes(include=['object']).columns)

# Print the frequency of every distinct value, one column at a time,
# followed by a blank line as separator
for col in object_cols:
    print(f"--- Value counts for: {col} ---")
    print(df[col].value_counts(), end="\n\n")

: 

In [None]:
# Analysis of target column distribution
target_column = 'class'

# Use seaborn's dark grid theme for the plots
sns.set_style("darkgrid")

# Count plot of the target classes, one bar per class
plt.figure(figsize=(8, 5))
count_plot = sns.countplot(data=df, x=target_column, hue=target_column, legend=False, palette='colorblind')
plt.title('Distribution of Good and Bad Credit (Target)')
plt.xlabel('Good/Bad Credit')
plt.ylabel('Count')

# Write each bar's count slightly above its top edge
for bar in count_plot.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    count_plot.annotate(
        f"{bar_height:,.0f}",
        (bar_centre, bar_height),
        ha='center', va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.tight_layout()
plt.savefig(f"{PLOT_PATH}barchart_label.png")
plt.show()

: 

The target variable `class` has two categories: 'good' and 'bad' credit risk. The dataset is imbalanced, with 700 instances (70%) of 'good' credit risk and 300 instances (30%) of 'bad' credit risk.

## Univariate Analysis

In [None]:
# Build two plain lists of column names: numeric features and
# object/category features
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)

: 

In [None]:
# Show the numerical column names followed by how many there are
numerical_count = len(numerical_cols)
print(numerical_cols, f'Number of numerical columns: {numerical_count}', sep='\n')

: 

In [None]:
# Remove the target column from the categorical columns list.
# Filtering by name (instead of pop()) does not rely on the target being the
# last column and is safe to re-run: executing this cell twice no longer
# drops an additional feature.
categorical_cols = [col for col in categorical_cols if col != target_column]
print(categorical_cols)
print(f'Number of categorical columns: {len(categorical_cols)}')

: 

### Histograms for Numerical Features

In [None]:
# Create histograms with KDE for all numerical columns.
# The subplot grid is sized from the number of columns (3 per row, 4 inches
# of height per row) so the cell keeps working if the column count changes;
# for 7 columns this yields the same 3x3 grid and 15x12 figure as before.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 4 * n_grid_rows))
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    # Histogram with a kernel density estimate overlaid
    sns.histplot(data=df, x=col, kde=True, ax=ax, color='steelblue', edgecolor='black', alpha=0.7)

    # Annotate each non-empty bar with its frequency count
    for patch in ax.patches:
        height = patch.get_height()
        if height > 0:
            ax.annotate(f'{int(height)}',
                        xy=(patch.get_x() + patch.get_width() / 2, height),
                        ha='center', va='bottom',
                        fontsize=8,
                        xytext=(0, 2),
                        textcoords='offset points')

    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(axis='y', alpha=0.3)

# Remove the grid cells that did not receive a plot
for unused_ax in axes[len(numerical_cols):]:
    fig.delaxes(unused_ax)

plt.suptitle('Univariate Analysis: Numerical Variables', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig(PLOT_PATH + 'histograms_numerical.png', dpi=300, bbox_inches='tight')
plt.show()

: 

**Analysis of Numerical Features:**

- `duration` - Most credit durations were between 12 and 24 months. A small number of credits had durations longer than 48 months, with a maximum of 72 months.
- `credit_amount` - Right-skewed distribution, with most credits under Deutsche Mark 5,000.
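- `installment_commitment` - Most applicants have an installment commitment of 4, with smaller peaks at 2 and 3. (In the source data this attribute is the installment rate as a percentage of disposable income, coded 1-4, not a number of installments.)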
- `residence_since` - Most of the applicants have lived at their residence for 4 years.
- `age` - The distribution peaks at around 25-35 years old.
- `existing_credits` - Most people have 1 existing credit with the bank. There are a few with 2 or more.
- `num_dependents` - The vast majority have 1 dependent; some have 2. This suggests that most applicants are either young families or have limited financial responsibilities.


### Pie Charts for Categorical Features

In [None]:
# Echo the current list of categorical feature names (the notebook displays
# the value of a cell's last expression)
categorical_cols

: 

In [None]:
# Categorical features to draw as pie charts. 'purpose' is deliberately not
# listed here; it is plotted separately as a horizontal bar chart.
features_for_pie = [
    'checking_status',
    'credit_history',
    'savings_status',
    'employment',
    'personal_status',
    'other_parties',
    'property_magnitude',
    'other_payment_plans',
    'housing',
    'job',
    'own_telephone',
    'foreign_worker',
]

: 

In [None]:
# Draw one pie chart per entry of features_for_pie on a 4x3 grid
fig, axes = plt.subplots(4, 3, figsize=(15, 16))
fig.suptitle('Distribution of Categorical Features', fontsize=16, fontweight='bold', y=0.995)

# Work with a flat sequence of axes so a single index addresses each chart
axes = axes.flatten()

for idx, feature in enumerate(features_for_pie):
    ax = axes[idx]

    # Category frequencies, largest first
    value_counts = df[feature].value_counts().sort_values(ascending=False)

    # Only the percentage labels are needed afterwards; wedges and category
    # labels are discarded
    _wedges, _labels, pct_labels = ax.pie(
        value_counts.values,
        labels=value_counts.index,
        autopct='%.1f%%',
        startangle=90,
        textprops={'fontsize': 9},
    )

    # Make the percentage labels small, bold and white
    for pct_label in pct_labels:
        pct_label.set_fontsize(8)
        pct_label.set_fontweight('bold')
        pct_label.set_color('white')

    # Human-readable subplot title, e.g. 'checking_status' -> 'Checking Status'
    ax.set_title(feature.replace('_', ' ').title(), fontsize=10, fontweight='bold')

plt.tight_layout()

# Write the figure to disk, then show it
plt.savefig(f"{PLOT_PATH}categorical_features_pie_charts.png", dpi=300, bbox_inches='tight')
plt.show()

: 

In [None]:
# Horizontal bar chart of the 'purpose' column, drawn with seaborn
plt.figure(figsize=(10, 8))

# Frequency of each purpose, largest first
purpose_counts = df['purpose'].value_counts().sort_values(ascending=False)

# Passing the counts as x and the categories as y produces horizontal bars
ax = sns.barplot(x=purpose_counts.values, y=purpose_counts.index, color='steelblue', edgecolor='black', alpha=0.8)

# Put each count just to the right of the end of its bar
for row, count in enumerate(purpose_counts.values):
    ax.text(count + 5, row, f"{count}", va='center', fontweight='bold', fontsize=10)

# Axis labels, title and vertical grid lines
ax.set_xlabel('Count', fontsize=12, fontweight='bold')
ax.set_ylabel('Purpose', fontsize=12, fontweight='bold')
ax.set_title('Distribution of Credit Purpose', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()

# Write the figure to disk, then show it
plt.savefig(f"{PLOT_PATH}horizontal_bar_purpose.png", dpi=300, bbox_inches='tight')
plt.show()

: 

## Bivariate Analysis