In [1]:
import streamlit as st
import pandas as pd
import numpy as np

In [2]:
file_path = 'german_credit_data.csv'
data = pd.read_csv(file_path)

In [19]:
data[20:]

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
20,48,male,2,own,little,unknown,2134,9,car
21,44,male,2,rent,quite rich,little,2647,6,radio/TV
22,48,male,1,rent,little,little,2241,10,car
23,44,male,2,own,moderate,moderate,1804,12,car
24,26,male,2,own,unknown,unknown,2069,10,furniture/equipment
...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,unknown,1736,12,furniture/equipment
996,40,male,3,own,little,little,3857,30,car
997,38,male,2,own,little,unknown,804,12,radio/TV
998,23,male,2,free,little,little,1845,45,radio/TV


In [4]:
data.drop(["Unnamed: 0"], axis=1,inplace= True)

In [5]:
data.isnull().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
dtypes: int64(4), object(5)
memory usage: 70.4+ KB


In [7]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

data['Saving accounts'].fillna('unknown', inplace=True)
data['Checking account'].fillna('unknown', inplace=True)

categorical_cols = ['Job', 'Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
data_encoded = data.copy()

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data_encoded[col] = le.fit_transform(data_encoded[col])
    label_encoders[col] = le  # Save encoders for later interpretation

scaler = StandardScaler()
numerical_cols = ['Age', 'Credit amount', 'Duration']
data_encoded[numerical_cols] = scaler.fit_transform(data_encoded[numerical_cols])

data_encoded.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,2.766456,1,2,1,4,0,-0.745131,-1.236478,5
1,-1.191404,0,2,1,0,1,0.949817,2.248194,5
2,1.183312,1,1,1,0,3,-0.416562,-0.738668,3
3,0.831502,1,2,0,0,0,1.634247,1.750384,4
4,1.535122,1,2,0,0,0,0.566664,0.256953,1


In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

df_original = pd.read_csv(file_path)
if 'Risk' in df_original.columns:
    target_col = 'Risk'
    risk_dist = df_original['Risk'].value_counts()
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df_original, x='Risk')
    plt.title('Distribution of Credit Risk')
    plt.xlabel('Credit Risk')
    plt.ylabel('Count')
    plt.show()
else:
    target_col = None

target_col

In [9]:
print(df_original.columns)
print(df_original.head())

Index(['Unnamed: 0', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
       'Checking account', 'Credit amount', 'Duration', 'Purpose'],
      dtype='object')
   Unnamed: 0  Age     Sex  Job Housing Saving accounts Checking account  \
0           0   67    male    2     own             NaN           little   
1           1   22  female    2     own          little         moderate   
2           2   49    male    1     own          little              NaN   
3           3   45    male    2    free          little           little   
4           4   53    male    2    free          little           little   

   Credit amount  Duration              Purpose  
0           1169         6             radio/TV  
1           5951        48             radio/TV  
2           2096        12            education  
3           7882        42  furniture/equipment  
4           4870        24                  car  


In [10]:
credit_threshold = data_encoded['Credit amount'].median()
duration_threshold = data_encoded['Duration'].median()

data_encoded['Risk'] = (
    (data_encoded['Credit amount'] > credit_threshold) &
    (data_encoded['Duration'] < duration_threshold)
).astype(int)

data_encoded['Risk'].value_counts()
data_encoded.head(50)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,2.766456,1,2,1,4,0,-0.745131,-1.236478,5,0
1,-1.191404,0,2,1,0,1,0.949817,2.248194,5,0
2,1.183312,1,1,1,0,3,-0.416562,-0.738668,3,0
3,0.831502,1,2,0,0,0,1.634247,1.750384,4,0
4,1.535122,1,2,0,0,0,0.566664,0.256953,1,0
5,-0.048022,1,1,0,4,3,2.050009,1.252574,3,0
6,1.535122,1,2,1,2,3,-0.154629,0.256953,4,0
7,-0.048022,1,3,2,0,1,1.303197,1.252574,1,0
8,2.238742,1,1,1,3,3,-0.075233,-0.738668,5,1
9,-0.663689,1,3,1,0,1,0.695681,0.754763,1,0


In [11]:
from sklearn.model_selection import train_test_split

X = data_encoded.drop(columns=['Risk'])
y = data_encoded['Risk']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       180
           1       1.00      1.00      1.00        20

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[180   0]
 [  0  20]]


In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

importances = model.feature_importances_
features = X.columns

sns.barplot(x=importances, y=features)
plt.title("Feature Importance - Random Forest")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

  plt.show()


In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
 
svm = SVC(probability=True, random_state=42)

In [16]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Grid Search
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
grid_search_svm.fit(X_train, y_train)
print("Best hyperparams:", grid_search_svm.best_params_)

best_svm = grid_search_svm.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best hyperparams: {'C': 10, 'gamma': 'auto', 'kernel': 'poly'}


In [17]:
y_pred_svm = best_svm.predict(X_test)

print(confusion_matrix(y_test, y_pred_svm))

print("Support Vector Machine Classification Report:")
print(classification_report(y_test, y_pred_svm))

[[178   2]
 [  6  14]]
Support Vector Machine Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       180
           1       0.88      0.70      0.78        20

    accuracy                           0.96       200
   macro avg       0.92      0.84      0.88       200
weighted avg       0.96      0.96      0.96       200



In [18]:
from sklearn.inspection import permutation_importance

# svm.fit(X, y)
result = permutation_importance(best_svm, X_test, y_test, n_repeats=10, random_state=42)
importances = result.importances_mean

# Create a DataFrame for better visualization
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values('Importance', ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title("Feature Importance - SVM")
plt.xlabel("Mean Decrease in Accuracy")
plt.ylabel("Features")
plt.tight_layout()
plt.show()

  plt.show()
