The dataset does not contain missing values, but we still need to perform feature encoding and EDA to understand the features and prepare them for modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Oversample the training split with SMOTE and report the class counts
# before and after resampling.
# NOTE(review): X_train / y_train are not defined anywhere above this cell;
# presumably it has to run after the train/test split cell — confirm cell order.
from imblearn.over_sampling import SMOTE

counts_before = y_train.value_counts()
print("Original class distribution:", counts_before)

# Fixed seed so the synthetic samples are reproducible between runs.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

counts_after = y_train.value_counts()
print("✅ After SMOTE, class distribution:", counts_after)


In [None]:
# Score the fitted model with ROC-AUC and draw its ROC curve on the test split.
# NOTE(review): model / X_test / y_test are not defined anywhere above this
# cell; presumably it has to run after the training cell — confirm cell order.
from sklearn.metrics import roc_auc_score, roc_curve

# ROC-AUC Score: use the second column of predict_proba as the score.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("✅ ROC-AUC Score:", roc_auc)

# Plot ROC Curve
roc_points = roc_curve(y_test, y_prob)
fpr, tpr, _ = roc_points
curve_label = f"ROC Curve (AUC={roc_auc:.2f})"

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


In [20]:
# Path of the raw data file read by the loading cell.
var = "german.data"

# Names applied to the fields of the header-less file, in file order.
# 'CreditRisk' (last) is the column later used as the prediction target.
columns = [
    "Status",
    "Duration",
    "CreditHistory",
    "Purpose",
    "CreditAmount",
    "Savings",
    "EmploymentSince",
    "InstallmentRate",
    "PersonalStatusSex",
    "OtherDebtors",
    "ResidenceSince",
    "Property",
    "Age",
    "OtherInstallmentPlans",
    "Housing",
    "NumberExistingCredits",
    "Job",
    "LiablePeople",
    "Telephone",
    "ForeignWorker",
    "CreditRisk",
]


In [19]:
# Load the data: fields are separated by runs of whitespace and the file has
# no header row, so the names in `columns` are supplied explicitly.
read_options = {
    "sep": r'\s+',
    "header": None,
    "names": columns,
}
df = pd.read_csv(var, **read_options)

# Preview the first rows.
df.head()

  df = pd.read_csv(var, delim_whitespace=True, header=None, names=columns)


Unnamed: 0,Status,Duration,CreditHistory,Purpose,CreditAmount,Savings,EmploymentSince,InstallmentRate,PersonalStatusSex,OtherDebtors,...,Property,Age,OtherInstallmentPlans,Housing,NumberExistingCredits,Job,LiablePeople,Telephone,ForeignWorker,CreditRisk
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
df.info()

# Then list how many distinct values each column holds.
unique_counts = df.nunique()
print("\nUnique values per column:", unique_counts, sep="\n")


In [None]:
# Bar chart of how many rows fall into each CreditRisk value.
tick_positions = [0, 1]
tick_labels = ['Good Credit (1)', 'Bad Credit (2)']

plt.figure(figsize=(5, 4))
sns.countplot(x='CreditRisk', data=df)
plt.title('Target Variable Distribution')
# Replace the raw tick values with readable class names.
plt.xticks(tick_positions, tick_labels)
plt.show()


In [None]:
# Histograms (20 bins each) of the three numeric columns listed below.
num_cols = ['Duration', 'CreditAmount', 'Age']

numeric_frame = df[num_cols]
numeric_frame.hist(bins=20, color='skyblue', figsize=(10, 5))
plt.suptitle("Distribution of Numerical Features")
plt.show()


In [None]:
# Print the frequency of every category for a handful of categorical columns.
cat_cols = ['Status', 'CreditHistory', 'Purpose', 'PersonalStatusSex', 'Savings']

for col in cat_cols:
    # Header line followed by the counts on the next line.
    print(f"\n{col} value counts:", df[col].value_counts(), sep="\n")


In [None]:
# Correlation heatmap showing how the features relate to each other.
from sklearn.preprocessing import LabelEncoder

# Encode into a copy so `df` itself keeps its original category codes.
df_encoded = df.copy()

# Encode every non-numeric column. Testing `dtype == 'object'` misses text
# columns when pandas infers its dedicated string dtype (the default from
# pandas 3), which would leave strings in the frame and make `.corr()` fail.
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df_encoded[col] = LabelEncoder().fit_transform(df[col])

# NOTE(review): the integer codes given to categories are arbitrary labels, so
# correlations involving encoded columns should be read with care.

# Correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(df_encoded.corr(), cmap='coolwarm', annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()


In [None]:

# Prepare `df` for modeling (in place): integer-encode the categorical columns,
# recode the target to 0/1, and standardize the numeric columns.

label_encoder = LabelEncoder()
for col in df.columns:
    # Encode every non-numeric column. Testing `dtype == 'object'` misses text
    # columns when pandas infers its dedicated string dtype (the default from
    # pandas 3), which would leave raw strings in the feature matrix.
    if not pd.api.types.is_numeric_dtype(df[col]):
        # The single encoder is refitted per column, so after the loop it only
        # holds the classes of the last encoded column.
        df[col] = label_encoder.fit_transform(df[col])

# Recode the target from {1, 2} to {0, 1}; any other value becomes NaN.
df['CreditRisk'] = df['CreditRisk'].map({1: 0, 2: 1})

# NOTE(review): the scaler is fitted on the full frame here, while the
# train/test split happens in a later cell, so test rows influence the scaling
# statistics — consider fitting on the training split only; confirm intent.
num_cols = ['Duration', 'CreditAmount', 'Age']
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Preview the transformed frame.
df.head()


In [None]:
# Separate the target column from the features, then hold out 30% of the rows
# for testing (fixed seed for a reproducible split).
y = df['CreditRisk']
X = df.drop(columns='CreditRisk')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Oversample the training split with SMOTE and report the class counts
# before and after resampling. Only the training data is resampled; the test
# split is left as it is.
from imblearn.over_sampling import SMOTE

counts_before = y_train.value_counts()
print("Original class distribution:", counts_before)

# Fixed seed so the synthetic samples are reproducible between runs.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

counts_after = y_train.value_counts()
print("✅ After SMOTE, class distribution:", counts_after)


In [None]:
# Report the (rows, columns) shape of each split.
train_shape = X_train.shape
test_shape = X_test.shape

print("Train size:", train_shape)
print("Test size:", test_shape)

In [None]:
# Fit a random forest on the training split; `fit` returns the estimator
# itself, so construction and training are chained.
model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# Predict labels for the held-out rows and display them.
y_pred = model.predict(X_test)
y_pred


In [None]:
# Compute the evaluation metrics on the test split first, then print them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


In [None]:
# Score the fitted model with ROC-AUC and draw its ROC curve on the test split.
from sklearn.metrics import roc_auc_score, roc_curve

# ROC-AUC Score: use the second column of predict_proba as the score.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print("✅ ROC-AUC Score:", roc_auc)

# Plot ROC Curve
roc_points = roc_curve(y_test, y_prob)
fpr, tpr, _ = roc_points
curve_label = f"ROC Curve (AUC={roc_auc:.2f})"

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


In [None]:
# Horizontal bar chart of the forest's feature importances, largest first.
importances = model.feature_importances_

# argsort is ascending, so flip it to get descending order.
indices = np.flip(np.argsort(importances))

ranked_scores = importances[indices]
ranked_features = X.columns[indices]

plt.figure(figsize=(12, 6))
sns.barplot(x=ranked_scores, y=ranked_features, palette="mako")
plt.title("Feature Importances from Random Forest")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
