In [1]:
import pandas as pd
data = pd.read_csv("/content/german_credit_risk.csv")
data

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Class
0,0,67,male,2,own,,little,1169,6,radio/TV,1
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,2
2,2,49,male,1,own,little,,2096,12,education,1
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,1
4,4,53,male,2,free,little,little,4870,24,car,2
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,1
996,996,40,male,3,own,little,little,3857,30,car,1
997,997,38,male,2,own,little,,804,12,radio/TV,1
998,998,23,male,2,free,little,little,1845,45,radio/TV,2


# **Data Preporocessing**

In [2]:
cat_cols = data.select_dtypes(include=['object']).columns
num_cols = data.select_dtypes(include=['float', 'int']).columns

**Simple Imputing**

In [3]:
from sklearn.impute import SimpleImputer
# Impute missing values in categorical columns with most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
data[cat_cols] = cat_imputer.fit_transform(data[cat_cols])
imputed_cat_cols = data[cat_cols]

# Impute missing values in numerical columns with median value
num_imputer = SimpleImputer(strategy='median')
data[num_cols] = num_imputer.fit_transform(data[num_cols])
imputed_num_cols = data[num_cols]

**Label Encoding**

In [4]:
from sklearn.preprocessing import LabelEncoder

for col in cat_cols:
    data[col] = data[col].astype(str)

label_encoder = LabelEncoder()

for col in cat_cols:
    data[col] = label_encoder.fit_transform(data[col])

cat_encoded = data[cat_cols]
num_df = data[num_cols]

data_preprocessed = pd.concat([cat_encoded, num_df], axis=1)

In [5]:
X = data_preprocessed.drop(['Class'], axis=1)
y = data_preprocessed['Class']

**Data Splitting**

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Model Development & Evaluation**

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the classifier
classifier = RandomForestClassifier(n_estimators=200, max_depth= 10, min_samples_leaf=2, min_samples_split=5, random_state=42)
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:", report)
print(confusion_matrix(y_test,y_pred))

Accuracy: 0.715
Classification Report:               precision    recall  f1-score   support

         1.0       0.73      0.94      0.82       141
         2.0       0.56      0.17      0.26        59

    accuracy                           0.71       200
   macro avg       0.64      0.56      0.54       200
weighted avg       0.68      0.71      0.66       200

[[133   8]
 [ 49  10]]
