Initialization

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB

#load the data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
columns = ["checking_account_status", "duration", "credit_history", "purpose", "credit_amount",
           "savings_account_status", "employment_status", "installment_rate", "personal_status",
           "other_debtors", "residence_history", "property", "age", "other_installment_plans",
           "housing", "number_of_existing_credits", "job", "dependents", "telephone", "foreign_worker", "class"]
data = pd.read_csv(url, sep=" ", header=None, names=columns)

#define the feature columns and target column
feature_cols = ["checking_account_status", "duration", "credit_history", "purpose", "credit_amount",
                "savings_account_status", "employment_status", "installment_rate", "personal_status",
                "other_debtors", "residence_history", "property", "age", "other_installment_plans",
                "housing", "number_of_existing_credits", "job", "dependents", "telephone", "foreign_worker"]
target_col = "class"

#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data[feature_cols], data[target_col], test_size=0.2, random_state=42)

#one-hot encode the categorical features
cat_transformer = ColumnTransformer(transformers=[("onehot", OneHotEncoder(), [0, 2, 3, 5, 6, 8, 9, 11, 14, 16, 18, 19])], sparse_threshold = 0)
X_train = cat_transformer.fit_transform(X_train)
X_test = cat_transformer.transform(X_test)

#clculate the cost matrix
cost_m = [[0, 5],
          [1, 0]]
#the "problem" is labeling a Bad client Good (FP = 5) and not so much the other way around (FN = 1)
names = ['Random Forest', 'Linear SVM', 'Naive Bayes']
classifiers = [RandomForestClassifier(n_estimators=100, random_state=0),
               SVC(kernel='linear', probability=True), GaussianNB()]

Pure approach - no weights taken into consideration:

In [None]:

#Pure approach - no weights taken into consideration
for name, clf in zip(names, classifiers):
  print(name)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=["Good", "Bad"]))
  conf_m = confusion_matrix(y_test, y_pred) # transpose to align with slides
  print(conf_m)
  print(np.sum(conf_m * cost_m))


Random Forest
              precision    recall  f1-score   support

        Good       0.77      0.89      0.83       141
         Bad       0.58      0.36      0.44        59

    accuracy                           0.73       200
   macro avg       0.68      0.62      0.63       200
weighted avg       0.71      0.73      0.71       200

[[126  15]
 [ 38  21]]
113
Linear SVM
              precision    recall  f1-score   support

        Good       0.79      0.91      0.84       141
         Bad       0.65      0.41      0.50        59

    accuracy                           0.76       200
   macro avg       0.72      0.66      0.67       200
weighted avg       0.74      0.76      0.74       200

[[128  13]
 [ 35  24]]
100
Naive Bayes
              precision    recall  f1-score   support

        Good       0.84      0.65      0.74       141
         Bad       0.46      0.71      0.56        59

    accuracy                           0.67       200
   macro avg       0.65      0.68    

The linear SVM model has the best accuracy (76%) and cost (100). It also has a pretty good accuracy and recall for the "Good" class, which is the one we're most concerned in properly predicting.

When compared to the linear SVM model, the RandomForest model has greater recall for the "Bad" class but poorer precision and overall accuracy.

The Naive Bayes model is the least accurate (67%) and the most expensive (262). This implies that it misclassifies many cases and has a high cost of misclassification.

Weighted apporach


In [None]:
print("Weighted apporach:")
weights = np.zeros(y_train.shape[0])
weights[np.where(y_train == 1)] = 1;
weights[np.where(y_train == 2)] = 5;
for name, clf in zip(names, classifiers):
    print(name, "(Weighted apporach)")
    clf.fit(X_train, y_train, weights)
    pred_test = clf.predict(X_test)
    print(classification_report(y_test, pred_test, target_names=["Good", "Bad"]))
    conf_m = confusion_matrix(y_test, pred_test) # transpose to align with slides
    print(conf_m)
    loss = np.sum(conf_m * cost_m)
    print("%d\n" %loss)


Weighted apporach:
Random Forest (Weighted apporach)
              precision    recall  f1-score   support

        Good       0.80      0.93      0.86       141
         Bad       0.72      0.44      0.55        59

    accuracy                           0.79       200
   macro avg       0.76      0.68      0.70       200
weighted avg       0.78      0.79      0.77       200

[[131  10]
 [ 33  26]]
83

Linear SVM (Weighted apporach)
              precision    recall  f1-score   support

        Good       0.92      0.48      0.63       141
         Bad       0.42      0.90      0.57        59

    accuracy                           0.60       200
   macro avg       0.67      0.69      0.60       200
weighted avg       0.77      0.60      0.61       200

[[67 74]
 [ 6 53]]
376

Naive Bayes (Weighted apporach)
              precision    recall  f1-score   support

        Good       0.87      0.57      0.69       141
         Bad       0.44      0.80      0.57        59

    accuracy   

Compared to the unweighted approach, the weighted approach showed some improvements in the performance metrics. For instance, the Random Forest classifier's accuracy and recall for identifying the "Bad" class improved, as well as the f1-score for predicting the "Good" class.

However, the Linear SVM and Naive Bayes classifiers did not perform as well with the weighted strategy. The Linear SVM classifier's accuracy and recall for predicting the "Good" class decreased, while the Naive Bayes classifier's precision decreased for both classes. Additionally, except for the Random Forest classifier, all classifiers suffered greater losses with the weighted strategy.

In conclusion, the weighted strategy produced mixed results and may not always lead to better performance. Therefore, it's essential to carefully consider the trade-offs between different performance measures and the costs associated with misclassification when selecting a classification strategy.

(Over)sampling approach

In [None]:
print("(Over)sampling approach:")
sampler = RandomOverSampler(sampling_strategy={1: 559, 2: 559}, random_state=1)
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
#print(Counter(y_rs))
for name, clf in zip(names, classifiers):
    print(name, "(Over)sampling approach")
    model = clf.fit(X_rs, y_rs)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=["Good", "Bad"]))
    conf_m = confusion_matrix(y_test, y_pred) # transpose to align with slides
    print(conf_m)
    loss = np.sum(conf_m * cost_m)
    print("%d\n" %loss)

(Over)sampling approach:
Random Forest (Over)sampling approach
              precision    recall  f1-score   support

        Good       0.81      0.82      0.81       141
         Bad       0.55      0.53      0.54        59

    accuracy                           0.73       200
   macro avg       0.68      0.67      0.68       200
weighted avg       0.73      0.73      0.73       200

[[116  25]
 [ 28  31]]
153

Linear SVM (Over)sampling approach
              precision    recall  f1-score   support

        Good       0.91      0.62      0.73       141
         Bad       0.48      0.85      0.61        59

    accuracy                           0.69       200
   macro avg       0.69      0.73      0.67       200
weighted avg       0.78      0.69      0.70       200

[[87 54]
 [ 9 50]]
279

Naive Bayes (Over)sampling approach
              precision    recall  f1-score   support

        Good       0.89      0.54      0.67       141
         Bad       0.43      0.85      0.57        

The findings of the (over)sampling strategy show that it outperforms the initial imbalanced dataset, although probably not as well as the weighted approach. The accuracy, recall, and f1-score for the minority class (Bad) for the Random Forest classifier increased marginally when compared to the initial imbalanced dataset.

However, when compared to the weighted strategy, the performance of the Linear SVM and Naive Bayes classifiers declined. It's worth mentioning that the (over)sampling strategy has a smaller loss than the initial imbalanced dataset but a bigger loss than the weighted approach.

Cost minimization without probability calibration

In [None]:
print("Cost minimization without probability calibration")
for name, clf in zip(names, classifiers):
    print(name, "cost minimization without probability calibration")
    model = clf.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)
    y_pred = np.argmin(np.matmul(y_pred_prob, np.array(cost_m)), axis=1) + 1
    print("Class difference:", set(y_test) - set(y_pred))
    print(classification_report(y_test, y_pred, target_names=["Good", "Bad"]))
    conf_m = confusion_matrix(y_test, y_pred) # transpose to align with slides
    print(conf_m)
    print(np.sum(conf_m * cost_m))

Cost minimization without probability calibration
Random Forest cost minimization without probability calibration
Class difference: {2}
              precision    recall  f1-score   support

        Good       0.70      1.00      0.83       141
         Bad       0.00      0.00      0.00        59

    accuracy                           0.70       200
   macro avg       0.35      0.50      0.41       200
weighted avg       0.50      0.70      0.58       200

[[141   0]
 [ 59   0]]
59
Linear SVM cost minimization without probability calibration
Class difference: {2}
              precision    recall  f1-score   support

        Good       0.70      1.00      0.83       141
         Bad       0.00      0.00      0.00        59

    accuracy                           0.70       200
   macro avg       0.35      0.50      0.41       200
weighted avg       0.50      0.70      0.58       200

[[141   0]
 [ 59   0]]
59
Naive Bayes cost minimization without probability calibration
Class differe

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The cost minimization strategy without probability calibration appears to simply forecast all samples to belong to the majority class for the Random Forest and Linear SVM classifiers, resulting in very poor recall and F1-score for the minority class (Bad). The confusion matrix for both classifiers shows that all predicted samples are in the Good class, and the cost associated with this approach is simply the cost of misclassifying the minority class for all samples, which is equal to the number of minority class samples in the test set.

The cost minimization strategy without probability calibration, on the other hand, outperforms the Naive Bayes classifier and accurately predicts certain minority class samples. The confusion matrix reveals that the minority class has some genuine positive and false positive predictions, resulting in a non-zero cost. The classification report demonstrates that the minority class has superior accuracy, recall, and F1-score than the Random Forest and Linear SVM classifiers.

The warning "Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples" indicates that there are only samples of one class in the predicted values - this seems like a technical issue that I could not overcome.

Cost minimization with isotonic/sigmoid calibration

In [None]:
print("Cost minimization with isotonic/sigmoid calibration")
for name, clf in zip(names, classifiers):
    cc = CalibratedClassifierCV(clf, method="sigmoid", cv=3) #switch the "method" field to isotonic/sigmoid to try both
    print("Cost minimization (with sigmoid calibration):", name)
    model = cc.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)
    y_pred = np.argmin(np.matmul(y_pred_prob, np.array(cost_m)), axis=1) + 1
    print("Class difference:", set(y_test) - set(y_pred))
    print(classification_report(y_test, y_pred, target_names=["Good", "Bad"]))
    conf_m = confusion_matrix(y_test, y_pred) # transpose to align with slides
    print(conf_m)

Cost minimization with isotonic/sigmoid calibration
Cost minimization (with sigmoid calibration): Random Forest
Class difference: {2}
              precision    recall  f1-score   support

        Good       0.70      1.00      0.83       141
         Bad       0.00      0.00      0.00        59

    accuracy                           0.70       200
   macro avg       0.35      0.50      0.41       200
weighted avg       0.50      0.70      0.58       200

[[141   0]
 [ 59   0]]
Cost minimization (with sigmoid calibration): Linear SVM


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Class difference: {2}
              precision    recall  f1-score   support

        Good       0.70      1.00      0.83       141
         Bad       0.00      0.00      0.00        59

    accuracy                           0.70       200
   macro avg       0.35      0.50      0.41       200
weighted avg       0.50      0.70      0.58       200

[[141   0]
 [ 59   0]]
Cost minimization (with sigmoid calibration): Naive Bayes
Class difference: {2}
              precision    recall  f1-score   support

        Good       0.70      1.00      0.83       141
         Bad       0.00      0.00      0.00        59

    accuracy                           0.70       200
   macro avg       0.35      0.50      0.41       200
weighted avg       0.50      0.70      0.58       200

[[141   0]
 [ 59   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It appears that the classifiers are underperforming because they all have a precision of 0.0 for the "Bad" class and an F1-score of 0.0. This suggests that the classifiers are completely incapable of identifying the "Bad" class. Furthermore, several "UndefinedMetricWarning" messages again indicate that there are no expected samples for the "Bad" class, leaving the accuracy and F1-score undefined.