In [1]:
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    recall_score,
    accuracy_score,
    balanced_accuracy_score,
)

from imblearn.metrics import (
    geometric_mean_score,
    make_index_balanced_accuracy,
)

In [2]:
datasets_folder = pathlib.Path("../../../Datasets")
dataset_file = "kdd2004.csv"
dataset_path = datasets_folder/dataset_file

if not dataset_path.exists():
    raise FileExistsError(f"The file whose path {dataset_path.resolve()} doesn't exist")

In [4]:
df = pd.read_csv(dataset_path)

df["target"] = df["target"].map({-1:0, 1:1})
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
0,52.0,32.69,0.3,2.5,20.0,1256.8,-0.89,0.33,11.0,-55.0,...,1595.1,-1.64,2.83,-2.0,-50.0,445.2,-0.35,0.26,0.76,0
1,58.0,33.33,0.0,16.5,9.5,608.1,0.5,0.07,20.5,-52.5,...,762.9,0.29,0.82,-3.0,-35.0,140.3,1.16,0.39,0.73,0
2,77.0,27.27,-0.91,6.0,58.5,1623.6,-1.4,0.02,-6.5,-48.0,...,1491.8,0.32,-1.29,0.0,-34.0,658.2,-0.76,0.26,0.24,0
3,41.0,27.91,-0.35,3.0,46.0,1921.6,-1.36,-0.47,-32.0,-51.5,...,2047.7,-0.98,1.53,0.0,-49.0,554.2,-0.83,0.39,0.73,0
4,50.0,28.0,-1.32,-9.0,12.0,464.8,0.88,0.19,8.0,-51.5,...,479.5,0.68,-0.59,2.0,-36.0,-6.9,2.02,0.14,-0.23,0


In [5]:
df["target"].value_counts(normalize=True)

0    0.991108
1    0.008892
Name: target, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels=["target"], axis=1),
    df["target"],
    test_size=0.3,
)

X_train.shape, X_test.shape

((102025, 74), (43726, 74))

In [8]:
# Baseline model
y_train_baseline = pd.Series(np.zeros_like(y_train))
y_test_baseline = pd.Series(np.zeros_like(y_test))

# Random forest
rf = RandomForestClassifier(n_estimators=100, max_depth=2)
rf.fit(X_train, y_train)

# Logistic regression
logit = LogisticRegression(max_iter=1000)
logit.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000)

In [9]:
# TPR = Recall = tp / (tp + fn)
print(f"Recall - Baseline model: {recall_score(y_test, y_test_baseline)}")
print(f"Recall - Random forest model: {recall_score(y_test, rf.predict(X_test))}")
print(f"Recall - Logistic regression model: {recall_score(y_test, logit.predict(X_test))}")

Recall - Baseline model: 0.0
Recall - Random forest model: 0.5800524934383202
Recall - Logistic regression model: 0.7007874015748031


In [11]:
# TNR = tn / (tn + fp)
# We can use recall_score function but the positive label should be the negative class
print(f"TNR - Baseline model: {recall_score(y_test, y_test_baseline, pos_label=0)}")
print(f"TNR - Random forest model: {recall_score(y_test, rf.predict(X_test), pos_label=0)}")
print(f"TNR - Logistic regression model: {recall_score(y_test, logit.predict(X_test), pos_label=0)}")

TNR - Baseline model: 1.0
TNR - Random forest model: 0.9999307878648056
TNR - Logistic regression model: 0.9995385857653709


In [12]:
# Geometric mean = sqrt(recall * TNR)
print(f"G-mean - Baseline model: {geometric_mean_score(y_test, y_test_baseline)}")
print(f"G-mean - Random forest model: {geometric_mean_score(y_test, rf.predict(X_test))}")
print(f"G-mean - Logistic regression model: {geometric_mean_score(y_test, logit.predict(X_test))}")

G-mean - Baseline model: 0.0
G-mean - Random forest model: 0.7615854165927316
G-mean - Logistic regression model: 0.836937302485836


In [13]:
# Dominance = TPR - TNR
def dominance(y_true, y_pred):
    tpr = recall_score(y_true, y_pred, pos_label=1)
    tnr = recall_score(y_true, y_pred, pos_label=0)
    return tpr - tnr

In [14]:
print(f"Dominance - Baseline model: {dominance(y_test, y_test_baseline)}")
print(f"Dominance - Random forest model: {dominance(y_test, rf.predict(X_test))}")
print(f"Dominance - Logistic regression model: {dominance(y_test, logit.predict(X_test))}")

Dominance - Baseline model: -1.0
Dominance - Random forest model: -0.4198782944264854
Dominance - Logistic regression model: -0.2987511841905678


In [15]:
# Index balanced accuracy applied on geometric mean
balanced_gmean = make_index_balanced_accuracy(alpha=0.5, squared=True)(geometric_mean_score)

print(f"Balanced geometric mean - Baseline model: {balanced_gmean(y_test, y_test_baseline)}")
print(f"Balanced geometric mean - Random forest model: {balanced_gmean(y_test, rf.predict(X_test))}")
print(f"Balanced geometric mean - Logistic regression model: {balanced_gmean(y_test, logit.predict(X_test))}")

Balanced geometric mean - Baseline model: 0.0
Balanced geometric mean - Random forest model: 0.5800123467667245
Balanced geometric mean - Logistic regression model: 0.7004640482922677


In [16]:
# Comparison of the accuracy and the balanced accuracy using IBA
print("---------- Accuracy -----------------")
print(f"Accuracy - Baseline model: {accuracy_score(y_test, y_test_baseline)}")
print(f"Accuracy - Random forest model: {accuracy_score(y_test, rf.predict(X_test))}")
print(f"Accuracy - Logistic regression model: {accuracy_score(y_test, logit.predict(X_test))}")

balanced_accuracy = make_index_balanced_accuracy(alpha=0.5, squared=True)(accuracy_score)

print("---------- Balanced Accuracy using IBA -----------------")
print(f"Accuracy - Baseline model: {balanced_accuracy(y_test, y_test_baseline)}")
print(f"Accuracy - Random forest model: {balanced_accuracy(y_test, rf.predict(X_test))}")
print(f"Accuracy - Logistic regression model: {balanced_accuracy(y_test, logit.predict(X_test))}")



---------- Accuracy -----------------
Accuracy - Baseline model: 0.991286648675845
Accuracy - Random forest model: 0.9962722407720807
Accuracy - Logistic regression model: 0.9969354617390112
---------- Balanced Accuracy -----------------
Accuracy - Baseline model: 0.4913246099214941
Accuracy - Random forest model: 0.7841815183523922
Accuracy - Logistic regression model: 0.8454188543668073
---------- Built-in Balanced Accuracy -----------------
Accuracy - Baseline model: 0.5
Accuracy - Random forest model: 0.7899916406515629
Accuracy - Logistic regression model: 0.850162993670087


## Balanced accuracy

Balanced accuracy is just the average of the model accuracy for each class.

![Balanced accuracy](../../../_assets/Balanced_accuracy.PNG)

Note: The image above is a screenshot from one of Andreas Mueller lectures: Applied ML 2020 - 09 - Model Evaluation and Metrics

In [None]:
print("---------- Built-in Balanced Accuracy -----------------")
print(f"Accuracy - Baseline model: {balanced_accuracy_score(y_test, y_test_baseline)}")
print(f"Accuracy - Random forest model: {balanced_accuracy_score(y_test, rf.predict(X_test))}")
print(f"Accuracy - Logistic regression model: {balanced_accuracy_score(y_test, logit.predict(X_test))}")