In [1]:
import pathlib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [2]:
# Dataset path
dataset_folder = pathlib.Path("../../Datasets")
dataset_file = "kdd2004.csv"
dataset_path = dataset_folder/dataset_file

In [3]:
df = pd.read_csv(dataset_path)

# A remapping of target classes
df["target"] = df["target"].map({-1:0, 1:1})

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
0,52.0,32.69,0.3,2.5,20.0,1256.8,-0.89,0.33,11.0,-55.0,...,1595.1,-1.64,2.83,-2.0,-50.0,445.2,-0.35,0.26,0.76,0
1,58.0,33.33,0.0,16.5,9.5,608.1,0.5,0.07,20.5,-52.5,...,762.9,0.29,0.82,-3.0,-35.0,140.3,1.16,0.39,0.73,0
2,77.0,27.27,-0.91,6.0,58.5,1623.6,-1.4,0.02,-6.5,-48.0,...,1491.8,0.32,-1.29,0.0,-34.0,658.2,-0.76,0.26,0.24,0
3,41.0,27.91,-0.35,3.0,46.0,1921.6,-1.36,-0.47,-32.0,-51.5,...,2047.7,-0.98,1.53,0.0,-49.0,554.2,-0.83,0.39,0.73,0
4,50.0,28.0,-1.32,-9.0,12.0,464.8,0.88,0.19,8.0,-51.5,...,479.5,0.68,-0.59,2.0,-36.0,-6.9,2.02,0.14,-0.23,0


In [4]:
df.shape

(145751, 75)

In [5]:
# Proportion of each class
df.target.value_counts(normalize=True)

0    0.991108
1    0.008892
Name: target, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(labels=["target"], axis=1),
    df["target"],
    test_size=0.3
)

In [7]:
X_train.shape, X_test.shape

((102025, 74), (43726, 74))

## Defining a baseline model + RF + Logistic Regression
Note: The modelling step is not at all the goal, so we just create models with default parameters/no tweaking

In [8]:
# Baseline prediction: Always predicting the majority class
y_train_baseline = pd.Series(np.zeros_like(y_train))
y_test_baseline = pd.Series(np.zeros_like(y_test))

In [9]:
rf = RandomForestClassifier(n_estimators=100, max_depth=2)
rf.fit(X_train, y_train)

y_train_rf = rf.predict_proba(X_train)[:1]
y_test_rf = rf.predict_proba(X_test)[:1]

In [10]:
logit = LogisticRegression(max_iter=1000)

logit.fit(X_train, y_train)

y_train_logit = logit.predict_proba(X_train)[:, 1]
y_test_logit = logit.predict_proba(X_test)[:, 1]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
print(f"Accuracy Baseline test: {accuracy_score(y_test, y_test_baseline)}")
print(f"Accuracy Random Forest test: {accuracy_score(y_test, rf.predict(X_test))}")
print(f"Accuracy Logistic Regression test: {accuracy_score(y_test, logit.predict(X_test))}")

Accuracy Baseline test: 0.9905090792663404
Accuracy Random Forest test: 0.9959520651328729
Accuracy Logistic Regression test: 0.9968897223619815


## Percentage of the minority class correctly classified

In [12]:
def minority_accuracy(y_true, y_pred):
    """Calculate accuracy of predicting the minority class.
    We suppose that the minority class is class 1.
    """
    return 100 * np.sum((y_true == 1) & (y_pred == 1)) / np.sum(y_true == 1)

In [13]:
print(f"% Accuracy on minority class - Baseline model: {minority_accuracy(y_test, y_test_baseline)}")
print(f"% Accuracy on minority class - Random Forest model: {minority_accuracy(y_test, rf.predict(X_test))}")
print(f"% Accuracy on minority class - Logistic Regression model: {minority_accuracy(y_test, logit.predict(X_test))}")

% Accuracy on minority class - Baseline model: 0.0
% Accuracy on minority class - Random Forest model: 57.83132530120482
% Accuracy on minority class - Logistic Regression model: 72.53012048192771
