## Benchmark the performance of LogRegCCD

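This notebook compares LogRegCCD against the scikit-learn `LogisticRegression` model, which serves as the baseline. Models are scored with F1 and balanced accuracy on held-out test splits.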

In [None]:
import sys
import os

# Resolve the parent of the current working directory and make sure it is on
# sys.path; the membership check keeps re-runs of this cell from adding
# duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

if project_root not in sys.path:
    sys.path.append(project_root)


from sklearn.linear_model import LogisticRegression  # noqa: E402
from sklearn.model_selection import train_test_split  # noqa: E402
from sklearn.metrics import balanced_accuracy_score, f1_score  # noqa: E402

from algorithm.ccd import LogRegCCD  # noqa: E402
from datasets.synthetic import generate_synthetic_data  # noqa: E402
from datasets.real import get_dataset_1, get_dataset_2  # noqa: E402

In [4]:
# Synthetic benchmark data. The positional arguments are passed through
# unchanged. NOTE(review): their meaning is defined by generate_synthetic_data,
# which is not visible here -- consider passing them by keyword.
synthetic_x, synthetic_y = generate_synthetic_data(0.5, 10000, 50, 1)

# Resolve the ARFF files relative to the repository root instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# Assumes the files live in <project_root>/datasets/, next to the `datasets`
# package imported in the first cell -- TODO confirm.
datasets_dir = os.path.join(project_root, "datasets")
real_x1, real_y1 = get_dataset_1(path=os.path.join(datasets_dir, "dataset_1.arff"))
real_x2, real_y2 = get_dataset_2(path=os.path.join(datasets_dir, "dataset_2.arff"))

In [5]:
# Hold out 20% of every dataset for evaluation. The fixed random_state keeps
# the three splits reproducible across runs.
(
    train_synthetic_x,
    test_synthetic_x,
    train_synthetic_y,
    test_synthetic_y,
) = train_test_split(synthetic_x, synthetic_y, test_size=0.2, random_state=42)

(
    train_real_x1,
    test_real_x1,
    train_real_y1,
    test_real_y1,
) = train_test_split(real_x1, real_y1, test_size=0.2, random_state=42)

(
    train_real_x2,
    test_real_x2,
    train_real_y2,
    test_real_y2,
) = train_test_split(real_x2, real_y2, test_size=0.2, random_state=42)

In [9]:
# scale the data
from sklearn.preprocessing import StandardScaler

# Each dataset gets its own scaler: the statistics are learned from the
# training split only and then applied to the matching test split. Keeping the
# fitted scalers separate avoids the trap of one shared instance that, after
# being refit, only remembers the statistics of the last dataset.
scaler_synthetic = StandardScaler()
train_synthetic_x = scaler_synthetic.fit_transform(train_synthetic_x)
test_synthetic_x = scaler_synthetic.transform(test_synthetic_x)

scaler_real1 = StandardScaler()
train_real_x1 = scaler_real1.fit_transform(train_real_x1)
test_real_x1 = scaler_real1.transform(test_real_x1)

scaler_real2 = StandardScaler()
train_real_x2 = scaler_real2.fit_transform(train_real_x2)
test_real_x2 = scaler_real2.transform(test_real_x2)

# Backward-compatible alias: the name `scaler` previously ended up fitted on
# the second real dataset, so it keeps pointing at that scaler.
scaler = scaler_real2

In [13]:
def report_scores(model, test_x, test_y, label):
    """Print the F1 and balanced accuracy of a fitted model on a test split.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_x: Test features passed to ``model.predict``.
        test_y: True labels corresponding to ``test_x``.
        label: Dataset name inserted into the printed lines.
    """
    # Predict once and reuse the result for both metrics.
    predictions = model.predict(test_x)
    print(f"f1 score {label}: ", f1_score(test_y, predictions))
    print(
        f"balanced accuracy score {label}: ",
        balanced_accuracy_score(test_y, predictions),
    )
    print("")


print("###### LogisticRegression ######")
lr_synthetic = LogisticRegression(max_iter=1000)
lr_synthetic.fit(train_synthetic_x, train_synthetic_y)
report_scores(lr_synthetic, test_synthetic_x, test_synthetic_y, "synthetic data")

lr_real1 = LogisticRegression(max_iter=1000)
lr_real1.fit(train_real_x1, train_real_y1)
report_scores(lr_real1, test_real_x1, test_real_y1, "real data 1")

lr_real2 = LogisticRegression(max_iter=1000)
lr_real2.fit(train_real_x2, train_real_y2)
report_scores(lr_real2, test_real_x2, test_real_y2, "real data 2")

print("###### LogRegCCD ######")
# NOTE(review): the saved output of this cell ends with
# "AttributeError: 'int' object has no attribute 'copy'" right after the
# LogRegCCD header, so this fit/predict path currently fails. The traceback is
# not part of the saved output; the cause is presumably inside LogRegCCD and
# needs to be confirmed there.
ccd_synthetic = LogRegCCD()
ccd_synthetic.fit(train_synthetic_x, train_synthetic_y)
report_scores(ccd_synthetic, test_synthetic_x, test_synthetic_y, "synthetic data")

###### LogisticRegression ######
f1 score synthetic data:  1.0
balanced accuracy score synthetic data:  1.0

f1 score real data 1:  0.727979274611399
balanced accuracy score real data 1:  0.7651528321019847

f1 score real data 2:  0.3076923076923077
balanced accuracy score real data 2:  0.5338435780913657

###### LogRegCCD ######


AttributeError: 'int' object has no attribute 'copy'