In [5]:
import pandas as pd
import numpy as np
np.random.seed(1234)
import matplotlib.pyplot as plt
import seaborn as sns

# be careful with that:
import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, accuracy_score, confusion_matrix, recall_score, pairwise_distances_argmin_min
from sklearn.metrics import precision_score, f1_score
from sklearn.svm import OneClassSVM

# Training and Hyperparameter Tuning

## Import Data

In [2]:
# read the training data of data I from datasets folder
!python3 pca_pipeline_combine_gridid.py 200 '../datasets/train/data_heatmap_train.csv'

pca_df = pd.read_csv('temp/pca_df.csv')
pca_df = pca_df.iloc[:,1:]
pca_df_inp = pca_df.iloc[:,:-2]
y_train = pca_df['label'].values

Nb components:  200
Data directory:  ../datasets/train/data_heatmap_train.csv


## One Class SVM

In [6]:
pca_df_inp = pca_df.iloc[:,:-2]
best_acc = -float('inf')
best_dim = -1
best_recall = -1
best_prec = -1
best_f1 = -1

# try different values of PCA dimension
for pca_dim in np.linspace(2,200,198).astype(int):
    X = pca_df_inp.iloc[:,:pca_dim]
    
    oneclass = OneClassSVM(gamma = 'auto').fit(X)
    oneclass_labels = oneclass.predict(X)
    
    oneclass_labels = np.where(oneclass_labels == 1, 0,1)
    
    conf_mat   = confusion_matrix(y_train,oneclass_labels)
    acc        = accuracy_score(y_train,oneclass_labels)
    recall     = recall_score(y_train,oneclass_labels)
    prec       = precision_score(y_train,oneclass_labels)
    f1         = f1_score(y_train,oneclass_labels,average='macro')
    
    # store the current best macro f1 and other corresponding metrics
    if f1 > best_f1:
        best_acc = acc
        best_dim = pca_dim
        best_recall  = recall
        best_prec = prec
        best_f1 = f1
        
        best_oneclass_labels = oneclass_labels
        
        print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', 
              f' {conf_mat}')
        print(f'Accuracy for {pca_dim} PCA dimensions:', 
              f' {acc:.2f}')
        print(f'Recall for {pca_dim} PCA dimensions:', 
              f' {recall:.2f}')
        print(f'Precision for {pca_dim} PCA dimensions:', 
              f' {prec:.2f}')
        print(f'F1 for {pca_dim} PCA dimensions:', 
              f' {f1:.2f}')
        print(f'---------')

Confusion matrix for 2 PCA dimensions:
  [[609 484]
 [  0 119]]
Accuracy for 2 PCA dimensions:  0.60
Recall for 2 PCA dimensions:  1.00
Precision for 2 PCA dimensions:  0.20
F1 for 2 PCA dimensions:  0.52
---------
Confusion matrix for 14 PCA dimensions:
  [[614 479]
 [  0 119]]
Accuracy for 14 PCA dimensions:  0.60
Recall for 14 PCA dimensions:  1.00
Precision for 14 PCA dimensions:  0.20
F1 for 14 PCA dimensions:  0.53
---------
Confusion matrix for 39 PCA dimensions:
  [[617 476]
 [  0 119]]
Accuracy for 39 PCA dimensions:  0.61
Recall for 39 PCA dimensions:  1.00
Precision for 39 PCA dimensions:  0.20
F1 for 39 PCA dimensions:  0.53
---------
Confusion matrix for 51 PCA dimensions:
  [[618 475]
 [  0 119]]
Accuracy for 51 PCA dimensions:  0.61
Recall for 51 PCA dimensions:  1.00
Precision for 51 PCA dimensions:  0.20
F1 for 51 PCA dimensions:  0.53
---------
Confusion matrix for 153 PCA dimensions:
  [[700 393]
 [ 26  93]]
Accuracy for 153 PCA dimensions:  0.65
Recall for 153 PCA d

## KNN

In [7]:
pca_df_inp = pca_df.iloc[:,:-2]
kchoices = [1,2,5,10]
thres    = [90, 95, 99]
best_acc = -float('inf')
best_dim = -1
best_f1 = -float('inf')

# try different values of k, threshold and PCA dimension 
for pca_dim in np.linspace(2,200,100).astype(int):
    for ks in kchoices:
        for an_thres in thres:
            X   = pca_df_inp.iloc[:,:pca_dim]
            knn = NearestNeighbors(n_neighbors = ks, 
                                   algorithm = 'auto', 
                                   metric = 'euclidean')

            knn_fit = knn.fit(X)
            distances, indices = knn.kneighbors(X)
            anomaly_scores = distances.mean(axis=1)

            threshold  = np.percentile(anomaly_scores, an_thres)
            knn_labels = anomaly_scores > threshold

            conf_mat = confusion_matrix(y_train,knn_labels)
            acc        = accuracy_score(y_train,knn_labels)
            f1         = f1_score(y_train,knn_labels,average='macro')

            if f1 > best_f1:
                best_knn_labels = knn_labels
                best_acc = acc
                best_dim = pca_dim
                best_f1 = f1

                print(f'Confusion matrix for {pca_dim} PCA dimensions, {an_thres} threshold and {ks}-NN:\n', 
                      f' {conf_mat}')
                print(f'Accuracy for {pca_dim} PCA dimensions, {an_thres} threshold and {ks}-NN:', 
                      f' {acc:.2f}')
                print(f'F1 for {pca_dim} PCA dimensions, {an_thres} threshold and {ks}-NN:', 
                      f' {f1:.2f}')
                print(f'---------')

Confusion matrix for 2 PCA dimensions, 90 threshold and 1-NN:
  [[1093    0]
 [ 119    0]]
Accuracy for 2 PCA dimensions, 90 threshold and 1-NN:  0.90
F1 for 2 PCA dimensions, 90 threshold and 1-NN:  0.47
---------
Confusion matrix for 2 PCA dimensions, 90 threshold and 2-NN:
  [[1042   51]
 [  48   71]]
Accuracy for 2 PCA dimensions, 90 threshold and 2-NN:  0.92
F1 for 2 PCA dimensions, 90 threshold and 2-NN:  0.77
---------
Confusion matrix for 2 PCA dimensions, 90 threshold and 5-NN:
  [[1043   50]
 [  47   72]]
Accuracy for 2 PCA dimensions, 90 threshold and 5-NN:  0.92
F1 for 2 PCA dimensions, 90 threshold and 5-NN:  0.78
---------
Confusion matrix for 2 PCA dimensions, 90 threshold and 10-NN:
  [[1049   44]
 [  41   78]]
Accuracy for 2 PCA dimensions, 90 threshold and 10-NN:  0.93
F1 for 2 PCA dimensions, 90 threshold and 10-NN:  0.80
---------
Confusion matrix for 4 PCA dimensions, 90 threshold and 2-NN:
  [[1053   40]
 [  37   82]]
Accuracy for 4 PCA dimensions, 90 threshold an

## Kmeans

In [8]:
# distance from centroid approach
kchoices = [2,3,4,5,6]
best_f1 = -1

# try different values of k and PCA dimension 
for k in kchoices:
    for pca_dim in np.linspace(2,200,50).astype(int):
        X = pca_df_inp.iloc[:,list(range(pca_dim)) + [-1]]
        # one_hot = pd.get_dummies(X['grid_id'], prefix='grid')
        # X = X.join(one_hot)

        kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
        centroids = kmeans.cluster_centers_
        labels = kmeans.labels_

        # Calculate the distance from each point to its cluster centroid
        closest, distances = pairwise_distances_argmin_min(X, centroids[labels])
        # distances = euclidean_distances(data_normalized, centroids[labels])

        # Choose a threshold for flagging an anomaly
        threshold = np.percentile(distances, 95)

        # Anything above the threshold is considered an anomaly
        anomalies = pca_df_inp[distances > threshold]

        kmeans_labels = (distances > threshold).astype(int)
      
        f1 = f1_score(y_train,kmeans_labels,average='macro')
      
        if f1 > best_f1:
            best_acc = accuracy_score(y_train,kmeans_labels)
            best_dim = pca_dim
            best_k = k
            best_recall  = recall_score(y_train,kmeans_labels)
            best_prec = precision_score(y_train,kmeans_labels)
            best_conf_mat = confusion_matrix(y_train,kmeans_labels)
            best_f1 = f1

print(f'Best confusion matrix: \n{best_conf_mat}')
print(f'Best PCA dimension: {best_dim}')
print(f'Best K: {best_k}')
print(f'Best accuracy: {best_acc:.2f}')
print(f'Best precision: {best_prec:.2f}')
print(f'Best recall: {best_recall:.2f}')
print(f'Best f1-score: {best_f1:.2f}')

Best confusion matrix: 
[[1090    3]
 [  61   58]]
Best PCA dimension: 2
Best K: 2
Best accuracy: 0.95
Best precision: 0.95
Best recall: 0.49
Best f1-score: 0.81


## DBScan

In [9]:
pca_df_inp = pca_df.iloc[:,:-2]
epsilons = [5,10,20,30,40,50,60,70,80,90,100]
min_samps = [2,3,4,5,10]

best_acc = -float('inf')
best_dim = -1
best_recall = -1
best_prec = -1
best_f1 = -1

# try different values of epsilon, minimum samples and PCA dimension 
for pca_dim in np.linspace(2,200,100).astype(int):
    for eps in epsilons:
        for ms in min_samps:

            X   = pca_df_inp.iloc[:,:pca_dim]

            dbscan = DBSCAN(eps = eps, min_samples = ms)
            dbscan_labels = dbscan.fit_predict(X)
            dbscan_labels = np.where(dbscan_labels>=0, 0, 1)

            conf_mat   = confusion_matrix(y_train,dbscan_labels)
            acc        = accuracy_score(y_train,dbscan_labels)
            recall     = recall_score(y_train,dbscan_labels)
            prec       = precision_score(y_train,dbscan_labels)
            f1         = f1_score(y_train,dbscan_labels,average='macro')

            if f1 > best_f1:
                best_acc = acc
                best_dim = pca_dim
                best_recall  = recall
                best_prec = prec
                best_dbscan_labels = dbscan_labels
                best_f1 = f1

                print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', 
                      f' {conf_mat}')
                print(f'Accuracy for {pca_dim} PCA dimensions:', 
                      f' {acc:.2f}')
                print(f'Recall for {pca_dim} PCA dimensions:', 
                      f' {recall:.2f}')
                print(f'Precision for {pca_dim} PCA dimensions:', 
                      f' {prec:.2f}')
                print(f'F1 for {pca_dim} PCA dimensions, {eps} eps, and {ms} min_samps :', 
                      f' {f1:.2f}')
                print(f'---------')

Confusion matrix for 2 PCA dimensions:
  [[1093    0]
 [ 115    4]]
Accuracy for 2 PCA dimensions:  0.91
Recall for 2 PCA dimensions:  0.03
Precision for 2 PCA dimensions:  1.00
F1 for 2 PCA dimensions, 5 eps, and 2 min_samps :  0.51
---------
Confusion matrix for 2 PCA dimensions:
  [[1093    0]
 [ 111    8]]
Accuracy for 2 PCA dimensions:  0.91
Recall for 2 PCA dimensions:  0.07
Precision for 2 PCA dimensions:  1.00
F1 for 2 PCA dimensions, 5 eps, and 3 min_samps :  0.54
---------
Confusion matrix for 2 PCA dimensions:
  [[1093    0]
 [ 107   12]]
Accuracy for 2 PCA dimensions:  0.91
Recall for 2 PCA dimensions:  0.10
Precision for 2 PCA dimensions:  1.00
F1 for 2 PCA dimensions, 5 eps, and 5 min_samps :  0.57
---------
Confusion matrix for 2 PCA dimensions:
  [[1093    0]
 [ 101   18]]
Accuracy for 2 PCA dimensions:  0.92
Recall for 2 PCA dimensions:  0.15
Precision for 2 PCA dimensions:  1.00
F1 for 2 PCA dimensions, 5 eps, and 10 min_samps :  0.61
---------
Confusion matrix for 4 

## Isolation Forest

In [10]:
pca_df_inp = pca_df.iloc[:,:-2]
n_estims = [5,10,50,100,150,200]
best_acc = -float('inf')
best_dim = -1
best_recall = -1
best_prec = -1
best_f1 = -1

# try different values of number of estimators and PCA dimension 
for pca_dim in np.linspace(2,200,100).astype(int):
    for nests in n_estims:
            X = pca_df_inp.iloc[:,:pca_dim]
            
            isolation_forest = IsolationForest(n_estimators=nests, 
                                               max_samples= 'auto',
                                               random_state=42)

            if_labels = isolation_forest.fit_predict(X)
            if_labels = np.where(if_labels == -1, 1, 0)

            conf_mat   = confusion_matrix(y_train,if_labels)
            acc        = accuracy_score(y_train,if_labels)
            recall     = recall_score(y_train,if_labels)
            prec       = precision_score(y_train,if_labels)
            f1         = f1_score(y_train,if_labels,average='macro')

            if f1 > best_f1:
                best_acc = acc
                best_dim = pca_dim
                best_recall  = recall
                best_prec = prec
                best_if_labels = if_labels
                best_f1 = f1
                print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', 
                      f' {conf_mat}')
                print(f'Accuracy for {pca_dim} PCA dimensions:', 
                      f' {acc:.2f}')
                print(f'Recall for {pca_dim} PCA dimensions:', 
                      f' {recall:.2f}')
                print(f'Precision for {pca_dim} PCA dimensions:', 
                      f' {prec:.2f}')
                print(f'F1 for {pca_dim} PCA dimensions, {nests} n_estimator', 
                      f' {f1:.2f}')
                print(f'---------')


Confusion matrix for 2 PCA dimensions:
  [[832 261]
 [ 67  52]]
Accuracy for 2 PCA dimensions:  0.73
Recall for 2 PCA dimensions:  0.44
Precision for 2 PCA dimensions:  0.17
F1 for 2 PCA dimensions, 5 n_estimator  0.54
---------
Confusion matrix for 2 PCA dimensions:
  [[910 183]
 [ 74  45]]
Accuracy for 2 PCA dimensions:  0.79
Recall for 2 PCA dimensions:  0.38
Precision for 2 PCA dimensions:  0.20
F1 for 2 PCA dimensions, 10 n_estimator  0.57
---------
Confusion matrix for 2 PCA dimensions:
  [[977 116]
 [  0 119]]
Accuracy for 2 PCA dimensions:  0.90
Recall for 2 PCA dimensions:  1.00
Precision for 2 PCA dimensions:  0.51
F1 for 2 PCA dimensions, 50 n_estimator  0.81
---------
Confusion matrix for 2 PCA dimensions:
  [[980 113]
 [  0 119]]
Accuracy for 2 PCA dimensions:  0.91
Recall for 2 PCA dimensions:  1.00
Precision for 2 PCA dimensions:  0.51
F1 for 2 PCA dimensions, 100 n_estimator  0.81
---------
Confusion matrix for 4 PCA dimensions:
  [[1075   18]
 [  48   71]]
Accuracy for

## Best combined:

In [11]:
best_acc = -float('inf')
best_dim = -1
best_recall = -1
best_prec = -1
best_f1 = -1

# aggregate labels predicted by a randomly selected subset of the models above
for i1 in [0,1]:
    for i2 in [0,1]:
        for i3 in [0,1]:
            for i4 in [0,1]:
                for i5 in [0,1]:
                    for thresh in [0,1]:
                        tot = i1+i2+i3+i4+i5
                        comb_labels = i1*best_if_labels + i2*best_oneclass_labels + i3*best_dbscan_labels + i4*kmeans_labels + i5*best_knn_labels
                        # set label to 1 if the count of 1's in the collective output surpasses half the total number of models in the ensemble
                        comb_labels = np.where(comb_labels > tot//2 + thresh, 1, 0)

                        conf_mat   = confusion_matrix(y_train,comb_labels)
                        acc        = accuracy_score(y_train,comb_labels)
                        recall     = recall_score(y_train,comb_labels)
                        prec       = precision_score(y_train,comb_labels)
                        f1         = f1_score(y_train,comb_labels,average='macro')

                        if f1 > best_f1:
                            best_acc = acc
                            best_dim = pca_dim
                            best_recall  = recall
                            best_prec = prec
                            best_if_labels = if_labels
                            best_f1 = f1
                            print(f'Confusion matrix for ({i1,i2,i3,i4,i5}) included and {thresh}:\n', 
                                  f' {conf_mat}')
                            print(f'Accuracy for ({i1,i2,i3,i4,i5}) included and {thresh}:', 
                                  f' {acc:.2f}')
                            print(f'Recall for ({i1,i2,i3,i4,i5}) included and {thresh}:', 
                                  f' {recall:.2f}')
                            print(f'Precision for ({i1,i2,i3,i4,i5}) included and {thresh}:', 
                                  f' {prec:.2f}')
                            print(f'F1 for ({i1,i2,i3,i4,i5}) included and {thresh}:', 
                                  f' {f1:.2f}')
                            print(f'---------')

Confusion matrix for ((0, 0, 0, 0, 0)) included and 0:
  [[1093    0]
 [ 119    0]]
Accuracy for ((0, 0, 0, 0, 0)) included and 0:  0.90
Recall for ((0, 0, 0, 0, 0)) included and 0:  0.00
Precision for ((0, 0, 0, 0, 0)) included and 0:  0.00
F1 for ((0, 0, 0, 0, 0)) included and 0:  0.47
---------
Confusion matrix for ((0, 0, 0, 0, 1)) included and 0:
  [[1075   18]
 [  15  104]]
Accuracy for ((0, 0, 0, 0, 1)) included and 0:  0.97
Recall for ((0, 0, 0, 0, 1)) included and 0:  0.87
Precision for ((0, 0, 0, 0, 1)) included and 0:  0.85
F1 for ((0, 0, 0, 0, 1)) included and 0:  0.92
---------
Confusion matrix for ((0, 0, 1, 0, 0)) included and 0:
  [[1070   23]
 [   1  118]]
Accuracy for ((0, 0, 1, 0, 0)) included and 0:  0.98
Recall for ((0, 0, 1, 0, 0)) included and 0:  0.99
Precision for ((0, 0, 1, 0, 0)) included and 0:  0.84
F1 for ((0, 0, 1, 0, 0)) included and 0:  0.95
---------


# Testing on Test Data I Using Best Hyperparameters

## Import Test Data

In [12]:
# read the test data of data I from datasets folder
!python3 pca_pipeline_combine_gridid.py 200 '../datasets/test/data_heatmap_test.csv'

pca_test = pd.read_csv('temp/pca_df.csv')
pca_test = pca_test.iloc[:,1:]
y_test = pca_test[['label']].values

Nb components:  200
Data directory:  ../datasets/test/data_heatmap_test.csv


## One Class SVM

In [13]:
# Best Model is with 172 PCA dimensions
pca_dim = 172

X_train = pca_df_inp.iloc[:,:pca_dim]
X_test = pca_test.iloc[:,:pca_dim]

oneclass = OneClassSVM(gamma = 'auto').fit(X_train)
oneclass_labels = oneclass.predict(X_test)
    
oneclass_labels = np.where(oneclass_labels == 1, 0,1)
conf_mat   = confusion_matrix(y_test,oneclass_labels)
acc        = accuracy_score(y_test,oneclass_labels)
recall     = recall_score(y_test,oneclass_labels)
prec       = precision_score(y_test,oneclass_labels)
f1         = f1_score(y_test,oneclass_labels,average='macro')

print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', 
              f' {conf_mat}')
#print(f'Accuracy for {pca_dim} PCA dimensions:', 
#              f' {acc:.2f}')
#print(f'Recall for {pca_dim} PCA dimensions:', 
#              f' {recall:.2f}')
#print(f'Precision for {pca_dim} PCA dimensions:', 
#              f' {prec:.2f}')
print(f'F1 Score for {pca_dim} PCA dimensions:', 
              f' {f1:.2f}')

Confusion matrix for 172 PCA dimensions:
  [[161 309]
 [  0  51]]
F1 Score for 172 PCA dimensions:  0.38


## KNN

In [14]:
# Perform PCA on X_train and X_test with 10 dimensions
PCA = 10
nn = 10
thres = 90

X_train = pca_df_inp.iloc[:, :PCA]  
X_test = pca_test.iloc[:, :PCA] 

# Train k-NN model on X_train_pca with 10 neighbors
knn = NearestNeighbors(n_neighbors=nn, algorithm='auto', metric='euclidean')
knn.fit(X_train)

# Predict anomalies on X_test_pca
distances, indices = knn.kneighbors(X_test)
anomaly_scores = distances.mean(axis=1)
threshold = np.percentile(anomaly_scores, thres)

# Convert labels to binary (0: inliers, 1: outliers)
knn_labels = np.where(anomaly_scores > threshold, 1, 0)

# Evaluate the performance on X_test
conf_mat = confusion_matrix(y_test, knn_labels)
acc = accuracy_score(y_test, knn_labels)
f1 = f1_score(y_test, knn_labels,average='macro')

print(f'Confusion matrix on X_test:\n{conf_mat}')
print(f'Accuracy on X_test: {acc:.2f}')
print(f'F1 Score on X_test: {f1:.2f}')

Confusion matrix on X_test:
[[461   9]
 [  8  43]]
Accuracy on X_test: 0.97
F1 Score on X_test: 0.91


## KMeans

In [15]:
pca = 2
k = 2

X = pca_df_inp.iloc[:,:pca]
kmeans = KMeans(n_clusters = k, random_state=42)
kmeans.fit(X)
kmeans_labels = kmeans.predict(pca_test.iloc[:,:pca])
conf_mat = confusion_matrix(y_test,kmeans_labels)
acc = accuracy_score(y_test,kmeans_labels)
recall = recall_score(y_test,kmeans_labels)
prec = precision_score(y_test,kmeans_labels)
f1 = f1_score(y_test,kmeans_labels,average='macro')
print(conf_mat)
print(f'Test accuracy: {acc:.2f}')
print(f'Test precision: {prec:.2f}')
print(f'Test recall: {recall:.2f}')
print(f'Test f1-score: {f1:.2f}')

[[245 225]
 [ 13  38]]
Test accuracy: 0.54
Test precision: 0.14
Test recall: 0.75
Test f1-score: 0.46


## DBSCAN

In [17]:
PCA = 8
eps = 5
min_sam = 10

# Apply PCA with 8 dimensions
X_train = pca_df_inp.iloc[:, :PCA]  
X_test = pca_test.iloc[:, :PCA] 

# Fit DBSCAN model
dbscan = DBSCAN(eps=eps, min_samples=min_sam)
dbscan_labels_train = dbscan.fit_predict(X_train)
dbscan_labels_test = dbscan.fit_predict(X_test)

# Convert labels to binary (0: inliers, 1: outliers)
dbscan_labels_train = np.where(dbscan_labels_train >= 0, 0, 1)
dbscan_labels_test = np.where(dbscan_labels_test >= 0, 0, 1)

# Evaluate performance on test data
conf_mat_test = confusion_matrix(y_test, dbscan_labels_test)
acc_test = accuracy_score(y_test, dbscan_labels_test)
f1_test = f1_score(y_test, dbscan_labels_test,average='macro')

print("\nPerformance on test data:")
print("Confusion matrix:")
print(conf_mat_test)
print(f"Accuracy: {acc_test:.2f}")
print(f"F1 Score: {f1_test:.2f}")


Performance on test data:
Confusion matrix:
[[430  40]
 [  0  51]]
Accuracy: 0.92
F1 Score: 0.84


## Isolation Forest

In [18]:
# Best Model for Isolation Forest is with 10 PCA and n_estimators = 150
pca_dim = 10
best_n_est = 150
X_train = pca_df_inp.iloc[:,:pca_dim]
X_test = pca_test.iloc[:,:pca_dim]
            
isolation_forest = IsolationForest(n_estimators=best_n_est, 
                                   max_samples='auto', 
                                   random_state=42)

model = isolation_forest.fit(X_train)
if_labels = model.predict(X_test)
if_labels = np.where(if_labels == -1, 1, 0)

conf_mat   = confusion_matrix(y_test,if_labels)
acc        = accuracy_score(y_test,if_labels)
recall     = recall_score(y_test,if_labels)
prec       = precision_score(y_test,if_labels)
f1         = f1_score(y_test, if_labels,average='macro')


print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', 
                      f' {conf_mat}')
print(f'F1 Score for {pca_dim} PCA dimensions with n_estimator = {best_n_est}:', 
                      f' {f1:.2f}')

Confusion matrix for 10 PCA dimensions:
  [[464   6]
 [  8  43]]
F1 Score for 10 PCA dimensions with n_estimator = 150:  0.92


## Best Combine

In [19]:
best_acc = -float('inf')
best_dim = -1
best_recall = -1
best_prec = -1
best_f1 = -1

for i1 in [0,1]:
    for i2 in [0,1]:
        for i3 in [0,1]:
            for i4 in [0,1]:
                for i5 in [0,1]:
                    for thresh in [0,1]:
                        tot = i1+i2+i3+i4+i5
                        comb_labels = i1*if_labels + i2*oneclass_labels + i3*dbscan_labels_test + \
                                        i4*kmeans_labels + i5*knn_labels
                        comb_labels = np.where(comb_labels > tot//2 + thresh, 1, 0)

                        conf_mat   = confusion_matrix(y_test,comb_labels)
                        acc        = accuracy_score(y_test,comb_labels)
                        recall     = recall_score(y_test,comb_labels)
                        prec       = precision_score(y_test,comb_labels)
                        f1         = f1_score(y_test,comb_labels,average='macro')

                        if acc > best_acc:
                            best_acc = acc
                            best_recall  = recall
                            best_prec = prec
                            best_conf_mat = conf_mat
                            best_f1 = f1
print(f'Confusion matrix for ({i1,i2,i3,i4,i5}) included and {thresh}:\n', f' {conf_mat}')
print(f'Best f1 score: {best_f1:.2f}')

Confusion matrix for ((1, 1, 1, 1, 1)) included and 1:
  [[465   5]
 [  6  45]]
Best f1 score: 0.94
