In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Caution: the filter below silences ALL warnings for the rest of the session; remove or narrow it when debugging.
import warnings
warnings.filterwarnings('ignore')

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures


from sklearn.linear_model import LogisticRegression


from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

from sklearn.ensemble import IsolationForest


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, confusion_matrix, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, adjusted_rand_score, precision_recall_curve, precision_score, f1_score

from sklearn.svm import OneClassSVM

from scipy.stats import uniform, chisquare, binomtest

from itertools import product

from IPython.display import clear_output


# GRID ID = 333346

In [2]:
# Train Data
!python3 pca_pipeline.py 333346 200 '../datasets/train/data_heatmap_train.csv'
grid_id = 333346

import pandas as pd
pca_df = pd.read_csv('temp/pca_df.csv')
normalized_input = pd.read_csv('temp/normalized_input.csv')
#X_train = normalized_input.iloc[:,2:-1]
#y_train = normalized_input.iloc[:,-1]
pca_df = pca_df.iloc[:,1:]
pca_df_inp = pca_df.iloc[:,:-2]

Grid ID:  333346
Nb components:  200
Data directory:  ../datasets/train/data_heatmap_train.csv


In [3]:
# Test Data
!python3 pca_pipeline.py 333346 200 '../datasets/test/data_heatmap_test.csv'
grid_id = 333346

import pandas as pd
pca_df_test = pd.read_csv('temp/pca_df.csv')
normalized_input_test = pd.read_csv('temp/normalized_input.csv')
#X_train = normalized_input.iloc[:,2:-1]
#y_train = normalized_input.iloc[:,-1]

pca_df_test = pca_df_test.iloc[:,1:]
pca_df_inp_test = pca_df_test.iloc[:,:-2]

Grid ID:  333346
Nb components:  200
Data directory:  ../datasets/test/data_heatmap_test.csv


In [4]:
# Actual (ground-truth) labels of the test set.
# Selecting with a list of columns yields a DataFrame, so `y` is a 2-D
# column array of shape (n_samples, 1), not a flat vector.
y = pca_df_test.loc[:, ['label']].to_numpy()

## One Class SVM

In [5]:
# One-Class SVM on the leading PCA components. 85 dimensions is recorded as the
# best setting (the selection run itself is not part of this cell).
pca_dim = 85

# Keep only the first `pca_dim` PCA columns of the train / test inputs.
X_train = pca_df_inp.iloc[:, :pca_dim]
X_test = pca_df_inp_test.iloc[:, :pca_dim]

# The estimator is fitted on the training inputs only; no labels are passed.
oneclass = OneClassSVM(gamma='auto')
oneclass.fit(X_train)

# predict() returns +1 for inliers and -1 for outliers; recode so that inliers
# become 0 and everything else 1, the value the metrics below treat as positive
# by default (assumes `y` encodes anomalies as 1 -- TODO confirm).
oneclass_labels = np.where(oneclass.predict(X_test) == 1, 0, 1)

conf_mat = confusion_matrix(y, oneclass_labels)
# Accuracy, recall and precision are computed for inspection but not printed.
acc, recall, prec, f1 = (
    score(y, oneclass_labels)
    for score in (accuracy_score, recall_score, precision_score, f1_score)
)

print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', f' {conf_mat}')
print(f'F1 Score for {pca_dim} PCA dimensions:', f' {f1:.2f}')

Confusion matrix for 85 PCA dimensions:
  [[ 67 158]
 [  0  34]]
F1 Score for 85 PCA dimensions:  0.30


## Isolation Forest

In [6]:
# Isolation Forest: the configuration recorded as best for this grid uses the
# first 6 PCA components and 50 trees.
best_n_est = 50
pca_dim = 6

# Keep only the first `pca_dim` PCA columns of the train / test inputs.
X_train = pca_df_inp.iloc[:, :pca_dim]
X_test = pca_df_inp_test.iloc[:, :pca_dim]

isolation_forest = IsolationForest(
    n_estimators=best_n_est,
    max_samples='auto',
    random_state=42,  # fixed seed so repeated runs build the same forest
)

# fit() returns the estimator itself, so `model` and `isolation_forest`
# name the same fitted object.
model = isolation_forest.fit(X_train)

# predict() marks outliers with -1; recode so outliers are 1 and the rest 0.
if_labels = np.where(model.predict(X_test) == -1, 1, 0)

conf_mat = confusion_matrix(y, if_labels)
# Accuracy, recall and precision are computed for inspection but not printed.
acc, recall, prec, f1 = (
    score(y, if_labels)
    for score in (accuracy_score, recall_score, precision_score, f1_score)
)

print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', f' {conf_mat}')
print(f'F1 Score for {pca_dim} PCA dimensions with n_estimator = {best_n_est}:', f' {f1:.2f}')
print('---------')

Confusion matrix for 6 PCA dimensions:
  [[223   2]
 [  1  33]]
F1 Score for 6 PCA dimensions with n_estimator = 50:  0.96
---------


# GRID ID = 333519

In [7]:
!python3 pca_pipeline.py 333519 200 '../datasets/train/data_heatmap_train.csv'
grid_id = 333519

import pandas as pd
pca_df = pd.read_csv('temp/pca_df.csv')
normalized_input = pd.read_csv('temp/normalized_input.csv')
# X_train = normalized_input.iloc[:,2:-1]
# y_train = normalized_input.iloc[:,-1]

pca_df = pca_df.iloc[:,1:]
pca_df_inp = pca_df.iloc[:,:-2]

Grid ID:  333519
Nb components:  200
Data directory:  ../datasets/train/data_heatmap_train.csv


In [9]:
# Test Data
!python3 pca_pipeline.py 333519 200 '../datasets/test/data_heatmap_test.csv'
grid_id = 333519

import pandas as pd
pca_df_test = pd.read_csv('temp/pca_df.csv')
normalized_input_test = pd.read_csv('temp/normalized_input.csv')
#X_train = normalized_input.iloc[:,2:-1]
#y_train = normalized_input.iloc[:,-1]

pca_df_test = pca_df_test.iloc[:,1:]
pca_df_inp_test = pca_df_test.iloc[:,:-2]

Grid ID:  333519
Nb components:  200
Data directory:  ../datasets/test/data_heatmap_test.csv


In [11]:
# Actual (ground-truth) labels of the test set.
# Selecting with a list of columns yields a DataFrame, so `y` is a 2-D
# column array of shape (n_samples, 1), not a flat vector.
y = pca_df_test.loc[:, ['label']].to_numpy()

## One Class SVM

In [12]:
# One-Class SVM on the leading PCA components, run here with 85 dimensions.
# NOTE(review): the previous header comments named both 163 and 85 as the best
# dimension count for this grid; the code uses 85 -- confirm which is intended.
pca_dim = 85

# Keep only the first `pca_dim` PCA columns of the train / test inputs.
X_train = pca_df_inp.iloc[:, :pca_dim]
X_test = pca_df_inp_test.iloc[:, :pca_dim]

# The estimator is fitted on the training inputs only; no labels are passed.
oneclass = OneClassSVM(gamma='auto')
oneclass.fit(X_train)

# predict() returns +1 for inliers and -1 for outliers; recode so that inliers
# become 0 and everything else 1, the value the metrics below treat as positive
# by default (assumes `y` encodes anomalies as 1 -- TODO confirm).
oneclass_labels = np.where(oneclass.predict(X_test) == 1, 0, 1)

conf_mat = confusion_matrix(y, oneclass_labels)
# Accuracy, recall and precision are computed for inspection but not printed.
acc, recall, prec, f1 = (
    score(y, oneclass_labels)
    for score in (accuracy_score, recall_score, precision_score, f1_score)
)

print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', f' {conf_mat}')
print(f'F1 Score for {pca_dim} PCA dimensions:', f' {f1:.2f}')

Confusion matrix for 85 PCA dimensions:
  [[ 61 184]
 [  0  17]]
F1 Score for 85 PCA dimensions:  0.16


## Isolation Forest

In [13]:
# Isolation Forest: the configuration recorded as best for this grid uses the
# first 10 PCA components and 100 trees.
best_n_est = 100
pca_dim = 10

# Keep only the first `pca_dim` PCA columns of the train / test inputs.
X_train = pca_df_inp.iloc[:, :pca_dim]
X_test = pca_df_inp_test.iloc[:, :pca_dim]

isolation_forest = IsolationForest(
    n_estimators=best_n_est,
    max_samples='auto',
    random_state=42,  # fixed seed so repeated runs build the same forest
)

# fit() returns the estimator itself, so `model` and `isolation_forest`
# name the same fitted object.
model = isolation_forest.fit(X_train)

# predict() marks outliers with -1; recode so outliers are 1 and the rest 0.
if_labels = np.where(model.predict(X_test) == -1, 1, 0)

conf_mat = confusion_matrix(y, if_labels)
# Accuracy, recall and precision are computed for inspection but not printed.
acc, recall, prec, f1 = (
    score(y, if_labels)
    for score in (accuracy_score, recall_score, precision_score, f1_score)
)

print(f'Confusion matrix for {pca_dim} PCA dimensions:\n', f' {conf_mat}')
print(f'F1 Score for {pca_dim} PCA dimensions with n_estimator = {best_n_est}:', f' {f1:.2f}')
print('---------')

Confusion matrix for 10 PCA dimensions:
  [[225  20]
 [  1  16]]
F1 Score for 10 PCA dimensions with n_estimator = 100:  0.60
---------
