## Unsupervised Anomaly Detection
: Before taking semi-supervised learning(SSL), using classical(*shallow*) methods for unsupervised model(One-class SVM and IsolationForest)

In [1]:
# # for colab
# from google.colab import drive
# drive.mount('/content/Mydrive')
# %cd "/content/Mydrive/MyDrive/Github/dacon_FraudDetection"

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [3]:
# Data Load
df_train = pd.read_csv("dataset/train.csv", index_col = 0)
df_val = pd.read_csv("dataset/val.csv", index_col = 0)
df_test = pd.read_csv("dataset/test.csv", index_col = 0)
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(113842, 30)
(28462, 31)
(142503, 30)


In [4]:
# StandardScaler
scaler = StandardScaler()                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
train = scaler.fit_transform(df_train)
valid = scaler.fit_transform(df_val.iloc[:,:30])
valid_label = df_val['Class']
# label 변환
valid_label = valid_label.replace(1, -1)
valid_label = valid_label.replace(0, 1)
valid_label.value_counts() # normal : 1, anomaly : -1

 1    28432
-1       30
Name: Class, dtype: int64

### Gridsearch with each model
기존 GridSearchCV 모듈은 unsupervised setting에 적합하지 않으므로(Cross validation이 불가능하므로) labeled data를 이용하여 f1_score을 계산하는 방식 고안 필요

In [4]:
# gridsearch
import itertools
from sklearn.metrics import f1_score

def gridsearch(model, param_grid, x_test, y_test):
    '''
    model : scikit-learn model name(str),
    param_grid : dict form of grid parameters
    x_test, y_test : scoring data and label
    '''
    keys, values = zip(*param_grid.items())
    permutations_dicts = [dict(zip(keys,v)) for v in itertools.product(*values)]
    
    res = [] # 결과 저장할 double list
    for i in range(len(permutations_dicts)):
      param = permutations_dicts[i]
      model_p = globals()[model](**param)
      model_p.fit(train)
      predict_label = model_p.predict(x_test)
      score = f1_score(y_true=y_test, y_pred=predict_label, average='macro') # macro f1 score
      res.append(list(param.values())+[score.round(3)])
      print(f"{i}th search completed!")

    df = pd.DataFrame(res, columns=list(param_grid.keys())+['score']) # to dataframe
    
    return df # return result dataframe

Isolation Forest : with gridsearch best score at 0.6

In [None]:
# IsolationForest
param_grid = {
    'contamination' : np.logspace(-1,-5,num = 9),
    'verbose' : [1]
}
res_forest = gridsearch('IsolationForest', param_grid, valid, valid_label)
res_forest.to_csv('res_forest.csv')

In [3]:
pd.read_csv('res_forest.csv')

Unnamed: 0.1,Unnamed: 0,contamination,verbose,score
0,0,0.1,1,0.482
1,1,0.031623,1,0.515
2,2,0.01,1,0.558
3,3,0.003162,1,0.597
4,4,0.001,1,0.598
5,5,0.000316,1,0.581
6,6,0.0001,1,0.5
7,7,3.2e-05,1,0.5
8,8,1e-05,1,0.5


OneClassSVM
: too slow to train(because of too large dataset)
- Thus randomize train data(sampling)
  
> Parameters</br>
> - $\nu$ : user-specified parameter controlling the proportion of outliers and the proportion of support vectors

In [47]:
# OneclassSVM with random sample
from sklearn.svm import OneClassSVM
from sklearn.utils import resample

train_resample = resample(train, n_samples=100000, replace=True, random_state=100)

ocsvm = OneClassSVM(nu=0.01, kernel='rbf', verbose=10)
ocsvm.fit(train_resample)

[LibSVM]

In [48]:
# predict avg. score

res_label = pd.Series(ocsvm.predict(train_resample))
anomaly_idx = res_label[res_label == -1].index.tolist()

print(f"Average predict score for Normals : {np.average(ocsvm.score_samples(np.delete(train_resample, anomaly_idx, axis=0))).round(4)}")
print(f"Average predict score for Anomalies : {np.average(ocsvm.score_samples(train_resample[anomaly_idx])).round(4)}")

Average predict score for Normals : 5.3468
Average predict score for Anomalies : 2.3974


In [49]:
# Test score
y_pred = ocsvm.predict(valid)

print(f"Predict score : {f1_score(valid_label,y_pred, average='macro')}")

Predict score : 0.5307971627348569
