In [43]:
import sys

import pandas as pd
import scipy.stats as stats
import sklearn.neighbors._base
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM

# Expose the private ``sklearn.neighbors._base`` module under the legacy name
# ``sklearn.neighbors.base``. The alias is presumably needed by missingpy, which
# is known to import the legacy path, so it has to be registered BEFORE
# missingpy is imported; otherwise this cell fails on a fresh kernel and only
# succeeds when re-run after the alias already exists.
sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base

from missingpy import MissForest  # noqa: E402 - must follow the alias above

In [44]:
# Candidate datasets: nyc_taxi, ambient_temperature_system_failure, machine_temperature_system_failure, rogue_agent_key_updown


In [45]:
# Read the raw series from disk and preview the first three rows.
df = pd.read_csv(
    "data/rogue_agent_key_updown.csv",
    low_memory=False,
)
df.head(n=3)

Unnamed: 0,timestamp,value
0,2014-07-06 20:10:00,1.047256
1,2014-07-06 20:15:00,1.848031
2,2014-07-06 20:20:00,1.297892


In [46]:
# Labelled anomaly windows, one [start, end] pair of timestamp strings per
# window. The labelling cell treats both ends as inclusive.
anomaly_points = [
    ["2014-07-14 17:00:00.000000", "2014-07-15 15:00:00.000000"],
    ["2014-07-16 21:50:00.000000", "2014-07-17 19:50:00.000000"],
]

In [47]:
# Parse the timestamps so they can be compared against the window bounds.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Ground-truth label: 1 inside any anomaly window (bounds inclusive), else 0.
df['anomaly'] = 0
for window_start, window_end in anomaly_points:
    in_window = df['timestamp'].between(window_start, window_end)
    df.loc[in_window, 'anomaly'] = 1

In [48]:
# Promote the timestamp column to the index (removing it from the columns),
# then display the rows labelled as anomalous.
df = df.set_index('timestamp')
df.loc[df['anomaly'].eq(1)]

Unnamed: 0_level_0,value,anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-07-14 17:00:00,0.000000,1
2014-07-14 17:05:00,0.000000,1
2014-07-14 17:10:00,4.805612,1
2014-07-14 17:15:00,1.546595,1
2014-07-14 17:20:00,1.211436,1
...,...,...
2014-07-17 19:30:00,0.000000,1
2014-07-17 19:35:00,0.000000,1
2014-07-17 19:40:00,0.000000,1
2014-07-17 19:45:00,0.000000,1


In [49]:
# Baseline: fit a One-Class SVM on the raw values (as a single-feature column)
# and score its outlier flags against the ground-truth labels.
ocsvm_model = OneClassSVM(kernel='rbf', nu=0.2, gamma=0.001)
raw_values = df['value'].to_numpy().reshape(-1, 1)
ocsvm_ret = ocsvm_model.fit_predict(raw_values)

# fit_predict marks outliers with -1; store them as 1 (anomaly) / 0 (normal).
ocsvm_df = pd.DataFrame({'value': df['value']})
ocsvm_df['anomaly'] = (ocsvm_ret == -1).astype('int64')

ocsvm_f1 = f1_score(df['anomaly'], ocsvm_df['anomaly'])
print(f'One-Class SVM F1 Score : {ocsvm_f1}')

One-Class SVM F1 Score : 0.1879120879120879


In [50]:
import numpy as np

# Work on a copy of the detector output and remember its index for later cells.
miss_data = ocsvm_df.copy()
timestamp = miss_data.index

# Blank out every value the detector flagged so it can be imputed afterwards,
# then display the affected rows.
flagged = miss_data["anomaly"].eq(1)
miss_data.loc[flagged, "value"] = np.nan
miss_data.loc[flagged]

Unnamed: 0_level_0,value,anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-07-06 20:35:00,,1
2014-07-06 20:40:00,,1
2014-07-06 20:45:00,,1
2014-07-06 20:50:00,,1
2014-07-06 21:00:00,,1
...,...,...
2014-07-25 08:25:00,,1
2014-07-25 08:35:00,,1
2014-07-25 08:40:00,,1
2014-07-25 08:45:00,,1


In [51]:
# Impute the blanked-out values by linear interpolation; limit_direction="both"
# also fills gaps at the very start and end of the series.
X_filled = miss_data.interpolate(method="linear", limit_direction="both")
X_filled.index = timestamp

# Original and imputed values side by side (presumably kept for a later error
# metric; no metric is computed in this cell).
y = pd.DataFrame(
    {
        'y_true': df['value'],
        'y_pred': X_filled['value'],
    }
)
X_filled['value']

timestamp
2014-07-06 20:10:00    1.047256
2014-07-06 20:15:00    1.848031
2014-07-06 20:20:00    1.297892
2014-07-06 20:25:00    1.593151
2014-07-06 20:35:00    1.454461
                         ...   
2014-07-25 08:35:00    1.093674
2014-07-25 08:40:00    1.194987
2014-07-25 08:45:00    1.296301
2014-07-25 08:50:00    1.397614
2014-07-25 08:55:00    1.397614
Name: value, Length: 5315, dtype: float64

In [52]:
# Refit the One-Class SVM on the imputed series and score it against the same
# ground-truth labels. This rebinds ocsvm_model / ocsvm_ret / ocsvm_df /
# ocsvm_f1, so from here on they refer to the post-imputation run.
ocsvm_model = OneClassSVM(kernel='rbf', nu=0.2, gamma=0.001)
filled_values = X_filled['value'].to_numpy().reshape(-1, 1)
ocsvm_ret = ocsvm_model.fit_predict(filled_values)

# fit_predict marks outliers with -1; store them as 1 (anomaly) / 0 (normal).
ocsvm_df = pd.DataFrame({'value': X_filled['value']})
ocsvm_df['anomaly'] = (ocsvm_ret == -1).astype('int64')

ocsvm_f1 = f1_score(df['anomaly'], ocsvm_df['anomaly'])
print(f'One-Class SVM F1 Score : {ocsvm_f1}')

One-Class SVM F1 Score : 0.08286252354048965
