In [160]:
import sys

import pandas as pd
import scipy.stats as stats
import sklearn.neighbors._base

# Compatibility alias: expose the private module `sklearn.neighbors._base`
# under the legacy name `sklearn.neighbors.base`. It has to be registered
# BEFORE `missingpy` is imported, otherwise a fresh kernel cannot benefit from
# it (NOTE(review): missingpy is presumed to import the legacy path - confirm
# against the installed missingpy version).
sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base

from missingpy import MissForest
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM

In [161]:
# nyc_taxi, ambient_temperature_system_failure, machine_temperature_system_failure, rogue_agent_key_updown


In [162]:
# Read the NYC taxi series from disk and preview the first three rows.
df = pd.read_csv(
    "data/nyc_taxi.csv",
    low_memory=False,
)
df.head(n=3)

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210


In [163]:
# Labelled anomaly windows, one [start, end] pair of timestamp strings per row.
anomaly_points = [
    ["2014-10-30 15:30:00.000000", "2014-11-03 22:30:00.000000"],
    ["2014-11-25 12:00:00.000000", "2014-11-29 19:00:00.000000"],
    ["2014-12-23 11:30:00.000000", "2014-12-27 18:30:00.000000"],
    ["2014-12-29 21:30:00.000000", "2015-01-03 04:30:00.000000"],
    ["2015-01-24 20:30:00.000000", "2015-01-29 03:30:00.000000"],
]

In [164]:
# Parse the timestamp strings so they can be compared with the window bounds.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Ground-truth label: 1 for rows inside any anomaly window (both bounds
# inclusive), 0 for every other row.
df['anomaly'] = 0
for window_start, window_end in anomaly_points:
    inside_window = df['timestamp'].between(window_start, window_end)
    df.loc[inside_window, 'anomaly'] = 1

In [165]:
# Move the timestamp column into the index, then list the rows labelled anomalous.
df = df.set_index('timestamp')
df[df['anomaly'].eq(1)]

Unnamed: 0_level_0,value,anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-10-30 15:30:00,16749,1
2014-10-30 16:00:00,14604,1
2014-10-30 16:30:00,13367,1
2014-10-30 17:00:00,16382,1
2014-10-30 17:30:00,19879,1
...,...,...
2015-01-29 01:30:00,4342,1
2015-01-29 02:00:00,3604,1
2015-01-29 02:30:00,2822,1
2015-01-29 03:00:00,2379,1


In [166]:
# Baseline detector: fit a One-Class SVM on the observed values and compare
# its outlier flags with the ground-truth labels.
ocsvm_model = OneClassSVM(nu=0.2, gamma=0.001, kernel='rbf')
ocsvm_ret = ocsvm_model.fit_predict(df[['value']].to_numpy())

# fit_predict marks outliers with -1; store them as 1 (anomaly) / 0 (normal).
ocsvm_df = df[['value']].copy()
ocsvm_df['anomaly'] = (ocsvm_ret == -1).astype('int64')

ocsvm_f1 = f1_score(df['anomaly'], ocsvm_df['anomaly'])
print(f'One-Class SVM F1 Score : {ocsvm_f1}')

One-Class SVM F1 Score : 0.17431018078020935


In [167]:
import numpy as np

np.random.seed(1234)

# Blank out cells uniformly at random: every cell whose random draw falls
# below 0.5 becomes NaN. The mask has the full shape of `df`, so it applies to
# the `anomaly` column as well as to `value`.
# (An earlier variant that blanked only the rows flagged by the One-Class SVM
# is no longer used.)
hide_cell = np.random.random(df.shape) < .5
miss_data = df.mask(hide_cell)
miss_data.loc[miss_data["anomaly"] == 1]

Unnamed: 0_level_0,value,anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-10-30 17:00:00,,1.0
2014-10-30 17:30:00,,1.0
2014-10-30 18:00:00,,1.0
2014-10-30 19:00:00,,1.0
2014-10-30 20:00:00,,1.0
...,...,...
2015-01-28 23:30:00,12535.0,1.0
2015-01-29 00:00:00,10134.0,1.0
2015-01-29 01:00:00,5619.0,1.0
2015-01-29 03:00:00,2379.0,1.0


In [168]:
def RMSE(original, filled):
    """Return the root-mean-squared error between `original` and `filled`.

    Computed as the square root of scikit-learn's `mean_squared_error`.
    """
    from sklearn.metrics import mean_squared_error as _mse

    return np.sqrt(_mse(original, filled))


def MAE(original, filled):
    """Return the mean absolute error between `original` and `filled`.

    Thin wrapper around scikit-learn's `mean_absolute_error`.
    """
    from sklearn.metrics import mean_absolute_error as _mae

    return _mae(original, filled)


def MAPE(original, filled):
    """Return the mean absolute percentage error between `original` and `filled`.

    Thin wrapper around scikit-learn's `mean_absolute_percentage_error`.
    """
    from sklearn.metrics import mean_absolute_percentage_error as _mape

    return _mape(original, filled)

def metric_calc(y):
    """Print and return RMSE, MAE and MAPE for min-max scaled true/predicted values.

    Parameters
    ----------
    y : pandas.DataFrame
        Frame with the columns ``y_true`` and ``y_pred``. It is read only; the
        scaled values are kept in a local array instead of being written back
        into the caller's frame.

    Returns
    -------
    dict
        ``{"rmse": ..., "mae": ..., "mape": ...}`` computed on the scaled
        values. The same three numbers are also printed, one per line.
    """
    from sklearn.preprocessing import MinMaxScaler

    # NOTE(review): MinMaxScaler is fitted on both columns at once, which
    # presumably scales y_true and y_pred each by its own min/max rather than
    # by a shared range - confirm this is the intended normalisation.
    scaled = MinMaxScaler().fit_transform(y[['y_true', 'y_pred']])
    y_true = scaled[:, 0]
    y_pred = scaled[:, 1]

    rmse = RMSE(y_true, y_pred)
    print("RMSE=", rmse)

    mae = MAE(y_true, y_pred)
    print("MAE=", mae)

    mape = MAPE(y_true, y_pred)
    print("MAPE=", mape)

    return {"rmse": rmse, "mae": mae, "mape": mape}

In [169]:
# Fill the masked cells by linear interpolation; limit_direction="both" also
# fills gaps at the very start and end of the series.
X_filled = miss_data.interpolate(method="linear", limit_direction="both")

# interpolate() keeps miss_data's index, so no re-indexing is needed. The
# previous `X_filled.index = timestamp` used a name that no executed cell
# above defines, which raises NameError on a fresh kernel.

# Compare the original values with the imputed ones.
y = pd.DataFrame({'y_true': df['value'], 'y_pred': X_filled['value']})

# Keep the scores under their own name instead of rebinding `y` to the
# function's return value.
metrics = metric_calc(y)

RMSE= 0.12375458654971278
MAE= 0.11265239910821263
MAPE= 0.32629942325181976


In [170]:
# Repeat the One-Class SVM on the interpolated series and score its outlier
# flags against the same ground-truth labels.
ocsvm_model = OneClassSVM(nu=0.2, gamma=0.001, kernel='rbf')
ocsvm_ret = ocsvm_model.fit_predict(X_filled[['value']].to_numpy())

# fit_predict marks outliers with -1; store them as 1 (anomaly) / 0 (normal).
ocsvm_df = X_filled[['value']].copy()
ocsvm_df['anomaly'] = (ocsvm_ret == -1).astype('int64')

ocsvm_f1 = f1_score(df['anomaly'], ocsvm_df['anomaly'])
print(f'One-Class SVM F1 Score : {ocsvm_f1}')

One-Class SVM F1 Score : 0.17509800261340303
