In [16]:

import pandas as pd
import scipy.stats as stats
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score
from missingpy import MissForest
import sys
import sklearn.neighbors._base

sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base
from sklearn.impute import SimpleImputer, KNNImputer

In [17]:
df = pd.read_csv("data/nyc_taxi.csv",low_memory=False)
df.head(3)

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210


# Preprocessing

In [18]:
anomaly_points = [
        [
            "2014-10-30 15:30:00.000000",
            "2014-11-03 22:30:00.000000"
        ],
        [
            "2014-11-25 12:00:00.000000",
            "2014-11-29 19:00:00.000000"
        ],
        [
            "2014-12-23 11:30:00.000000",
            "2014-12-27 18:30:00.000000"
        ],
        [
            "2014-12-29 21:30:00.000000",
            "2015-01-03 04:30:00.000000"
        ],
        [
            "2015-01-24 20:30:00.000000",
            "2015-01-29 03:30:00.000000"
        ]
]

In [19]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
#is anomaly? : True => 1, False => 0
df['anomaly'] = 0
for start, end in anomaly_points:
    df.loc[((df['timestamp'] >= start) & (df['timestamp'] <= end)), 'anomaly'] = 1

In [20]:
df['year'] = df['timestamp'].apply(lambda x : x.year)
df['month'] = df['timestamp'].apply(lambda x : x.month)
df['day'] = df['timestamp'].apply(lambda x : x.day)
df['hour'] = df['timestamp'].apply(lambda x : x.hour)
df['minute'] = df['timestamp'].apply(lambda x : x.minute)

In [21]:
df.index = df['timestamp']
df.drop(['timestamp'], axis=1, inplace=True)
df.loc[df['anomaly'] == 1]

Unnamed: 0_level_0,value,anomaly,year,month,day,hour,minute
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-10-30 15:30:00,16749,1,2014,10,30,15,30
2014-10-30 16:00:00,14604,1,2014,10,30,16,0
2014-10-30 16:30:00,13367,1,2014,10,30,16,30
2014-10-30 17:00:00,16382,1,2014,10,30,17,0
2014-10-30 17:30:00,19879,1,2014,10,30,17,30
...,...,...,...,...,...,...,...
2015-01-29 01:30:00,4342,1,2015,1,29,1,30
2015-01-29 02:00:00,3604,1,2015,1,29,2,0
2015-01-29 02:30:00,2822,1,2015,1,29,2,30
2015-01-29 03:00:00,2379,1,2015,1,29,3,0


In [22]:
hotelling_df = pd.DataFrame()
hotelling_df['value'] = df['value']
mean = hotelling_df['value'].mean()
std = hotelling_df['value'].std()
hotelling_df['anomaly_score'] = [((x - mean)/std) ** 2 for x in hotelling_df['value']]
hotelling_df['anomaly_threshold'] = stats.chi2.ppf(q=0.8, df=1)
hotelling_df['anomaly']  = hotelling_df.apply(lambda x : 1 if x['anomaly_score'] > x['anomaly_threshold'] else 0, axis=1)
hotelling_f1 = f1_score(df['anomaly'], hotelling_df['anomaly'])
print(f'Hotelling\'s T2 F1 Score : {hotelling_f1}')

Hotelling's T2 F1 Score : 0.15788065293015788


In [23]:
ocsvm_model = OneClassSVM(nu=0.2, gamma=0.001, kernel='rbf')
ocsvm_ret = ocsvm_model.fit_predict(df['value'].values.reshape(-1, 1))
ocsvm_df = pd.DataFrame()
ocsvm_df['value'] = df['value']
ocsvm_df['anomaly']  = [1 if i==-1 else 0 for i in ocsvm_ret]
ocsvm_f1 = f1_score(df['anomaly'], ocsvm_df['anomaly'])
print(f'One-Class SVM F1 Score : {ocsvm_f1}')

One-Class SVM F1 Score : 0.17431018078020935


In [24]:
import numpy as np
miss_data = df.copy()
timestamp = miss_data.index
miss_data.loc[miss_data["anomaly"] == 1, "value"] = np.nan
miss_data.loc[miss_data["anomaly"] == 1]

Unnamed: 0_level_0,value,anomaly,year,month,day,hour,minute
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-10-30 15:30:00,,1,2014,10,30,15,30
2014-10-30 16:00:00,,1,2014,10,30,16,0
2014-10-30 16:30:00,,1,2014,10,30,16,30
2014-10-30 17:00:00,,1,2014,10,30,17,0
2014-10-30 17:30:00,,1,2014,10,30,17,30
...,...,...,...,...,...,...,...
2015-01-29 01:30:00,,1,2015,1,29,1,30
2015-01-29 02:00:00,,1,2015,1,29,2,0
2015-01-29 02:30:00,,1,2015,1,29,2,30
2015-01-29 03:00:00,,1,2015,1,29,3,0


In [36]:
def RMSE(original, filled):
    from sklearn.metrics import mean_squared_error

    score = np.sqrt(mean_squared_error(original, filled))
    return score


def MAE(original, filled):
    from sklearn.metrics import mean_absolute_error

    score = mean_absolute_error(original, filled)
    return score


def MAPE(original, filled):
    from sklearn.metrics import mean_absolute_percentage_error

    score = mean_absolute_percentage_error(original, filled)
    return score

def metric_calc(X_filled, complete_data):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaled_complete_data = scaler.fit_transform(complete_data)
    scaled_X_filled = scaler.fit_transform(X_filled)

    rmse = RMSE(scaled_complete_data, scaled_X_filled)
    print("RMSE=", rmse)

    mae = MAE(scaled_complete_data, scaled_X_filled)
    print("MAE=", mae)

    mape = MAPE(scaled_complete_data, scaled_X_filled)
    print("MAPE=", mape)

In [37]:
mf = MissForest()

X_filled = mf.fit_transform(miss_data)
X_filled = pd.DataFrame(X_filled, columns = ['value', 'anomaly', 'year', 'month', 'day', 'hour', 'minute'])
X_filled.index = timestamp
# print(X_filled)

metric_calc(df, X_filled)



Iteration: 0




Iteration: 1




Iteration: 2




Iteration: 3
RMSE= 0.04624124945361327
MAE= 0.014487455730270774
MAPE= 2263720249.576943


In [38]:
X_filled = pd.DataFrame(miss_data).interpolate(
            method="linear", limit_direction="both"
        )
X_filled.index = timestamp
# print(X_filled)

metric_calc(df, X_filled)

RMSE= 0.051155298806877775
MAE= 0.015499841580911335
MAPE= 2263720249.579615


In [39]:
imp = KNNImputer()
X_filled = imp.fit_transform(miss_data)
X_filled = pd.DataFrame(X_filled, columns = ['value', 'anomaly', 'year', 'month', 'day', 'hour', 'minute'])
X_filled.index = timestamp
X_filled

metric_calc(df, X_filled)

RMSE= 0.04646888031191886
MAE= 0.014562755049574308
MAPE= 2263720249.5771685
