In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/nyc_taxi.csv",low_memory=False)
df.head(3)

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210


# Preprocessing

Labeling the dataset.

In [3]:
anomaly_points = [
        [
            "2014-10-30 15:30:00.000000",
            "2014-11-03 22:30:00.000000"
        ],
        [
            "2014-11-25 12:00:00.000000",
            "2014-11-29 19:00:00.000000"
        ],
        [
            "2014-12-23 11:30:00.000000",
            "2014-12-27 18:30:00.000000"
        ],
        [
            "2014-12-29 21:30:00.000000",
            "2015-01-03 04:30:00.000000"
        ],
        [
            "2015-01-24 20:30:00.000000",
            "2015-01-29 03:30:00.000000"
        ]
]

In [4]:
# Convert the timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Labeling: if anomaly then 1 else 0
df['anomaly'] = 0  # Set default values
for start, end in anomaly_points:
    df.loc[((df['timestamp'] >= start) & (df['timestamp'] <= end)), 'anomaly'] = 1

Expand the timestamp.

In [5]:
df['year'] = df['timestamp'].apply(lambda x : x.year)
df['month'] = df['timestamp'].apply(lambda x : x.month)
df['day'] = df['timestamp'].apply(lambda x : x.day)
df['hour'] = df['timestamp'].apply(lambda x : x.hour)
df['minute'] = df['timestamp'].apply(lambda x : x.minute)

Set the 'timestamp' as index.

In [6]:
#df.index = df['timestamp']
df.drop(['timestamp'], axis=1, inplace=True)

Reorder the columns.

In [7]:
df = df[['year', 'month', 'day', 'hour', 'minute', 'value', 'anomaly']]

# Train Test Split

Clone the dataframe.

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['anomaly'])

In [10]:
df['anomaly'].value_counts()

anomaly
0    9285
1    1035
Name: count, dtype: int64

In [11]:
df_train['anomaly'].value_counts()

anomaly
0    7428
1     828
Name: count, dtype: int64

In [12]:
df_test['anomaly'].value_counts()

anomaly
0    1857
1     207
Name: count, dtype: int64

In [13]:
del df

# Testing

In [14]:
df_train_ori = df_train.copy(deep=True)

In [15]:
# replace value in 10% of rows with NaN
random_indices = np.random.choice(df_train.index, size=round(df_train.shape[0]*0.1), replace=False)
df_train.loc[random_indices, 'value'] = np.NAN

# replace value for all anomaly rows with NaN
df_train.loc[df_train['anomaly'] == 1, 'value'] = np.NAN


In [16]:
df_train[(df_train['anomaly'] == 0) & (df_train['value'].isna())]

Unnamed: 0,year,month,day,hour,minute,value,anomaly
8005,2014,12,14,18,30,,0
3823,2014,9,18,15,30,,0
6570,2014,11,14,21,0,,0
2408,2014,8,20,4,0,,0
5746,2014,10,28,17,0,,0
...,...,...,...,...,...,...,...
6573,2014,11,14,22,30,,0
9044,2015,1,5,10,0,,0
5024,2014,10,13,16,0,,0
3036,2014,9,2,6,0,,0


In [17]:
df_train_ori[(df_train['anomaly'] == 0) & (df_train['value'].isna())]

Unnamed: 0,year,month,day,hour,minute,value,anomaly
8005,2014,12,14,18,30,19533,0
3823,2014,9,18,15,30,15579,0
6570,2014,11,14,21,0,26301,0
2408,2014,8,20,4,0,2733,0
5746,2014,10,28,17,0,16881,0
...,...,...,...,...,...,...,...
6573,2014,11,14,22,30,26834,0
9044,2015,1,5,10,0,13070,0
5024,2014,10,13,16,0,16219,0
3036,2014,9,2,6,0,6988,0


In [18]:
from sklearn.impute import SimpleImputer

In [19]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(df_train)

In [29]:
df_train_input = pd.DataFrame(imp.transform(df_train), columns=df_train.columns).astype(int)

In [36]:
df_train_input[df_train_input['anomaly'] == 1]

Unnamed: 0,year,month,day,hour,minute,value,anomaly
6,2015,1,27,6,30,15262,1
10,2014,11,2,17,30,15262,1
23,2014,12,26,0,30,15262,1
35,2014,11,28,14,30,15262,1
41,2014,12,25,5,30,15262,1
...,...,...,...,...,...,...,...
8217,2014,10,30,17,0,15262,1
8224,2014,11,27,23,0,15262,1
8227,2014,12,26,5,0,15262,1
8233,2014,12,25,4,0,15262,1


In [38]:
df_train_ori[df_train_ori['anomaly'] == 1]

Unnamed: 0,year,month,day,hour,minute,value,anomaly
10093,2015,1,27,6,30,107,1
5987,2014,11,2,17,30,19243,1
8545,2014,12,26,0,30,5312,1
7229,2014,11,28,14,30,18774,1
8507,2014,12,25,5,30,1756,1
...,...,...,...,...,...,...,...
5842,2014,10,30,17,0,16382,1
7198,2014,11,27,23,0,12592,1
8554,2014,12,26,5,0,1459,1
8504,2014,12,25,4,0,3206,1


In [None]:
import pandas as pd
import numpy as np

data = {
    'score': [10, 20, 30, 40, 50],
    'anomaly': [False, False, False, False, False]
}
df = pd.DataFrame(data)

random_indices = np.random.choice(df.index, size=df.shape[0]//4, replace=False)
df.loc[random_indices, 'score'] = 0

print(df)


In [None]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
SimpleImputer()
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))

In [None]:


import scipy.stats as stats
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score
from missingpy import MissForest
import sys
import sklearn.neighbors._base

sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base
from sklearn.impute import SimpleImputer, KNNImputer

In [None]:
df['year'] = df['timestamp'].apply(lambda x : x.year)
df['month'] = df['timestamp'].apply(lambda x : x.month)
df['day'] = df['timestamp'].apply(lambda x : x.day)
df['hour'] = df['timestamp'].apply(lambda x : x.hour)
df['minute'] = df['timestamp'].apply(lambda x : x.minute)

In [None]:
df.index = df['timestamp']
df.drop(['timestamp'], axis=1, inplace=True)
df.loc[df['anomaly'] == 1]

In [None]:
hotelling_df = pd.DataFrame()
hotelling_df['value'] = df['value']
mean = hotelling_df['value'].mean()
std = hotelling_df['value'].std()
hotelling_df['anomaly_score'] = [((x - mean)/std) ** 2 for x in hotelling_df['value']]
hotelling_df['anomaly_threshold'] = stats.chi2.ppf(q=0.8, df=1)
hotelling_df['anomaly']  = hotelling_df.apply(lambda x : 1 if x['anomaly_score'] > x['anomaly_threshold'] else 0, axis=1)
hotelling_f1 = f1_score(df['anomaly'], hotelling_df['anomaly'])
print(f'Hotelling\'s T2 F1 Score : {hotelling_f1}')

In [None]:
ocsvm_model = OneClassSVM(nu=0.2, gamma=0.001, kernel='rbf')
ocsvm_ret = ocsvm_model.fit_predict(df['value'].values.reshape(-1, 1))
ocsvm_df = pd.DataFrame()
ocsvm_df['value'] = df['value']
ocsvm_df['anomaly']  = [1 if i==-1 else 0 for i in ocsvm_ret]
ocsvm_f1 = f1_score(df['anomaly'], ocsvm_df['anomaly'])
print(f'One-Class SVM F1 Score : {ocsvm_f1}')

In [None]:
import numpy as np
miss_data = df.copy()
timestamp = miss_data.index
miss_data.loc[miss_data["anomaly"] == 1, "value"] = np.nan
miss_data.loc[miss_data["anomaly"] == 1]

In [None]:
def RMSE(original, filled):
    from sklearn.metrics import mean_squared_error

    score = np.sqrt(mean_squared_error(original, filled))
    return score


def MAE(original, filled):
    from sklearn.metrics import mean_absolute_error

    score = mean_absolute_error(original, filled)
    return score


def MAPE(original, filled):
    from sklearn.metrics import mean_absolute_percentage_error

    score = mean_absolute_percentage_error(original, filled)
    return score

def metric_calc(X_filled, complete_data):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaled_complete_data = scaler.fit_transform(complete_data)
    scaled_X_filled = scaler.fit_transform(X_filled)

    rmse = RMSE(scaled_complete_data, scaled_X_filled)
    print("RMSE=", rmse)

    mae = MAE(scaled_complete_data, scaled_X_filled)
    print("MAE=", mae)

    mape = MAPE(scaled_complete_data, scaled_X_filled)
    print("MAPE=", mape)

In [None]:
mf = MissForest()

X_filled = mf.fit_transform(miss_data)
X_filled = pd.DataFrame(X_filled, columns = ['value', 'anomaly', 'year', 'month', 'day', 'hour', 'minute'])
X_filled.index = timestamp
# print(X_filled)

metric_calc(df, X_filled)

In [None]:
X_filled = pd.DataFrame(miss_data).interpolate(
            method="linear", limit_direction="both"
        )
X_filled.index = timestamp
# print(X_filled)

metric_calc(df, X_filled)

In [None]:
imp = KNNImputer()
X_filled = imp.fit_transform(miss_data)
X_filled = pd.DataFrame(X_filled, columns = ['value', 'anomaly', 'year', 'month', 'day', 'hour', 'minute'])
X_filled.index = timestamp
X_filled

metric_calc(df, X_filled)