In [1]:
import numpy as np
import pandas as pd
from missingpy import MissForest
from sklearn.impute import KNNImputer, SimpleImputer

# Reading Dataset

In [2]:
# Load data/nyc_taxi.csv and parse the 'timestamp' strings into datetime values
raw = (
    pd.read_csv("data/nyc_taxi.csv", low_memory=False)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

# Show the loaded frame
raw

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820
...,...,...
10315,2015-01-31 21:30:00,24670
10316,2015-01-31 22:00:00,25721
10317,2015-01-31 22:30:00,27309
10318,2015-01-31 23:00:00,26591


In [3]:
# Ground-truth anomaly windows, each given as [start, end] timestamp strings
anomaly_points = [
    ["2014-10-30 15:30:00.000000", "2014-11-03 22:30:00.000000"],
    ["2014-11-25 12:00:00.000000", "2014-11-29 19:00:00.000000"],
    ["2014-12-23 11:30:00.000000", "2014-12-27 18:30:00.000000"],
    ["2014-12-29 21:30:00.000000", "2015-01-03 04:30:00.000000"],
    ["2015-01-24 20:30:00.000000", "2015-01-29 03:30:00.000000"],
]

# Label every row: 1 when its timestamp lies inside any window (both ends included), else 0
raw['anomaly'] = 0
for start, end in anomaly_points:
    inside_window = raw['timestamp'].between(start, end)
    raw.loc[inside_window, 'anomaly'] = 1

# Show the labeled frame
raw

Unnamed: 0,timestamp,value,anomaly
0,2014-07-01 00:00:00,10844,0
1,2014-07-01 00:30:00,8127,0
2,2014-07-01 01:00:00,6210,0
3,2014-07-01 01:30:00,4656,0
4,2014-07-01 02:00:00,3820,0
...,...,...,...
10315,2015-01-31 21:30:00,24670,0
10316,2015-01-31 22:00:00,25721,0
10317,2015-01-31 22:30:00,27309,0
10318,2015-01-31 23:00:00,26591,0


In [4]:
# Number of rows per label over the whole dataset (0 = normal, 1 = inside an anomaly window)
raw['anomaly'].value_counts()

0    9285
1    1035
Name: anomaly, dtype: int64

# Preprocessing

In [5]:
# Break each timestamp into calendar features and keep the reading and its label alongside.
# NOTE(review): the minute component is not kept, so the two half-hourly readings of an
# hour end up with identical year/month/day/hour values — confirm this is intended.
df = pd.DataFrame({
    'year': raw['timestamp'].dt.year,
    'month': raw['timestamp'].dt.month,
    'day': raw['timestamp'].dt.day,
    'hour': raw['timestamp'].dt.hour,
    'value': raw['value'],
    'anomaly': raw['anomaly'],
})

# The raw frame is not used below this point; release it
del raw

# Show the feature frame
df

Unnamed: 0,year,month,day,hour,value,anomaly
0,2014,7,1,0,10844,0
1,2014,7,1,0,8127,0
2,2014,7,1,1,6210,0
3,2014,7,1,1,4656,0
4,2014,7,1,2,3820,0
...,...,...,...,...,...,...
10315,2015,1,31,21,24670,0
10316,2015,1,31,22,25721,0
10317,2015,1,31,22,27309,0
10318,2015,1,31,23,26591,0


In [6]:
# Positional 80/20 split without shuffling: the leading 80% of rows are used for
# training and the trailing 20% for testing
num_rows = int(len(df) * 0.8)
df_train, df_test = df.iloc[:num_rows], df.iloc[num_rows:]

# The full frame is not used below this point; release it
del df

In [7]:
# Number of rows per label in the training split (first 80% of the rows)
df_train['anomaly'].value_counts()

0    7842
1     414
Name: anomaly, dtype: int64

In [8]:
# Number of rows per label in the test split (remaining 20% of the rows)
df_test['anomaly'].value_counts()

0    1443
1     621
Name: anomaly, dtype: int64

In [9]:
# Feature columns only (label removed); 'value' is the last remaining column
train_features = df_train.drop(columns=['anomaly'])
normal_rows = df_train['anomaly'].eq(0)
anomaly_rows = df_train['anomaly'].eq(1)

# Normal training rows as a float matrix, plus an independent copy of their 'value' column
X_train_normal = train_features.loc[normal_rows].to_numpy(dtype=float)
y_train_normal = X_train_normal[:, -1].copy()

# Anomalous training rows as a float matrix, plus an independent copy of their 'value' column
X_train_anomaly = train_features.loc[anomaly_rows].to_numpy(dtype=float)
y_train_anomaly = X_train_anomaly[:, -1].copy()


In [10]:
# Normal training rows with the label column removed (the frame behind X_train_normal)
df_train.loc[df_train['anomaly'].eq(0)].drop(columns=['anomaly'])

Unnamed: 0,year,month,day,hour,value
0,2014,7,1,0,10844
1,2014,7,1,0,8127
2,2014,7,1,1,6210
3,2014,7,1,1,4656
4,2014,7,1,2,3820
...,...,...,...,...,...
8251,2014,12,19,21,26403
8252,2014,12,19,22,26905
8253,2014,12,19,22,26723
8254,2014,12,19,23,25807


In [11]:
# Inspect the normal training matrix; columns are year, month, day, hour, value
X_train_normal

array([[2.0140e+03, 7.0000e+00, 1.0000e+00, 0.0000e+00, 1.0844e+04],
       [2.0140e+03, 7.0000e+00, 1.0000e+00, 0.0000e+00, 8.1270e+03],
       [2.0140e+03, 7.0000e+00, 1.0000e+00, 1.0000e+00, 6.2100e+03],
       ...,
       [2.0140e+03, 1.2000e+01, 1.9000e+01, 2.2000e+01, 2.6723e+04],
       [2.0140e+03, 1.2000e+01, 1.9000e+01, 2.3000e+01, 2.5807e+04],
       [2.0140e+03, 1.2000e+01, 1.9000e+01, 2.3000e+01, 2.6432e+04]])

In [12]:
# Inspect the copied 'value' column of the normal training rows
y_train_normal

array([10844.,  8127.,  6210., ..., 26723., 25807., 26432.])

# Finding Threshold

In [13]:
# Hide the 'value' of a random subset of normal training rows so that the imputed
# values can later be compared against the untouched copy in y_train_normal.

# Fraction of normal training rows whose 'value' is masked
MASK_FRACTION = 0.1
# Fixed seed: the same rows are masked on every run, so results are reproducible and
# re-running this cell does not mask an additional, different 10% of the rows
MASK_SEED = 42

# Number of rows to mask
num_rows_to_select = int(MASK_FRACTION * X_train_normal.shape[0])

# Draw distinct row indices from a dedicated, seeded generator
rng = np.random.default_rng(MASK_SEED)
random_indices = rng.choice(X_train_normal.shape[0], size=num_rows_to_select, replace=False)

# Replace the 'value' column (last column) with NaN for the selected rows (in place)
X_train_normal[random_indices, -1] = np.nan

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [15]:
# Fill the masked 'value' entries using each row's 4 nearest neighbours, equally weighted.
# The imputer is fitted on, and then applied to, the same masked matrix.
imputer = KNNImputer(n_neighbors=4, weights="uniform")
imputer.fit(X_train_normal)
X_train_imputation = imputer.transform(X_train_normal)

In [23]:
# Original 'value' readings at the masked rows (taken from the copy made before masking)
y_train_normal[random_indices]

array([16699., 14612., 24891., 19835., 11392., 13635., 26429.,  4896.,
       17922., 17500., 19812., 24822., 25079.,  2507., 18847., 23444.,
       18250., 18988., 19815., 12187., 23287., 14136.,  2370., 19533.,
       16312., 18835., 17477.,  2726., 18552., 16973., 15872., 12747.,
        9502., 18499., 25036., 23869., 18698., 14421., 23849.,  5653.,
       23913., 22112.,  2346.,  4566., 19045., 15239., 14990.,  4420.,
       26320.,  2268., 15796., 17264., 16292., 15210., 24630., 14548.,
       20534., 17405., 19103.,  2671., 15853., 17563.,  4347., 17287.,
       19871., 18639.,  4622.,  3879., 16106., 23098.,  2515., 16976.,
       12168., 16210., 16707., 13462., 19626., 17732., 17325., 16857.,
       18341.,  7009., 16460., 16558., 15673., 19501.,  5478.,  2583.,
       15675., 16979., 14453., 14755., 17801., 25251., 23408., 19221.,
       21926., 18462., 22624., 19032.,  3093., 15986., 16929., 17302.,
       17534., 19372.,  5087., 26225., 25209., 19243., 18680., 12022.,
      

In [24]:
# Imputed 'value' entries at the same masked rows, for comparison with the originals
X_train_imputation[random_indices, -1]

array([17979.75, 14353.5 , 20502.75, 14909.  ,  6439.75, 14716.  ,
       22852.25,  4602.75, 19018.75, 16387.  , 22534.25, 22802.5 ,
       22562.5 ,  3366.75, 17664.25, 21413.75, 17858.  , 20131.25,
       17608.75, 14020.25, 20858.5 , 12610.75,  2394.5 , 20131.25,
       17130.  , 16570.75, 15620.25,  5080.5 , 17240.5 , 17241.25,
       17020.75, 10911.25,  7934.  , 18081.  , 21107.25, 22720.25,
       16906.25, 16622.  , 20613.  ,  5559.5 , 24017.  , 21059.25,
        3665.  ,  8069.75, 16963.75, 14674.5 , 14862.25,  3858.25,
       24606.  ,  3065.5 , 16791.25, 16700.75, 16284.75, 16028.  ,
       24379.25, 22474.75, 22646.5 , 14978.5 , 19367.  ,  8778.75,
       17847.5 , 21004.5 ,  5154.75, 20076.5 , 18796.75, 19170.75,
       10015.5 ,  6224.25, 16184.  , 22685.25,  5665.  , 17051.5 ,
       15864.  , 15800.  , 17997.25, 14765.  , 19369.  , 16123.75,
       17000.25, 17184.75, 22934.5 , 10521.  , 17382.5 , 16891.75,
       16412.5 , 16926.25,  6010.75,  5505.25, 16538.  , 16946