In [1]:
import pandas as pd
import numpy as np
from missingpy import MissForest

# Preprocessing

In [2]:
# Read the dataset
df = pd.read_csv("data/nyc_taxi.csv",low_memory=False)
# Convert the timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['year'] = df['timestamp'].dt.year

# Show data sample
df.head(3)

Unnamed: 0,timestamp,value,year
0,2014-07-01 00:00:00,10844,2014
1,2014-07-01 00:30:00,8127,2014
2,2014-07-01 01:00:00,6210,2014


In [3]:
# The times of anomaly events.
anomaly_points = [
        [
            "2014-10-30 15:30:00.000000",
            "2014-11-03 22:30:00.000000"
        ],
        [
            "2014-11-25 12:00:00.000000",
            "2014-11-29 19:00:00.000000"
        ],
        [
            "2014-12-23 11:30:00.000000",
            "2014-12-27 18:30:00.000000"
        ],
        [
            "2014-12-29 21:30:00.000000",
            "2015-01-03 04:30:00.000000"
        ],
        [
            "2015-01-24 20:30:00.000000",
            "2015-01-29 03:30:00.000000"
        ]
]

In [4]:
# Labeling: if anomaly then 1 else 0
df['anomaly'] = 0  # Set default values
for start, end in anomaly_points:
    df.loc[((df['timestamp'] >= start) & (df['timestamp'] <= end)), 'anomaly'] = 1

## Time Series Split

In [5]:
df.head()

Unnamed: 0,timestamp,value,year,anomaly
0,2014-07-01 00:00:00,10844,2014,0
1,2014-07-01 00:30:00,8127,2014,0
2,2014-07-01 01:00:00,6210,2014,0
3,2014-07-01 01:30:00,4656,2014,0
4,2014-07-01 02:00:00,3820,2014,0


In [6]:
# Calculate the number of rows representing 80% of the DataFrame for training
num_rows = int(0.8 * len(df))

# Get the first 80% of the DataFrame
df_train = df[:num_rows]

# Get the remaining 20% of the DataFrame
df_test = df[num_rows:]

In [15]:
df_train['year'].value_counts()

2014    8256
Name: year, dtype: int64

In [16]:
df_test['year'].value_counts()

2015    1488
2014     576
Name: year, dtype: int64

In [17]:
df_train['anomaly'].value_counts()

0    7842
1     414
Name: anomaly, dtype: int64

In [18]:
df_test['anomaly'].value_counts()

0    1443
1     621
Name: anomaly, dtype: int64

In [7]:
# Delete the original dataset
del df

# Testing

In [8]:
df_train_nan = df_train.copy(deep=True)

df_train_nan.loc[df_train_nan['anomaly'] == 1, 'value'] = np.nan

In [9]:
df_train_nan[df_train_nan['anomaly'] == 1]

Unnamed: 0,timestamp,value,year,anomaly
5839,2014-10-30 15:30:00,,2014,1
5840,2014-10-30 16:00:00,,2014,1
5841,2014-10-30 16:30:00,,2014,1
5842,2014-10-30 17:00:00,,2014,1
5843,2014-10-30 17:30:00,,2014,1
...,...,...,...,...
7282,2014-11-29 17:00:00,,2014,1
7283,2014-11-29 17:30:00,,2014,1
7284,2014-11-29 18:00:00,,2014,1
7285,2014-11-29 18:30:00,,2014,1


In [10]:
X = df_train_nan[['value', 'year']]
X

Unnamed: 0,value,year
0,10844.0,2014
1,8127.0,2014
2,6210.0,2014
3,4656.0,2014
4,3820.0,2014
...,...,...
8251,26403.0,2014
8252,26905.0,2014
8253,26723.0,2014
8254,25807.0,2014


In [19]:
imputer = MissForest(random_state=1337)
df_train_input = pd.DataFrame(imputer.fit_transform(X), columns=['value', 'year'])

Iteration: 0
Iteration: 1
Iteration: 2


In [21]:
df_train.loc[df_train['anomaly'] == 1]

Unnamed: 0,timestamp,value,year,anomaly
5839,2014-10-30 15:30:00,16749,2014,1
5840,2014-10-30 16:00:00,14604,2014,1
5841,2014-10-30 16:30:00,13367,2014,1
5842,2014-10-30 17:00:00,16382,2014,1
5843,2014-10-30 17:30:00,19879,2014,1
...,...,...,...,...
7282,2014-11-29 17:00:00,18684,2014,1
7283,2014-11-29 17:30:00,20891,2014,1
7284,2014-11-29 18:00:00,21554,2014,1
7285,2014-11-29 18:30:00,22678,2014,1


In [22]:
df_train_input.loc[df_train['anomaly'] == 1]

Unnamed: 0,value,year
5839,15427.538924,2014.0
5840,15427.538924,2014.0
5841,15427.538924,2014.0
5842,15427.538924,2014.0
5843,15427.538924,2014.0
...,...,...
7282,15427.538924,2014.0
7283,15427.538924,2014.0
7284,15427.538924,2014.0
7285,15427.538924,2014.0
