In [1]:
import pandas as pd
import numpy as np
from missingpy import MissForest
from sklearn.impute import KNNImputer, SimpleImputer


# Preprocessing

In [2]:
# Read the dataset
df = pd.read_csv("data/nyc_taxi.csv",low_memory=False)
# Convert the timestamp
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day

# Show data sample
df.head(3)

Unnamed: 0,timestamp,value,year,month,day
0,2014-07-01 00:00:00,10844,2014,7,1
1,2014-07-01 00:30:00,8127,2014,7,1
2,2014-07-01 01:00:00,6210,2014,7,1


In [3]:
# The times of anomaly events.
anomaly_points = [
        [
            "2014-10-30 15:30:00.000000",
            "2014-11-03 22:30:00.000000"
        ],
        [
            "2014-11-25 12:00:00.000000",
            "2014-11-29 19:00:00.000000"
        ],
        [
            "2014-12-23 11:30:00.000000",
            "2014-12-27 18:30:00.000000"
        ],
        [
            "2014-12-29 21:30:00.000000",
            "2015-01-03 04:30:00.000000"
        ],
        [
            "2015-01-24 20:30:00.000000",
            "2015-01-29 03:30:00.000000"
        ]
]

In [4]:
# Labeling: if anomaly then 1 else 0
df['anomaly'] = 0  # Set default values
for start, end in anomaly_points:
    df.loc[((df['timestamp'] >= start) & (df['timestamp'] <= end)), 'anomaly'] = 1

## Time Series Split

In [5]:
df.head()

Unnamed: 0,timestamp,value,year,month,day,anomaly
0,2014-07-01 00:00:00,10844,2014,7,1,0
1,2014-07-01 00:30:00,8127,2014,7,1,0
2,2014-07-01 01:00:00,6210,2014,7,1,0
3,2014-07-01 01:30:00,4656,2014,7,1,0
4,2014-07-01 02:00:00,3820,2014,7,1,0


In [6]:
# Calculate the number of rows representing 80% of the DataFrame for training
num_rows = int(0.8 * len(df))

# Get the first 80% of the DataFrame
df_train = df[:num_rows]

# Get the remaining 20% of the DataFrame
df_test = df[num_rows:]

In [7]:
df_train['anomaly'].value_counts()

0    7842
1     414
Name: anomaly, dtype: int64

In [8]:
df_test['anomaly'].value_counts()

0    1443
1     621
Name: anomaly, dtype: int64

In [9]:
# Copy the training dataset and replace the values of its anomaly rows with nan 
df_train_nan = df_train.copy(deep=True)
df_train_nan.loc[df_train_nan['anomaly'] == 1, 'value'] = np.nan

In [10]:
# Training data in 
X = df_train.drop(columns=['timestamp', 'anomaly'])
X_nan = df_train_nan.drop(columns=['timestamp', 'anomaly'])

In [11]:
# Delete unused dataframe
del df
del df_train_nan

# Common Function

In [12]:
def RMSE(original, filled):
    from sklearn.metrics import mean_squared_error

    score = np.sqrt(mean_squared_error(original, filled))
    return score


def MAE(original, filled):
    from sklearn.metrics import mean_absolute_error

    score = mean_absolute_error(original, filled)
    return score


def MAPE(original, filled):
    from sklearn.metrics import mean_absolute_percentage_error

    score = mean_absolute_percentage_error(original, filled)
    return score

def metric_calc(X_filled, complete_data):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaled_complete_data = scaler.fit_transform(complete_data)
    scaled_X_filled = scaler.fit_transform(X_filled)

    rmse = RMSE(scaled_complete_data, scaled_X_filled)
    print("RMSE=", rmse)

    mae = MAE(scaled_complete_data, scaled_X_filled)
    print("MAE=", mae)

    mape = MAPE(scaled_complete_data, scaled_X_filled)
    print("MAPE=", mape)

# MissForest

In [13]:
imputer = MissForest(random_state=1337)
X_filled = imputer.fit_transform(X_nan)
metric_calc(X_filled, X)

Iteration: 0
Iteration: 1
Iteration: 2
RMSE= 0.066554510752349
MAE= 0.029065705605619097
MAPE= 0.09664555230880523


# KNNImputer

In [14]:
imputer = KNNImputer(n_neighbors=4, weights="uniform")
X_filled = imputer.fit_transform(X_nan)
metric_calc(X_filled, X)

RMSE= 0.068695774259528
MAE= 0.029648189543666027
MAPE= 0.09358812795175249


# SimpleImputer

In [15]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_filled = imputer.fit_transform(X_nan)
metric_calc(X_filled, X)

RMSE= 0.06583802450655411
MAE= 0.028846542538598027
MAPE= 0.0963735704144719
