In [None]:
import pandas as pd
import numpy as np
from missingpy import MissForest
from sklearn.impute import KNNImputer, SimpleImputer

# Preprocessing

In [None]:
# Load the NYC taxi series and parse its 'timestamp' column into datetimes
raw = pd.read_csv("data/nyc_taxi.csv", low_memory=False).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# Show the loaded frame as the cell output
raw

In [None]:
# Anomaly event windows, each given as a [start, end] pair of timestamp strings.
anomaly_points = [
    ["2014-10-30 15:30:00.000000", "2014-11-03 22:30:00.000000"],
    ["2014-11-25 12:00:00.000000", "2014-11-29 19:00:00.000000"],
    ["2014-12-23 11:30:00.000000", "2014-12-27 18:30:00.000000"],
    ["2014-12-29 21:30:00.000000", "2015-01-03 04:30:00.000000"],
    ["2015-01-24 20:30:00.000000", "2015-01-29 03:30:00.000000"],
]

In [None]:
# Labeling: 1 for rows inside any anomaly window (both bounds inclusive), else 0
raw['anomaly'] = 0  # every row starts out as normal
for window_start, window_end in anomaly_points:
    inside_window = raw['timestamp'].between(window_start, window_end)
    raw.loc[inside_window, 'anomaly'] = 1

In [None]:
# Break the timestamp into calendar features and carry over the value and label
df = pd.DataFrame({
    'year': raw['timestamp'].dt.year,
    'month': raw['timestamp'].dt.month,
    'day': raw['timestamp'].dt.day,
    'hour': raw['timestamp'].dt.hour,
    # the minute component is not extracted
    'value': raw['value'],
    'anomaly': raw['anomaly'],
})

# The raw frame is no longer needed
del raw

# Preview dataset
df

## Time Series Split

In [None]:
# Row count that makes up the first 80% of the frame (the training share)
num_rows = int(0.8 * len(df))

# Positional split that keeps row order: leading 80% for training, trailing 20% for testing
df_train, df_test = df.iloc[:num_rows], df.iloc[num_rows:]

# The combined frame is no longer needed
del df

In [None]:
# Count normal (0) vs. anomaly (1) rows in the training split
df_train["anomaly"].value_counts()

In [None]:
# Count normal (0) vs. anomaly (1) rows in the testing split
df_test["anomaly"].value_counts()

# Imputation dataset

Training dataset

In [None]:
# Split the normal and the anomaly data
df_train_normal = df_train[df_train['anomaly'] == 0]
df_train_anomaly = df_train[df_train['anomaly'] == 1]

# Normal training data: work on a deep copy so the ground truth stays intact
df_train_normal_nan = df_train_normal.copy(deep=True)

# Randomly replace 10% of the normal values with nan.
# A dedicated seeded generator makes the masked rows (and therefore every
# downstream metric) identical across Restart & Run All.
train_rng = np.random.default_rng(1337)
num_replaced = int(0.1 * len(df_train_normal_nan))
random_indices = train_rng.choice(
    df_train_normal_nan.index.to_numpy(), size=num_replaced, replace=False
)
df_train_normal_nan.loc[random_indices, 'value'] = np.nan

# Anomaly training data: again a deep copy of the ground truth
df_train_anomaly_nan = df_train_anomaly.copy(deep=True)

# Replace all of the anomaly values with nan
df_train_anomaly_nan['value'] = np.nan

In [None]:
# Feature matrices as numpy arrays; the 'anomaly' label column is dropped first
X_train_normal = df_train_normal.drop(columns='anomaly').to_numpy()
X_train_normal_nan = df_train_normal_nan.drop(columns='anomaly').to_numpy()
X_train_anomaly = df_train_anomaly.drop(columns='anomaly').to_numpy()
X_train_anomaly_nan = df_train_anomaly_nan.drop(columns='anomaly').to_numpy()

# No y_train_* label arrays are built here; if they are needed, take
# <frame>[['anomaly']].to_numpy() from the matching frame before it is deleted.


In [None]:
# Drop the training dataframes now that only the numpy arrays are used
del (
    df_train,
    df_train_normal,
    df_train_normal_nan,
    df_train_anomaly,
    df_train_anomaly_nan,
)

Testing dataset

In [None]:
# Split the normal and the anomaly data
df_test_normal = df_test[df_test['anomaly'] == 0]
df_test_anomaly = df_test[df_test['anomaly'] == 1]

# Normal testing data: work on a deep copy so the ground truth stays intact
df_test_normal_nan = df_test_normal.copy(deep=True)

# Randomly replace 10% of the normal values with nan.
# A dedicated seeded generator makes the masked rows (and therefore every
# downstream metric) identical across Restart & Run All.
test_rng = np.random.default_rng(1337)
num_replaced = int(0.1 * len(df_test_normal_nan))
random_indices = test_rng.choice(
    df_test_normal_nan.index.to_numpy(), size=num_replaced, replace=False
)
df_test_normal_nan.loc[random_indices, 'value'] = np.nan

# Anomaly testing data: again a deep copy of the ground truth
df_test_anomaly_nan = df_test_anomaly.copy(deep=True)

# Replace all of the anomaly values with nan
df_test_anomaly_nan['value'] = np.nan

In [None]:
# Feature matrices as numpy arrays; the 'anomaly' label column is dropped first
X_test_normal = df_test_normal.drop(columns='anomaly').to_numpy()
X_test_normal_nan = df_test_normal_nan.drop(columns='anomaly').to_numpy()
X_test_anomaly = df_test_anomaly.drop(columns='anomaly').to_numpy()
X_test_anomaly_nan = df_test_anomaly_nan.drop(columns='anomaly').to_numpy()

# No y_test_* label arrays are built here; if they are needed, take
# <frame>[['anomaly']].to_numpy() from the matching frame before it is deleted.

In [None]:
# Drop the testing dataframes now that only the numpy arrays are used
del (
    df_test,
    df_test_normal,
    df_test_normal_nan,
    df_test_anomaly,
    df_test_anomaly_nan,
)

# Common Functions

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler


def RMSE(original, filled):
    """Return the root-mean-squared error between ``original`` and ``filled``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    return np.sqrt(mean_squared_error(y_true=original, y_pred=filled))


def MAE(original, filled):
    """Return the mean absolute error between ``original`` and ``filled``.

    Thin wrapper around scikit-learn's ``mean_absolute_error``.
    """
    return mean_absolute_error(y_true=original, y_pred=filled)


def MAPE(original, filled):
    """Return the mean absolute percentage error of ``filled`` w.r.t. ``original``.

    Thin wrapper around scikit-learn's ``mean_absolute_percentage_error``.
    """
    return mean_absolute_percentage_error(y_true=original, y_pred=filled)


def metric_calc(X_filled, complete_data):
    """Print RMSE, MAE and MAPE of imputed data against the ground truth.

    Both arrays are min-max scaled with a single scaler fitted on
    ``complete_data`` so that the errors are measured on one common scale.

    Parameters
    ----------
    X_filled : array-like
        Data after imputation; must have the same shape as ``complete_data``.
    complete_data : array-like
        Ground-truth data without missing values.

    Returns
    -------
    None
        The three scores are printed, not returned.
    """
    scaler = MinMaxScaler()
    scaled_complete_data = scaler.fit_transform(complete_data)
    # Reuse the scaler fitted on the ground truth. Refitting it on the imputed
    # data would map each array onto [0, 1] with its own min/max, so identical
    # scaled values would no longer stand for identical raw values and the
    # reported errors would be meaningless.
    scaled_X_filled = scaler.transform(X_filled)

    rmse = RMSE(scaled_complete_data, scaled_X_filled)
    print("RMSE=", rmse)

    mae = MAE(scaled_complete_data, scaled_X_filled)
    print("MAE=", mae)

    # NOTE(review): min-max scaling maps each ground-truth column minimum to 0,
    # so some MAPE denominators are 0 and the score can become very large —
    # confirm that MAPE on scaled data is the intended metric.
    mape = MAPE(scaled_complete_data, scaled_X_filled)
    print("MAPE=", mape)


def transform_metric(imputer, X, X_nan):
    """Impute ``X_nan`` with ``imputer`` and print error metrics against ``X``.

    ``imputer`` must expose ``transform``; ``X`` is the complete ground truth
    that matches ``X_nan`` row for row.
    """
    metric_calc(imputer.transform(X_nan), X)

# KNNImputer

In [None]:
# KNN imputer using 4 equally weighted neighbours, fitted on the masked normal training data
imputer = KNNImputer(weights="uniform", n_neighbors=4)
imputer.fit(X=X_train_normal_nan)

In [None]:
# Score the fitted imputer on every split: (heading, [(label, ground truth, masked), ...])
for heading, splits in (
    ("NORMAL", (
        ("X_train_normal", X_train_normal, X_train_normal_nan),
        ("X_test_normal", X_test_normal, X_test_normal_nan),
    )),
    ("\nANOMALY", (
        ("X_train_anomaly", X_train_anomaly, X_train_anomaly_nan),
        ("X_test_anomaly", X_test_anomaly, X_test_anomaly_nan),
    )),
):
    print(heading)
    for label, X_true, X_masked in splits:
        print(label)
        transform_metric(imputer, X_true, X_masked)

# SimpleImputer

In [None]:
# Mean imputer, fitted on the masked normal training data
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# NOTE(review): despite its name, this variable holds the return value of fit()
# (the fitted estimator in scikit-learn), not imputed data.
X_train_normal_filled = imputer.fit(X=X_train_normal_nan)

In [None]:
# Score the fitted imputer on every split: (heading, [(label, ground truth, masked), ...])
for heading, splits in (
    ("NORMAL", (
        ("X_train_normal", X_train_normal, X_train_normal_nan),
        ("X_test_normal", X_test_normal, X_test_normal_nan),
    )),
    ("\nANOMALY", (
        ("X_train_anomaly", X_train_anomaly, X_train_anomaly_nan),
        ("X_test_anomaly", X_test_anomaly, X_test_anomaly_nan),
    )),
):
    print(heading)
    for label, X_true, X_masked in splits:
        print(label)
        transform_metric(imputer, X_true, X_masked)

# MissForest

In [None]:
# MissForest imputer with a fixed seed, fitted on the masked normal training data
imputer = MissForest(random_state=1337)
# NOTE(review): despite its name, this variable holds the return value of fit(),
# presumably the fitted estimator (scikit-learn convention) rather than imputed
# data — confirm against missingpy.
X_train_normal_filled = imputer.fit(X=X_train_normal_nan)

In [None]:
# Score the fitted imputer on the normal splits: (label, ground truth, masked)
print("NORMAL")
for label, X_true, X_masked in (
    ("X_train_normal", X_train_normal, X_train_normal_nan),
    ("X_test_normal", X_test_normal, X_test_normal_nan),
):
    print(label)
    transform_metric(imputer, X_true, X_masked)

print("\nANOMALY")
# The training-anomaly label is still printed, but its evaluation is disabled,
# so no metrics follow it in the output.
print("X_train_anomaly")
# transform_metric(imputer, X_train_anomaly, X_train_anomaly_nan)

print("X_test_anomaly")
transform_metric(imputer, X_test_anomaly, X_test_anomaly_nan)