In [42]:
import numpy as np
import pandas as pd
from missingpy import MissForest
from sklearn.impute import KNNImputer, SimpleImputer

# Reading Dataset

In [43]:
# Read the dataset
raw = pd.read_csv("data/nyc_taxi.csv",low_memory=False)
raw['timestamp'] = pd.to_datetime(raw['timestamp'])

# Preview raw dataset
raw

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820
...,...,...
10315,2015-01-31 21:30:00,24670
10316,2015-01-31 22:00:00,25721
10317,2015-01-31 22:30:00,27309
10318,2015-01-31 23:00:00,26591


In [44]:
# The times of anomaly events (Ground Truth)
anomaly_points = [
        [
            "2014-10-30 15:30:00.000000",
            "2014-11-03 22:30:00.000000"
        ],
        [
            "2014-11-25 12:00:00.000000",
            "2014-11-29 19:00:00.000000"
        ],
        [
            "2014-12-23 11:30:00.000000",
            "2014-12-27 18:30:00.000000"
        ],
        [
            "2014-12-29 21:30:00.000000",
            "2015-01-03 04:30:00.000000"
        ],
        [
            "2015-01-24 20:30:00.000000",
            "2015-01-29 03:30:00.000000"
        ]
]

# Labeling: if anomaly then 1 else 0
raw['anomaly'] = 0  # Set default values
for start, end in anomaly_points:
    raw.loc[((raw['timestamp'] >= start) & (raw['timestamp'] <= end)), 'anomaly'] = 1

# Preview labeled raw dataset
raw

Unnamed: 0,timestamp,value,anomaly
0,2014-07-01 00:00:00,10844,0
1,2014-07-01 00:30:00,8127,0
2,2014-07-01 01:00:00,6210,0
3,2014-07-01 01:30:00,4656,0
4,2014-07-01 02:00:00,3820,0
...,...,...,...
10315,2015-01-31 21:30:00,24670,0
10316,2015-01-31 22:00:00,25721,0
10317,2015-01-31 22:30:00,27309,0
10318,2015-01-31 23:00:00,26591,0


In [45]:
raw['anomaly'].value_counts()

0    9285
1    1035
Name: anomaly, dtype: int64

# Preprocessing

In [46]:
# Convert the timestamp
df = pd.DataFrame()
df['year'] = raw['timestamp'].dt.year
df['month'] = raw['timestamp'].dt.month
df['day'] = raw['timestamp'].dt.day
df['hour'] = raw['timestamp'].dt.hour
df['value'] = raw['value']
df['anomaly'] = raw['anomaly']

# delete unused dataframe
del raw

In [47]:
# Preview dataset
df

Unnamed: 0,year,month,day,hour,value,anomaly
0,2014,7,1,0,10844,0
1,2014,7,1,0,8127,0
2,2014,7,1,1,6210,0
3,2014,7,1,1,4656,0
4,2014,7,1,2,3820,0
...,...,...,...,...,...,...
10315,2015,1,31,21,24670,0
10316,2015,1,31,22,25721,0
10317,2015,1,31,22,27309,0
10318,2015,1,31,23,26591,0


In [48]:
# Calculate 80% of the number of rows for training
num_rows = int(0.8 * len(df))

# Get the first part of the DataFrame
df_train = df[:num_rows]
# Get the remaining part of the DataFrame
df_test = df[num_rows:]

# delete unused dataframe
del df

In [49]:
df_train['anomaly'].value_counts()

0    7842
1     414
Name: anomaly, dtype: int64

In [50]:
df_test['anomaly'].value_counts()

0    1443
1     621
Name: anomaly, dtype: int64

# Metrix Calculation

In [51]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler


# Root Square Error
def RSE(original, filled):
    score = np.sqrt(np.power(original-filled,2))
    return score


# Absolute Error
def AE(original, filled):
    score = np.absolute(original-filled)
    return score


# Absolute Percentage Error
def APE(original, filled):
    score = np.absolute(original-filled)/original
    return score


# Root Mean Square Error
def RMSE(original, filled):
    score = mean_squared_error(original, filled, squared=False)
    return score


# Mean Absolute Error
def MAE(original, filled):
    score = mean_absolute_error(original, filled)
    return score


# Mean Absolute Percentage Error
def MAPE(original, filled):
    score = mean_absolute_percentage_error(original, filled)
    return score


def metric_RMSE_MAE_MAPE(original, filled):
    rmse = RMSE(original, filled)
    print("RMSE=", rmse)

    mae = MAE(original, filled)
    print("MAE=", mae)

    mape = MAPE(original, filled)
    print("MAPE=", mape)

    return rmse, mae, mape


def metric_RSE_AE_APE(original, filled):
    rmse = RSE(original, filled)
    print("RSE=", rmse)

    mae = AE(original, filled)
    print("AE=", mae)

    mape = APE(original, filled)
    print("APE=", mape)

    return rmse, mae, mape

# DEBUG
# a = np.array([10, 20])
# b = np.array([12, 22])

# print("RSE=", RSE(a, b))
# print("AE=", AE(a, b))
# print("AP=", APE(a, b))
# print("RMSE=", RMSE(a, b))
# print("MAE=", MAE(a, b))
# print("MAPE=", MAPE(a, b))

# Finding Threshold

In [52]:
def train(imputer, X, y, percentage=0.1, imputed_col=-1):
    # Calculate the number of rows to select (10% of the total rows)
    num_rows_to_select = int(percentage * X.shape[0])

    # Randomly select row indices to change
    random_indices = np.random.choice(X.shape[0], size=num_rows_to_select, replace=False)

    # Change the value column to NaN for the selected rows
    X[random_indices, imputed_col] = np.nan

    X_imputation = imputer.fit_transform(X)

    # The results of the training process is the RMSE, MAE, and MAPE values
    return metric_RMSE_MAE_MAPE(y[random_indices], X_imputation[random_indices, imputed_col])



In [53]:
# Get normal training data then convert it to numpy float
X = df_train[df_train['anomaly'] == 0].drop(columns=['anomaly']).to_numpy(dtype=float)
# Copy the 'value' column of normal training data
y = X[:, -1].copy()

imputer = KNNImputer(n_neighbors=4, weights="uniform")
rmse, mae, mape = train(imputer, X, y)

RMSE= 2778.0499062128883
MAE= 1967.547193877551
MAPE= 0.22624169437293468


In [41]:
# Get normal training data then convert it to numpy float
X = df_train.drop(columns=['anomaly']).to_numpy(dtype=float)


for i in range(X.shape[0]):
    X[i,-1] = np.nan
    print(X[:,-1])

[   nan  8127.  6210. ... 26723. 25807. 26432.]
[   nan    nan  6210. ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 26432.]
[   nan    nan    nan ... 26723. 25807. 

In [34]:
from numpy.lib.stride_tricks import sliding_window_view

x = np.arange(100, dtype=float)
x = x.reshape(20,5)

print(x)

for v in sliding_window_view(x, [2,x.shape[1]]):
    # w = v.copy()
    # w[5] = np.nan
    print(v)

# print(v)

[[ 0.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]
 [10. 11. 12. 13. 14.]
 [15. 16. 17. 18. 19.]
 [20. 21. 22. 23. 24.]
 [25. 26. 27. 28. 29.]
 [30. 31. 32. 33. 34.]
 [35. 36. 37. 38. 39.]
 [40. 41. 42. 43. 44.]
 [45. 46. 47. 48. 49.]
 [50. 51. 52. 53. 54.]
 [55. 56. 57. 58. 59.]
 [60. 61. 62. 63. 64.]
 [65. 66. 67. 68. 69.]
 [70. 71. 72. 73. 74.]
 [75. 76. 77. 78. 79.]
 [80. 81. 82. 83. 84.]
 [85. 86. 87. 88. 89.]
 [90. 91. 92. 93. 94.]
 [95. 96. 97. 98. 99.]]
[[[0. 1. 2. 3. 4.]
  [5. 6. 7. 8. 9.]]]
[[[ 5.  6.  7.  8.  9.]
  [10. 11. 12. 13. 14.]]]
[[[10. 11. 12. 13. 14.]
  [15. 16. 17. 18. 19.]]]
[[[15. 16. 17. 18. 19.]
  [20. 21. 22. 23. 24.]]]
[[[20. 21. 22. 23. 24.]
  [25. 26. 27. 28. 29.]]]
[[[25. 26. 27. 28. 29.]
  [30. 31. 32. 33. 34.]]]
[[[30. 31. 32. 33. 34.]
  [35. 36. 37. 38. 39.]]]
[[[35. 36. 37. 38. 39.]
  [40. 41. 42. 43. 44.]]]
[[[40. 41. 42. 43. 44.]
  [45. 46. 47. 48. 49.]]]
[[[45. 46. 47. 48. 49.]
  [50. 51. 52. 53. 54.]]]
[[[50. 51. 52. 53. 54.]
  [55. 56. 57. 58. 59.]]]

In [17]:





X = df_train.drop(columns=['anomaly']).to_numpy(dtype=float)
y = X[:, -1].copy()

imputer = KNNImputer(n_neighbors=4, weights="uniform")
X_imputation = imputer.fit_transform(X)
rse, ae, ape = metric_RSE_AE_APE(X_imputation[:,-1], y)

RSE= [0. 0. 0. ... 0. 0. 0.]
AE= [0. 0. 0. ... 0. 0. 0.]
APE= [0. 0. 0. ... 0. 0. 0.]


In [None]:
# imputer = KNNImputer(n_neighbors=4, weights="uniform")
# calc_threshold(imputer, X_train_anomaly, y_train_anomaly)

In [None]:
# imputer = KNNImputer(n_neighbors=4, weights="uniform")
# calc_threshold(imputer, X_test_normal, y_test_normal)

In [None]:
# imputer = KNNImputer(n_neighbors=4, weights="uniform")
# calc_threshold(imputer, X_test_anomaly, y_test_anomaly)

In [None]:
# # Calculate the number of rows to select (10% of the total rows)
# num_rows_to_select = int(0.1 * X_train_normal.shape[0])

# # Randomly select row indices to change
# random_indices = np.random.choice(X_train_normal.shape[0], size=num_rows_to_select, replace=False)

# # Change the value column to NaN for the selected rows
# X_train_normal[random_indices, -1] = np.nan

In [None]:
# imputer = KNNImputer(n_neighbors=4, weights="uniform")
# X_train_imputation = imputer.fit_transform(X_train_normal)

In [None]:
# metric_calc(y_train_normal[random_indices], X_train_imputation[random_indices, -1])

Anomaly data

In [None]:
# # Calculate the number of rows to select (10% of the total rows)
# num_rows_to_select = int(0.1 * X_train_anomaly.shape[0])

# # Randomly select row indices to change
# random_indices = np.random.choice(X_train_anomaly.shape[0], size=num_rows_to_select, replace=False)

# # Change the value column to NaN for the selected rows
# X_train_anomaly[random_indices, -1] = np.nan

In [None]:
# imputer = KNNImputer(n_neighbors=4, weights="uniform")
# X_train_imputation = imputer.fit_transform(X_train_anomaly)

In [None]:
# X_train_anomaly[random_indices].shape

In [None]:
# X_train_imputation[random_indices, -1].shape

In [None]:
# metric_calc(y_train_anomaly[random_indices], X_train_imputation[random_indices, -1])