# Summary

This notebook illustrates the approach of transforming temporal data into tabular data by feature extraction.
Using the library TSFresh https://tsfresh.readthedocs.io/en/latest/index.html we can extract temportal features from overlapping rolling windows.

This notebook shows the increased performance of performing AD on temporal features compared to tabular data (see sklearn.ipynb).

In [20]:
import sys
sys.path.append('../src')
import evaluation_utils, data_utils

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series

import pandas as pd

In [3]:
# Load data
X, y = data_utils.get_data('../data/6_cardio.npz')

In [27]:
# convert the matrix to a dataframe
df = pd.DataFrame(data=X)
# Tsfresh is expecting a long format to handle multivariate time series
long_df = df.reset_index().melt(id_vars='index', var_name='id', value_name='value')

In [31]:
# set sliding rolling window
rolled_df = roll_time_series(long_df, column_id='id', column_sort='index', max_timeshift=30, min_timeshift=10)

Rolling: 100%|██████████| 40/40 [00:11<00:00,  3.35it/s]


In [34]:
# extract temporal features from each rolling window
features = extract_features(rolled_df, column_id='id', column_sort='index')


Feature Extraction: 100%|██████████| 40/40 [05:55<00:00,  8.89s/it]


In [40]:
features = features.reset_index()
# drop columns with missing values
features = features.dropna(axis=1)
# the first two columns are the id and the index
cols = features.columns[2:].values
# pivot the table to have the features as columns
extracted_features = features.pivot(index='level_1', columns='level_0', values=cols)

In [62]:
# the rolling window creation is missing the last samples of each time series
y = y[extracted_features.index.values]

In [64]:
# Define the anomaly detection methods
methods = {
    "Isolation Forest": IsolationForest(contamination=0.1),
    "One-Class SVM": OneClassSVM(nu=0.1),
    "Local Outlier Factor": LocalOutlierFactor(n_neighbors=20, contamination=0.1)
}

# Apply each method
for name, method in methods.items():
    if name == "Local Outlier Factor":
        y_pred = method.fit_predict(extracted_features)
    else:
        method.fit(extracted_features)
        y_pred = method.predict(extracted_features)
    
    # Reshape the prediction values to 0 for valid, 1 for fraud. 
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1

    print(f"{name} Results:")
    print(evaluation_utils.run_evaluation(y, y_pred, do_point_adjustment=True))

Isolation Forest Results:
{'AUCROC': 0.8786715943630837, 'AUCPR': 0.615398124665011, 'F1': 0.7709447221072506, 'Precision': 0.7582417582417582, 'Recall': 0.7840909090909091, 'Adjusted AUCROC': 0.9866261398176293, 'Adjusted AUCPR': 0.8, 'Adjusted F1': 0.8888839506447186, 'Adjusted Precision': 0.8, 'Adjusted Recall': 1.0}
One-Class SVM Results:
{'AUCROC': 0.5343465045592706, 'AUCPR': 0.10304449648711944, 'F1': 0.1868348242363068, 'Precision': 0.10304449648711944, 'Recall': 1.0, 'Adjusted AUCROC': 0.5343465045592706, 'Adjusted AUCPR': 0.10304449648711944, 'Adjusted F1': 0.1868348242363068, 'Adjusted Precision': 0.10304449648711944, 'Adjusted Recall': 1.0}
Local Outlier Factor Results:
{'AUCROC': 0.5390266648245372, 'AUCPR': 0.1082726307190887, 'F1': 0.1762627893112702, 'Precision': 0.09665019220208676, 'Recall': 1.0, 'Adjusted AUCROC': 0.9537993920972645, 'Adjusted AUCPR': 0.5365853658536586, 'Adjusted F1': 0.6984081532167806, 'Adjusted Precision': 0.5365853658536586, 'Adjusted Recall': 1