In [None]:
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
sys.path.insert(1, '/home/jupyter/ds_toolkit')
from ds_toolkit import tools
from pylab import rcParams
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
from pandas.plotting import register_matplotlib_converters

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

register_matplotlib_converters()
sns.set(style='whitegrid',palette='muted',font_scale=1.5)
rcParams['figure.figsize'] = 22, 10

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
# Load the time series; `usage_date` is parsed as dates and used as the index.
dataset = pd.read_csv(
    'timeseries_data.csv',
    index_col='usage_date',
    parse_dates=['usage_date'],
)

In [None]:
# Print the index range, column dtypes and non-null counts of the loaded frame.
dataset.info()

In [None]:
# Parameter of interest (POI): the column of `dataset` that is plotted, scaled
# and modeled in the cells below.
# Alternatives (swap in to analyze a different column):
# 'acct_ttl_unit_hours', 'z10', 'z15'.
poi = 'z5'

In [None]:
# Plot the (not yet scaled) parameter of interest against the date index.
plt.plot(dataset.index,dataset[poi])

## split the data

In [None]:
# Chronological 75/25 split: the first 75% of the rows are used for training
# and the remaining rows are held out for testing (no shuffling).
train_size = int(len(dataset) * 0.75)
test_size = len(dataset) - train_size

# .copy() gives each split its own data. Without it `train` / `test` are slices
# of `dataset`, and the column assignments made when scaling them trigger
# pandas' SettingWithCopyWarning.
train = dataset.iloc[:train_size].copy()
test = dataset.iloc[train_size:].copy()
print(train.shape, test.shape)

## scale the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so the held-out test rows do not
# contribute to the learned mean / standard deviation.
scaler = StandardScaler().fit(train[[poi]])

In [None]:
# Replace the parameter of interest with its standardized values.
# DataFrame.assign builds a new frame instead of writing into `train` / `test`
# in place, which avoids SettingWithCopyWarning when they are slices of
# `dataset`. scaler.transform returns an (n_rows, 1) array, hence the ravel().
# NOTE: re-running this cell would scale already-scaled values; re-run the
# split cell first.
train = train.assign(**{poi: scaler.transform(train[[poi]]).ravel()})
test = test.assign(**{poi: scaler.transform(test[[poi]]).ravel()})

In [None]:
def create_dataset(X, y, time_steps=1):
    """Cut a series into overlapping windows paired with the value that follows.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X``; its target is
    row ``i + time_steps`` of ``y``.

    Parameters
    ----------
    X : DataFrame or array-like, shape (n_rows, n_features)
        Feature rows in chronological order.
    y : Series or array-like, at least as long as ``X``
        Target values aligned row-by-row with ``X``.
    time_steps : int, default 1
        Number of consecutive rows per window. Must not be negative.

    Returns
    -------
    (Xs, ys) : tuple of np.ndarray
        ``Xs`` has shape (n_windows, time_steps, n_features) and ``ys`` has
        shape (n_windows,), where ``n_windows = max(len(X) - time_steps, 0)``.

    Raises
    ------
    ValueError
        If ``time_steps`` is negative.
    IndexError
        If ``y`` is too short to supply a target for every window.
    """
    if time_steps < 0:
        raise ValueError(f"time_steps must be >= 0, got {time_steps}")

    # Convert once instead of slicing the pandas objects on every iteration.
    features = np.asarray(X)
    targets = np.asarray(y)

    n_windows = max(len(features) - time_steps, 0)
    if n_windows == 0:
        # Keep the rank of a populated result so that callers reading
        # Xs.shape[1] / Xs.shape[2] still work when the input is too short.
        empty_windows = np.empty(
            (0, time_steps) + features.shape[1:], dtype=features.dtype
        )
        return empty_windows, np.empty((0,), dtype=targets.dtype)

    windows = np.stack(
        [features[start:start + time_steps] for start in range(n_windows)]
    )
    next_values = np.array(
        [targets[start + time_steps] for start in range(n_windows)]
    )
    return windows, next_values

In [None]:
# Number of consecutive observations in each model input window.
TIME_STEPS = 30

# Build (window, next value) pairs separately for the train and test splits.
X_train, y_train = create_dataset(
    X=train[[poi]],
    y=train[poi],
    time_steps=TIME_STEPS,
)
X_test, y_test = create_dataset(
    X=test[[poi]],
    y=test[poi],
    time_steps=TIME_STEPS,
)

In [None]:
# Sanity check: one target value per training window.
y_train.shape

# LSTM Autoencoder

In [None]:
# Idea: use an LSTM to fit the time series; when the error |pred - actual| exceeds a threshold, label the point as an anomaly.

In [None]:
# Encoder-decoder LSTM. The first LSTM condenses each input window into a
# single 64-unit vector, RepeatVector copies that vector once per time step,
# and the second LSTM plus a per-step Dense layer expand it back into a
# sequence with the same (time steps, features) shape as the input.
model = keras.Sequential([
    # encoder
    keras.layers.LSTM(
        units=64,
        input_shape=(X_train.shape[1], X_train.shape[2]),
    ),
    keras.layers.Dropout(rate=0.1),  # regularization
    # bridge: one copy of the encoding per output time step
    keras.layers.RepeatVector(n=X_train.shape[1]),
    # decoder
    keras.layers.LSTM(units=64, return_sequences=True),
    keras.layers.Dropout(rate=0.1),  # regularization
    # the same Dense layer is applied at every time step
    keras.layers.TimeDistributed(
        keras.layers.Dense(units=X_train.shape[2])
    ),
])

model.compile(optimizer='adam', loss='mae')
model.summary()

In [None]:
# Train the autoencoder to reproduce its own input. The target is X_train,
# whose (samples, time steps, features) shape matches the model output and the
# reconstruction error |X_pred - X| used for scoring afterwards. The next-step
# values in y_train (one scalar per window) are not a reconstruction target.
# shuffle=False keeps the windows in chronological order; validation_split
# holds out the last 10% of the training windows.
# NOTE(review): changing the training target changes the loss distribution, so
# re-check the anomaly THRESHOLD against the training-loss histogram.
history = model.fit(
    X_train, X_train,
    epochs=16,
    batch_size=32,
    validation_split=0.1,
    shuffle=False,
)

plt.plot(history.history['loss'], label='train')
# val_loss comes from the validation_split hold-out, not from the test set.
plt.plot(history.history['val_loss'], label='validation')
plt.legend();

In [None]:
# Run the model on the training windows and average the absolute error over
# the time-step axis: one MAE value per window and feature.
X_train_pred = model.predict(X_train)
train_mae_loss = np.abs(X_train_pred - X_train).mean(axis=1)

In [None]:
# Size of the last axis of the training windows (features per time step).
X_train.shape[2]

In [None]:
# Distribution of the per-window training error (histogram with KDE overlay).
sns.distplot(
    train_mae_loss,
    kde=True,
    bins=60,
)

In [None]:
### Do the same for the test set

In [None]:
# Same error computation as for the training set, on the held-out windows.
X_test_pred = model.predict(X_test)
test_mae_loss = np.abs(X_test_pred - X_test).mean(axis=1)

In [None]:
# Preview the first few predicted windows. This reuses X_test_pred rather than
# running inference a second time and dumping the entire prediction array.
X_test_pred[:5]

In [None]:
# Model rule: a window is anomalous when its reconstruction MAE exceeds THRESHOLD.
THRESHOLD = 0.25
# Benchmark rule: a point is anomalous when the absolute value of the
# parameter of interest, in original units, exceeds BENCHMARK.
BENCHMARK = 2.

# The first TIME_STEPS test rows have no complete window in front of them and
# therefore no score, so the score frame starts at row TIME_STEPS.
test_score_df = pd.DataFrame(index=test.iloc[TIME_STEPS:].index)
# test_mae_loss has one column per feature; flatten the (n, 1) array to 1-D.
test_score_df['loss'] = np.ravel(test_mae_loss)
test_score_df['threshold'] = THRESHOLD
test_score_df['benchmark'] = BENCHMARK
test_score_df['anomaly'] = test_score_df['loss'] > test_score_df['threshold']
test_score_df[poi] = test.iloc[TIME_STEPS:][poi]

# scikit-learn transformers expect 2-D input: pass a single-column frame and
# flatten the (n, 1) result before comparing it with the benchmark.
poi_original_units = scaler.inverse_transform(test_score_df[[poi]]).ravel()
test_score_df['anomaly_benchmark'] = np.abs(poi_original_units) > BENCHMARK

test_score_df['acct_ttl_unit_hours'] = test.iloc[TIME_STEPS:]['acct_ttl_unit_hours']

In [None]:
# Show both shapes as one tuple: a cell only displays its last expression, so
# two separate expression lines would silently drop the first one.
test_mae_loss.shape, X_test_pred.shape

In [None]:
# Per-window test error over time against the fixed anomaly threshold.
# x / y are passed to seaborn by keyword: positional data vectors were
# deprecated in seaborn 0.11 and are rejected from 0.12 on.
plt.plot(test_score_df.index, test_score_df['loss'], label='loss')
sns.scatterplot(
    x=test_score_df.index,
    y=test_score_df['loss'],
    color=sns.color_palette()[4],
    s=52,
    label='data',
)
plt.plot(test_score_df.index, test_score_df['threshold'], label='threshold')
plt.xticks(rotation=25)
plt.legend();


In [None]:
# Remove pandas' row / column display limits for the rest of the notebook.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
test_score_df.shape

In [None]:
# Subsets of the score frame: rows flagged by the model (loss above the
# threshold) and rows flagged by the benchmark rule.
anomalies = test_score_df.loc[test_score_df['anomaly']]
benchmarks = test_score_df.loc[test_score_df['anomaly_benchmark']]

In [None]:
# Parameter of interest over the scored test period, in original units, with
# the model-flagged anomalies highlighted.
# - scaler.inverse_transform is given a 2-D single-column frame (scikit-learn
#   transformers expect 2-D input) and its (n, 1) result is flattened.
# - seaborn receives x / y by keyword; positional data vectors are rejected
#   from seaborn 0.12 on.
scored_test = test.iloc[TIME_STEPS:]
poi_actual = scaler.inverse_transform(scored_test[[poi]]).ravel()

plt.plot(scored_test.index, poi_actual, label=poi)

sns.scatterplot(
    x=scored_test.index,
    y=poi_actual,
    color=sns.color_palette()[4],
    s=52,
    label='data',
)

# Skip the overlay when nothing was flagged: scikit-learn refuses to transform
# an array with zero rows.
if len(anomalies) > 0:
    sns.scatterplot(
        x=anomalies.index,
        y=scaler.inverse_transform(anomalies[[poi]]).ravel(),
        color=sns.color_palette()[3],
        s=52,
        label='anomaly',
    )

plt.xticks(rotation=25)
plt.legend();

In [None]:
# Account unit hours over the scored test period, with the model-flagged
# anomalies highlighted.
UNIT_HOURS_COL = 'acct_ttl_unit_hours'


def _in_original_units(frame, column):
    """Return `frame[column]` as a flat array in the column's original units.

    `scaler` was fit on, and applied to, the parameter of interest only. A
    column is therefore inverse-transformed only when it is the parameter of
    interest; any other column was never scaled and is returned unchanged.
    """
    values = frame[[column]]  # 2-D, single column, as scikit-learn expects
    if column == poi and len(values) > 0:
        return scaler.inverse_transform(values).ravel()
    return values.values.ravel()


scored_test = test.iloc[TIME_STEPS:]
unit_hours_actual = _in_original_units(scored_test, UNIT_HOURS_COL)

plt.plot(scored_test.index, unit_hours_actual, label=UNIT_HOURS_COL)

# x / y go to seaborn by keyword; positional data vectors are rejected from
# seaborn 0.12 on.
sns.scatterplot(
    x=scored_test.index,
    y=unit_hours_actual,
    color=sns.color_palette()[1],
    s=92,
    label='actual',
)

if len(anomalies) > 0:
    sns.scatterplot(
        x=anomalies.index,
        y=_in_original_units(anomalies, UNIT_HOURS_COL),
        color=sns.color_palette()[2],
        s=92,
        label='anomaly',
    )

# Optional overlay of the rows flagged by the benchmark rule:
# sns.scatterplot(
#     x=benchmarks.index,
#     y=_in_original_units(benchmarks, UNIT_HOURS_COL),
#     color=sns.color_palette()[3],
#     s=92,
#     label='benchmark',
# )

plt.xticks(rotation=25)
plt.legend();