In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [2]:
import numpy as np
import matplotlib.pyplot as plt                        
import torch
import pandas as pd
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define parameters

In [3]:
THRESHOLD = 1
NUM_EPOCHS = 50
LEARNING_RATE = 1e-4

In [4]:
# CNN params
CNN_w = 45
CNN_pred_window = 1
CNN_filter1_size = 128
CNN_filter2_size = 32
CNN_kernel_size = 2
CNN_stride = 1
CNN_pool_size = 2

In [5]:
# LSTM params
LSTM_w = 45
LSTM_pred_window = LSTM_w
LSTM_n_layers = 2
LSTM_hidden_dim = 256
LSTM_kernel_size = 2
LSTM_stride = 1
LSTM_pool_size = 2
LSTM_output_size = 1

# Define CNN architechture

In [6]:
#  CNN architecture
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        ## layers of a CNN
        
        self.conv1 = nn.Conv1d(1, CNN_filter1_size, CNN_kernel_size, CNN_stride, padding=0)
        
        self.conv2 = nn.Conv1d(CNN_filter1_size, CNN_filter2_size, CNN_kernel_size, CNN_stride, padding=0)

        self.maxpool = nn.MaxPool1d(CNN_pool_size)
        
        self.dim1 = int(0.5 * (0.5 * (CNN_w - 1) - 1)) * CNN_filter2_size
        
        self.lin1 = nn.Linear(self.dim1, CNN_pred_window)

        self.dropout = nn.Dropout(0.25)
    
    def forward(self, x):
        #convolution layer 1
        x = (F.relu(self.conv1(x)))
        x = self.maxpool(x)
        
        #convolution layer 2
        x = (F.relu(self.conv2(x)))
        x = self.maxpool(x)

        x = x.view(-1,self.dim1)
        
        x = self.dropout(x)
        x = self.lin1(x)

        return x

# Define LSTM architecture

In [7]:
class LSTM(nn.Module):
    def __init__(self,batch_size):
        super(LSTM, self).__init__()
        self.batch_size = batch_size
        
        self.lstm = nn.LSTM(input_size=LSTM_w, hidden_size=LSTM_hidden_dim, num_layers=LSTM_n_layers, dropout=0.5)
        self.fc = nn.Linear(LSTM_hidden_dim, LSTM_pred_window)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x, hidden):
        batch_size = x.shape[1]
        x, hidden = self.lstm(x, hidden)
        x = x.contiguous().view(-1, LSTM_hidden_dim)
        x = self.dropout(x)
        out =  (self.fc(x))
        out = out.view(batch_size, -1, LSTM_pred_window)
        out = out[:,-1]
        
        return out, hidden
    
    def init_hidden(self,size): 
        weight = next(self.parameters()).data

        hidden = (weight.new(LSTM_n_layers, size, LSTM_hidden_dim).zero_(),
                      weight.new(LSTM_n_layers, size, LSTM_hidden_dim).zero_())

        return hidden

# Define helper functions

In [8]:
def get_CNN_subsequences(data):
    X = []
    Y = []
    
    for i in range(len(data) - CNN_w - CNN_pred_window):
        X.append(data[i : i + CNN_w])
        Y.append(data[i + CNN_w : i + CNN_w + CNN_pred_window])
    
    X = np.array(X)
    Y = np.array(Y)
    
    return X, Y

In [9]:
def get_LSTM_subsequences(data, ts_data, train_percent, valid_percent):
    X = []
    Y = []
    idx = []
    
    for i in range(len(data) - LSTM_w):
        mean = np.mean(data[i : i + LSTM_w])
        std = np.std(data[i : i + LSTM_w])
        X.append((data[i : i + LSTM_w]))
        Y.append((data[i + 1 : i + LSTM_w + 1]))
        idx.append(ts_data.index[train_percent + valid_percent + i + LSTM_w])
    
    X = np.array(X)
    Y = np.array(Y)
    
    return X, Y,idx

In [10]:
def train_deepant(n_epochs, trainX, trainY, validX, validY, model, optimizer, criterion, save_path, freq=20):
    """returns trained model"""

    target_train = torch.tensor(trainY).type('torch.FloatTensor')
    data_train = torch.tensor(trainX).type('torch.FloatTensor')
    
    target_valid = torch.tensor(validY).type('torch.FloatTensor')
    data_valid = torch.tensor(validX).type('torch.FloatTensor')
    
    train_loss_min = np.Inf
    valid_loss_min = np.Inf
    last_valid_loss= 0
    
    for epoch in range(1, n_epochs+1):
        
        ############
        # training #
        ############
        model.train()

        optimizer.zero_grad()
        output = model(data_train)
        loss = criterion(output, target_train)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()
        
        ##############
        # validation #
        ##############
        model.eval()
        output_valid = model(data_valid)
        
        loss_valid = criterion(output_valid, target_valid)
        valid_loss = loss_valid.item()
        
        if(valid_loss == last_valid_loss):
            print('problem')
            
        last_valid_loss = valid_loss
        if(epoch%freq == 0):
            print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
                epoch, 
                train_loss,
                valid_loss
                ), end='\r')
            
        # save model if validation loss decreases
        if valid_loss < valid_loss_min:
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss

    return model, output

In [11]:
def forward_back_prop(lstm, optimizer, criterion, inp, target, hidden):
    hidden = tuple([each.data for each in hidden])
    lstm.zero_grad()
    output, hidden = lstm(inp, hidden)
    loss = criterion(output.squeeze(), target)
    loss.backward()
    nn.utils.clip_grad_norm_(lstm.parameters(), 5)
    optimizer.step()

    return loss.item(), hidden

def train_lstm(n_epochs, trainX,trainY, validX,validY,lstm, optimizer, batch_size, 
               size_valid, size_test, criterion, save_path, freq=20):  
    target_train = torch.tensor(trainY).type('torch.FloatTensor')
    data_train = torch.tensor(trainX).type('torch.FloatTensor')
    
    target_valid = torch.tensor(validY).type('torch.FloatTensor')
    data_valid = torch.tensor(validX).type('torch.FloatTensor')
    
    train_loss_min = np.Inf
    valid_loss_min = np.Inf
    last_valid_loss = 0

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch in range(1, n_epochs + 1):
        # Training
        hidden = lstm.init_hidden(batch_size)

        lstm.train()
        train_loss,hidden = forward_back_prop(lstm,optimizer,criterion,data_train,target_train,hidden)
        
        # Validation
        lstm.eval()
        hidden_valid = lstm.init_hidden(size_valid)
        output_valid,hidden_valid = lstm(data_valid,hidden_valid)
        
        loss_valid = criterion(output_valid.squeeze(), target_valid)
        valid_loss = loss_valid.item()
        if(valid_loss == last_valid_loss):
            print('problem')
            
        last_valid_loss = valid_loss
        if(epoch%freq == 0):
            print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
                epoch, 
                train_loss,
                valid_loss
                ), end='\r')
        
        # save model if validation loss decreases
        if valid_loss < valid_loss_min:
            torch.save(lstm.state_dict(), save_path)
            valid_loss_min = valid_loss
        
    return lstm

In [12]:
def calc_f_score(ts_data, df_out, thresh):
    positives = ts_data.loc[df_out.index].loc[ts_data.is_anomaly == 1].index
    negatives = ts_data.loc[df_out.index].loc[ts_data.is_anomaly == 0].index

    tp = []
    fn = []
    fp = []
    tn = []
    for p in positives:
        if p in thresh.index:
            tp.append(p)
        else:
            fn.append(p)

    for n in negatives:
        if n in thresh.index:
            fp.append(n)
        else:
            tn.append(n)
            
    recall = len(tp) / (len(tp) + len(fn))
    
    if recall != 0:
        precision = len(tp) / (len(tp) + len(fp))
        F_score = 2 * recall * precision / (recall + precision)
    else:
        F_score = 0
    
    return F_score

In [13]:
# use CNN to fit model, predict anomalies and calc score
def calc_deepant_performance(filename):
    # load dataset from file
    ts_data = pd.read_csv(filename, index_col = 0)

    # separate test and train
    train_percent = int(0.3 * len(ts_data))
    valid_percent = int(0.1 * len(ts_data))
    test_percent = int(0.6 * len(ts_data))

    train_data = list(ts_data.iloc[:train_percent,0])
    valid_data = list(ts_data.iloc[train_percent:train_percent + valid_percent,0])
    test_data = list(ts_data.iloc[train_percent + valid_percent:,0])

    trainX, trainY = get_CNN_subsequences(train_data)
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    
    validX, validY = get_CNN_subsequences(valid_data)    
    validX = np.reshape(validX, (validX.shape[0], 1, validX.shape[1]))
    
    testX, testY = get_CNN_subsequences(test_data)
    testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

    # specify and fit model
    model = Net()

    criterion_scratch = nn.L1Loss()
    optimizer_scratch = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

    # train model
    model, out = train_deepant(NUM_EPOCHS, trainX, trainY, validX, validY, model, optimizer_scratch, 
                             criterion_scratch, 'model.pt', freq=10)

    # load best saved model
    model.load_state_dict(torch.load('model.pt'));

    # predict value
    test_tensor =  torch.tensor(testX).type('torch.FloatTensor')

    model.eval()

    out = model(test_tensor)
    out = out.detach().numpy()

    df_out = pd.DataFrame()
    df_out['pred'] = out[:, 0]
    df_out['actual'] = testY[:, 0]

    # predict anomalies
    df_out['error'] = np.abs(df_out['pred'] - df_out['actual'])
    df_out['error_n'] = (df_out['error'] - df_out['error'].mean()) / df_out['error'].std()
    df_out.index = ts_data.index[train_percent + valid_percent + CNN_w + CNN_pred_window - 1 : -1]

    thresh = df_out.loc[df_out['error_n'].abs() > THRESHOLD]

    # calc performance score
    f_score = calc_f_score(ts_data, df_out, thresh)
    
    return f_score

In [14]:
# use LSTM model to fit model, predict anomalies and calc score
def calc_lstm_performance(filename):
    ts_data = pd.read_csv(filename, index_col = 0)

    a = list(ts_data.columns)
    a[1] = 'is_anomaly'
    ts_data.columns = a

    train_percent = int(0.3*len(ts_data))
    valid_percent = int(0.1*len(ts_data))
    test_percent = int(0.6*len(ts_data))

    train_data = list(ts_data.iloc[:train_percent,0])
    valid_data = list(ts_data.iloc[train_percent:train_percent+valid_percent,0])
    test_data = list(ts_data.iloc[train_percent+valid_percent:,0])

    trainX,trainY,_ = get_LSTM_subsequences(train_data, ts_data, train_percent, valid_percent)
    trainX = np.reshape(trainX, (1, trainX.shape[0], trainX.shape[1]))

    validX,validY,_ = get_LSTM_subsequences(valid_data, ts_data, train_percent, valid_percent)
    validX = np.reshape(validX, (1, validX.shape[0], validX.shape[1]))

    testX,testY,test_idx = get_LSTM_subsequences(test_data, ts_data, train_percent, valid_percent)
    testX = np.reshape(testX,(1,testX.shape[0],testX.shape[1]))

    batch_size = trainX.shape[1]

    size_valid = validX.shape[1]
    size_test = testX.shape[1]

    lstm = LSTM(batch_size)

    criterion = nn.L1Loss()
    optimizer = optim.Adam(lstm.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

    lstm = train_lstm(NUM_EPOCHS, trainX,trainY, validX,validY, lstm, optimizer, batch_size,
                      size_valid, size_test, criterion, 'lstm.pt', freq=10)

    lstm.load_state_dict(torch.load('lstm.pt'))

    test_tensor =  torch.tensor(testX).type('torch.FloatTensor')
    lstm.eval()

    hidden = lstm.init_hidden(size_test)
    out,hidden = lstm(test_tensor,hidden)
    out = out.detach().numpy()

    df_out = pd.DataFrame()
    df_out['pred'] = out[:,-1]
    df_out['actual'] = testY[:,-1]

    df_out['error'] =np.abs(df_out['pred'] - df_out['actual'])
    df_out['error_n'] = (df_out['error'] - df_out['error'].mean())/df_out['error'].std()

    df_out.index = test_idx

    thresh = df_out.loc[df_out['error_n'].abs() > THRESHOLD]
    thresh['is_anomaly'] = ts_data.loc[thresh.index,'is_anomaly']

    # calc performance score
    f_score = calc_f_score(ts_data, df_out, thresh)
    
    return f_score

# Run for one dataset

In [15]:
yahoo_folder = 'ydata-labeled-time-series-anomalies-v1_0'
synthetic_folder = 'synthetic-labeled-data'

In [16]:
def get_files_in_folder(folder_name):
    ret_val = os.listdir(folder_name)
    ret_val = [folder_name + '/' + x for x in ret_val if 'all' not in x]
    ret_val = [x for x in ret_val if 'csv' in x]
    
    return ret_val

In [17]:
file_list = []
file_list += get_files_in_folder(yahoo_folder + '/A1Benchmark')
file_list += get_files_in_folder(yahoo_folder + '/A2Benchmark')
file_list += get_files_in_folder(yahoo_folder + '/A3Benchmark')
file_list += get_files_in_folder(yahoo_folder + '/A4Benchmark')
file_list += get_files_in_folder(synthetic_folder)

file_list.sort()

file_list[-10:]

['ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS90.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS91.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS92.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS93.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS94.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS95.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS96.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS97.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS98.csv',
 'ydata-labeled-time-series-anomalies-v1_0/A4Benchmark/A4Benchmark-TS99.csv']

In [18]:
output_df = pd.DataFrame(columns=['filename', 'deepant_fscore', 'lstm_fscore'])

for file in file_list[5:10]:
    print(file)
    deepant_fscore = calc_deepant_performance(file)
    lstm_fscore = calc_lstm_performance(file)
    
    output_df = output_df.append({'filename': file,
                                  'deepant_fscore': deepant_fscore,
                                  'lstm_fscore': lstm_fscore}, ignore_index=True)

synthetic-labeled-data/ma.csv
Training for 50 epoch(s)...2.616051 	Validation Loss: 35.249767
synthetic-labeled-data/mmm.csv71584 	Validation Loss: 57.579155


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training for 50 epoch(s)...6.502653 	Validation Loss: 19.113787
synthetic-labeled-data/msft.csv2769 	Validation Loss: 78.248215


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training for 50 epoch(s)....535954 	Validation Loss: 1.87606875
synthetic-labeled-data/nke.csv66077 	Validation Loss: 25.514366


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training for 50 epoch(s)...0.513727 	Validation Loss: 12.106659
synthetic-labeled-data/t.csv.053833 	Validation Loss: 13.843182


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training for 50 epoch(s)....384853 	Validation Loss: 1.20678209
Epoch: 50 	Training Loss: 39.404255 	Validation Loss: 41.065113

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
output_df

Unnamed: 0,filename,deepant_fscore,lstm_fscore
0,synthetic-labeled-data/ma.csv,0.035191,0.033613
1,synthetic-labeled-data/mmm.csv,0.062893,0.037037
2,synthetic-labeled-data/msft.csv,0.040678,0.025806
3,synthetic-labeled-data/nke.csv,0.10101,0.093458
4,synthetic-labeled-data/t.csv,0.063492,0.012346
