In [1]:
import os
# Set before the numeric libraries below are imported.
# NOTE(review): presumably a workaround for the "OMP: Error #15 ... already initialized"
# abort raised when several copies of the OpenMP runtime are loaded into one process.
# It is generally described as an unsafe workaround -- confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
import numpy as np
import matplotlib.pyplot as plt                        
import torch
import pandas as pd
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define parameters

In [3]:
# Cut-off on the absolute standardized prediction error ('error_n'): test points
# whose |error_n| exceeds this value are reported as anomalies.
THRESHOLD = 2
# Number of training epochs passed to train_valid as n_epochs.
NUM_EPOCHS = 50

In [4]:
# CNN params
# Length of the input window (number of past samples fed to the network).
w = 45
# Number of future samples predicted per window (output size of the linear layer).
pred_window = 1
# Output channels of the first convolution.
filter1_size = 128
# Output channels of the second convolution.
filter2_size = 32
# Kernel width shared by both convolutions.
kernel_size = 2
# Stride shared by both convolutions.
stride = 1
# Window size of the max-pooling layer applied after each convolution.
pool_size = 2

# Define CNN architecture

In [5]:
#  CNN architecture
class Net(nn.Module):
    """Small 1-D CNN that forecasts the next value(s) of a univariate window.

    Layer stack: Conv1d -> ReLU -> MaxPool1d -> Conv1d -> ReLU -> MaxPool1d
    -> flatten -> Dropout(0.25) -> Linear.

    Every constructor argument is optional. An argument left as ``None`` falls
    back to the notebook-level hyperparameter with the same role (``w``,
    ``filter1_size``, ``filter2_size``, ``kernel_size``, ``stride``,
    ``pool_size``, ``pred_window``), so ``Net()`` builds the same network as
    before.

    Args:
        window: length of each input sequence.
        conv1_channels: output channels of the first convolution.
        conv2_channels: output channels of the second convolution.
        kernel: kernel width of both convolutions.
        conv_stride: stride of both convolutions.
        pool: window (and stride) of the max-pooling layer.
        horizon: number of values predicted per input sequence.

    Raises:
        ValueError: if ``window`` is too short to survive both conv/pool stages.
    """

    def __init__(self, window=None, conv1_channels=None, conv2_channels=None,
                 kernel=None, conv_stride=None, pool=None, horizon=None):
        super(Net, self).__init__()

        # Resolve defaults at call time so the notebook globals stay the single
        # source of truth when no explicit value is given.
        window = w if window is None else window
        conv1_channels = filter1_size if conv1_channels is None else conv1_channels
        conv2_channels = filter2_size if conv2_channels is None else conv2_channels
        kernel = kernel_size if kernel is None else kernel
        conv_stride = stride if conv_stride is None else conv_stride
        pool = pool_size if pool is None else pool
        horizon = pred_window if horizon is None else horizon

        ## layers of a CNN
        self.conv1 = nn.Conv1d(1, conv1_channels, kernel, conv_stride, padding=0)
        self.conv2 = nn.Conv1d(conv1_channels, conv2_channels, kernel, conv_stride, padding=0)
        self.maxpool = nn.MaxPool1d(pool)

        # Track the sequence length through both conv+pool stages instead of
        # relying on a closed form that only holds for kernel=2, pool=2, stride=1.
        length = window
        for _ in range(2):
            length = (length - kernel) // conv_stride + 1  # Conv1d, padding=0, dilation=1
            length = length // pool                         # MaxPool1d, stride=pool
            if length < 1:
                raise ValueError(
                    'window of length {} is too short for kernel={}, stride={}, pool={}'.format(
                        window, kernel, conv_stride, pool))

        # Number of features entering the linear layer.
        self.dim1 = length * conv2_channels

        self.lin1 = nn.Linear(self.dim1, horizon)

        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Map a ``(batch, 1, window)`` tensor to ``(batch, horizon)`` predictions."""
        # convolution layer 1
        x = self.maxpool(F.relu(self.conv1(x)))

        # convolution layer 2
        x = self.maxpool(F.relu(self.conv2(x)))

        # Keep the batch dimension fixed: an input of unexpected length then fails
        # loudly in the linear layer instead of being reshaped into a wrong batch size.
        x = x.view(x.size(0), -1)

        x = self.dropout(x)
        return self.lin1(x)

# Define helper functions

In [6]:
def get_subsequences(data, window=None, horizon=None):
    """Cut a 1-D series into (input window, target) pairs for the CNN.

    Sample ``i`` uses ``data[i : i + window]`` as input and
    ``data[i + window : i + window + horizon]`` as target.

    Exactly ``len(data) - window - horizon`` samples are produced, i.e. the very
    last window that would still fit is not emitted. This count is kept on
    purpose: the index slice in ``calc_model_performance`` is sized to it.

    Args:
        data: sliceable 1-D sequence of numbers (list or array).
        window: input length; defaults to the notebook-level ``w``.
        horizon: target length; defaults to the notebook-level ``pred_window``.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(n, 1, window)`` (a singleton channel
        axis is inserted for Conv1d) and ``Y`` has shape ``(n, horizon)``.

    Raises:
        ValueError: if ``data`` is too short to yield a single sample.
    """
    window = w if window is None else window
    horizon = pred_window if horizon is None else horizon

    n_samples = len(data) - window - horizon
    if n_samples <= 0:
        # Previously this surfaced as an opaque IndexError on X.shape[1].
        raise ValueError(
            'need more than {} points to build a subsequence, got {}'.format(
                window + horizon, len(data)))

    X = np.array([data[i : i + window] for i in range(n_samples)])
    Y = np.array([data[i + window : i + window + horizon] for i in range(n_samples)])

    # Insert the channel axis expected by Conv1d: (n, window) -> (n, 1, window).
    X = np.reshape(X, (X.shape[0], 1, X.shape[1]))

    return X, Y

In [7]:
def train_valid(n_epochs, trainX, trainY, validX, validY, model, optimizer, criterion, save_path, freq=20):
    """Train ``model`` with full-batch updates and checkpoint the best epoch.

    Each epoch performs one optimizer step on the whole training set and then
    evaluates on the whole validation set. Whenever the validation loss reaches
    a new minimum, ``model.state_dict()`` is written to ``save_path``.

    Args:
        n_epochs: number of epochs to run.
        trainX, trainY: training inputs / targets (array-like, cast to float32).
        validX, validY: validation inputs / targets (array-like, cast to float32).
        model: the ``nn.Module`` to train (updated in place).
        optimizer: optimizer built over ``model.parameters()``.
        criterion: loss callable ``criterion(prediction, target)``.
        save_path: file that receives the best state dict.
        freq: print a progress line every ``freq`` epochs.

    Returns:
        ``(model, output)``: the model with its *last-epoch* weights (the best
        weights are only in ``save_path``) and the training-set prediction of the
        final epoch, or ``None`` if no epoch was run.
    """
    target_train = torch.tensor(trainY, dtype=torch.float32)
    data_train = torch.tensor(trainX, dtype=torch.float32)

    target_valid = torch.tensor(validY, dtype=torch.float32)
    data_valid = torch.tensor(validX, dtype=torch.float32)

    # float('inf') rather than np.Inf: that alias was removed in NumPy 2.0.
    valid_loss_min = float('inf')
    last_valid_loss = 0
    # Defined up front so that n_epochs < 1 returns cleanly instead of raising.
    output = None

    for epoch in range(1, n_epochs + 1):

        ############
        # training #
        ############
        model.train()

        optimizer.zero_grad()
        output = model(data_train)
        loss = criterion(output, target_train)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()

        ##############
        # validation #
        ##############
        model.eval()
        # No gradients are needed here; skipping the graph saves memory and time.
        with torch.no_grad():
            output_valid = model(data_valid)
            valid_loss = criterion(output_valid, target_valid).item()

        # An exactly repeated validation loss hints that the weights did not move.
        if valid_loss == last_valid_loss:
            print('problem')

        last_valid_loss = valid_loss
        if epoch % freq == 0:
            print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
                epoch,
                train_loss,
                valid_loss
                ), end='\r')

        # save model if validation loss decreases
        if valid_loss < valid_loss_min:
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss

    return model, output

In [8]:
def calc_f_score(ts_data, df_out, thresh):
    """Compute the F1 score of the detected anomalies against the labels.

    Args:
        ts_data: DataFrame with an ``is_anomaly`` column (1 = anomaly, 0 = normal)
            whose index contains every label of ``df_out.index``.
        df_out: DataFrame whose index lists the evaluated points.
        thresh: DataFrame whose index lists the points flagged as anomalies.

    Returns:
        The F1 score as a float. Returns 0.0 when it is undefined (no labelled
        anomalies, nothing flagged, or no true positives) instead of raising
        ``ZeroDivisionError``.
    """
    # Ground-truth labels restricted to the evaluated points.
    labels = ts_data.loc[df_out.index, 'is_anomaly']
    positives = labels[labels == 1].index
    negatives = labels[labels == 0].index
    flagged = thresh.index

    tp = sum(1 for p in positives if p in flagged)
    fn = len(positives) - tp
    fp = sum(1 for n in negatives if n in flagged)

    recall = tp / (tp + fn) if (tp + fn) else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    if recall + precision == 0:
        return 0.0

    F_score = 2 * recall * precision / (recall + precision)

    return F_score

In [9]:
# main function to fit model, predict anomalies and calc score
def calc_model_performance(filename, save_path='model.pt'):
    """Fit the CNN on one labelled series and score its anomaly detection.

    The CSV is read with its first column as index; the first remaining column is
    the signal and an ``is_anomaly`` column supplies the labels. The series is
    split chronologically into 30% train / 10% validation / 60% test. A fresh
    ``Net`` is trained with L1 loss, the best checkpoint is reloaded, and test
    points whose standardized absolute prediction error exceeds ``THRESHOLD`` in
    magnitude are reported as anomalies.

    Args:
        filename: path to the CSV file.
        save_path: file used for the best-model checkpoint (overwritten).

    Returns:
        The F-score of the flagged test points, as computed by ``calc_f_score``.
    """
    # load dataset from file
    ts_data = pd.read_csv(filename, index_col=0)

    # separate test and train (test takes everything after train + validation)
    n_train = int(0.3 * len(ts_data))
    n_valid = int(0.1 * len(ts_data))
    test_start = n_train + n_valid

    signal = ts_data.iloc[:, 0]
    train_data = list(signal.iloc[:n_train])
    valid_data = list(signal.iloc[n_train:test_start])
    test_data = list(signal.iloc[test_start:])

    trainX, trainY = get_subsequences(train_data)
    validX, validY = get_subsequences(valid_data)
    testX, testY = get_subsequences(test_data)

    # specify and fit model
    model = Net()

    criterion_scratch = nn.L1Loss()
    optimizer_scratch = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-6)

    # train model (the returned training output is not needed here)
    model, _ = train_valid(NUM_EPOCHS, trainX, trainY, validX, validY, model, optimizer_scratch,
                           criterion_scratch, save_path, freq=10)

    # load best saved model
    model.load_state_dict(torch.load(save_path))

    # predict value
    test_tensor = torch.tensor(testX).type('torch.FloatTensor')

    model.eval()
    with torch.no_grad():
        pred = model(test_tensor).numpy()

    # The first target of test sample i sits at position test_start + w + i of the
    # full series. Anchoring the slice there (and sizing it by the number of
    # predictions) keeps rows and timestamps aligned for any pred_window.
    first_target = test_start + w
    df_out = pd.DataFrame(
        {'pred': pred[:, 0], 'actual': testY[:, 0]},
        index=ts_data.index[first_target : first_target + len(pred)],
    )

    # predict anomalies
    df_out['error'] = np.abs(df_out['pred'] - df_out['actual'])
    df_out['error_n'] = (df_out['error'] - df_out['error'].mean()) / df_out['error'].std()

    thresh = df_out.loc[df_out['error_n'].abs() > THRESHOLD]

    # calc performance score
    f_score = calc_f_score(ts_data, df_out, thresh)

    return f_score

# Run for one dataset

In [10]:
# Dataset root folders, relative to the notebook's working directory.
# NOTE(review): presumably the Yahoo labelled anomaly benchmark -- confirm provenance.
yahoo_folder = 'ydata-labeled-time-series-anomalies-v1_0'
# Not referenced by any cell shown in this notebook.
synthetic_folder = 'synthetic-labeled-data'

In [11]:
# Train on a single series and display the resulting F-score as the cell output.
# No random seed is set, so the score can differ between runs.
calc_model_performance(yahoo_folder + '/A1Benchmark/real_60.csv')

Epoch: 50 	Training Loss: 0.783861 	Validation Loss: 0.759618

0.7586206896551725