<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Training-on-A1-BenchMark-data" data-toc-modified-id="Training-on-A1-BenchMark-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Training on A1 BenchMark data</a></span></li></ul></div>

In [1]:
import os

# Set before the numeric libraries are imported in the next cell.
# NOTE(review): presumably a workaround for a duplicate-OpenMP-runtime abort — confirm it is still needed.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [2]:
import numpy as np
import matplotlib.pyplot as plt                        
import torch
import pandas as pd
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define parameters

In [3]:
# Notebook-wide settings.
# NOTE(review): THRESHOLD and SYNTHETIC_FOLDER are not referenced by the cells shown here — confirm they are still needed.
THRESHOLD = 0.5
YAHOO_FOLDER = 'ydata-labeled-time-series-anomalies-v1_0'
SYNTHETIC_FOLDER = 'synthetic-labeled-data'

# Seed every random number generator used below so that weight initialisation
# and dropout masks repeat across "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Training on A1 BenchMark data

In [4]:
# Read one series of the A1 benchmark; the first CSV column is used as the row index.
a1_csv_path = f'{YAHOO_FOLDER}/A1Benchmark/real_60.csv'
ts_data = pd.read_csv(a1_csv_path, index_col=0)
ts_data

Unnamed: 0_level_0,value,is_anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.265278,0
2,1.100833,0
3,1.147778,0
4,1.053889,0
5,1.051944,0
...,...,...
1457,2.680556,1
1458,3.063889,1
1459,2.462778,1
1460,1.616667,0


In [5]:
# Number of rows per label (0 = normal, 1 = anomaly).
ts_data.groupby('is_anomaly')[['value']].count()

Unnamed: 0_level_0,value
is_anomaly,Unnamed: 1_level_1
0,1445
1,16


In [6]:
# Chronological split: first 30% for training, next 10% for validation, the rest for testing.
# Despite their names, the *_percent variables hold row counts.
n_rows = len(ts_data)
train_percent = int(0.3 * n_rows)
valid_percent = int(0.1 * n_rows)
test_percent = int(0.6 * n_rows)

# Only the first column (the observed values) is modelled.
valid_end = train_percent + valid_percent
train_data = ts_data.iloc[:train_percent, 0].tolist()
valid_data = ts_data.iloc[train_percent:valid_end, 0].tolist()
test_data = ts_data.iloc[valid_end:, 0].tolist()

In [7]:
# Windowing: each sample is `w` consecutive values; the `pred_window` values after it are the target.
w, pred_window = 45, 1

# Convolution stack: output channels of the two Conv1d layers, and the kernel/stride they share.
filter1_size, filter2_size = 128, 32
kernel_size, stride = 2, 1

# Window size of the max-pooling applied after each convolution.
pool_size = 2

In [8]:
def get_subsequences(data, window=None, horizon=None):
    """Slice a series into sliding input windows and the values that follow them.

    Args:
        data: Sequence supporting ``len`` and slicing (e.g. a list of floats).
        window: Length of each input window. Defaults to the notebook-level ``w``.
        horizon: Number of values directly after each window used as the target.
            Defaults to the notebook-level ``pred_window``.

    Returns:
        A pair ``(X, Y)`` of NumPy arrays with shapes ``(n, window)`` and
        ``(n, horizon)``, where ``n = max(len(data) - window - horizon, 0)``.
        Row ``i`` of ``X`` is ``data[i:i + window]`` and row ``i`` of ``Y`` is
        the ``horizon`` values immediately after it.
    """
    # Resolve the defaults lazily so explicit arguments never touch the globals.
    window = w if window is None else window
    horizon = pred_window if horizon is None else horizon

    # NOTE(review): this count leaves out the final complete window (one more
    # start position would still fit). It is kept unchanged so the number of
    # samples, and any row/label alignment built on it, stays the same.
    n_samples = max(len(data) - window - horizon, 0)
    if n_samples == 0:
        # Keep the 2-D layout so callers can still read shape[1] on a short series.
        return np.empty((0, window)), np.empty((0, horizon))

    X = [data[start:start + window] for start in range(n_samples)]
    Y = [data[start + window:start + window + horizon] for start in range(n_samples)]
    return np.array(X), np.array(Y)

def _add_channel_axis(windows):
    """Reshape (n_samples, length) windows to (n_samples, 1, length), the layout Conv1d expects."""
    return windows.reshape(windows.shape[0], 1, windows.shape[1])


# Window each split separately, then insert the single input channel.
trainX, trainY = get_subsequences(train_data)
validX, validY = get_subsequences(valid_data)
testX, testY = get_subsequences(test_data)

trainX = _add_channel_axis(trainX)
validX = _add_channel_axis(validX)
testX = _add_channel_axis(testX)


In [9]:
#  CNN architecture
class Net(nn.Module):
    """Two-layer 1-D CNN mapping a window of past values to the next value(s).

    Layout: Conv1d -> ReLU -> MaxPool1d -> Conv1d -> ReLU -> MaxPool1d ->
    flatten -> Dropout(0.25) -> Linear.

    Args:
        window: Length of the input sequence. Defaults to the notebook-level ``w``.
        horizon: Number of outputs of the final linear layer. Defaults to ``pred_window``.
        filters: ``(first, second)`` output-channel counts of the two convolutions.
            Defaults to ``(filter1_size, filter2_size)``.
        kernel: Kernel size shared by both convolutions. Defaults to ``kernel_size``.
        conv_stride: Stride shared by both convolutions. Defaults to ``stride``.
        pool: Window of the max-pooling layer. Defaults to ``pool_size``.

    Raises:
        ValueError: If the window is too short to survive both conv/pool stages.
    """

    def __init__(self, window=None, horizon=None, filters=None,
                 kernel=None, conv_stride=None, pool=None):
        super(Net, self).__init__()

        # Fall back to the notebook-level hyperparameters so that `Net()` keeps working.
        window = w if window is None else window
        horizon = pred_window if horizon is None else horizon
        filters_1, filters_2 = (filter1_size, filter2_size) if filters is None else filters
        kernel = kernel_size if kernel is None else kernel
        conv_stride = stride if conv_stride is None else conv_stride
        pool = pool_size if pool is None else pool

        # Attribute names and creation order are unchanged, so saved state dicts
        # and the printed summary stay the same.
        self.conv1 = nn.Conv1d(1, filters_1, kernel, conv_stride, padding=0)
        self.conv2 = nn.Conv1d(filters_1, filters_2, kernel, conv_stride, padding=0)
        self.maxpool = nn.MaxPool1d(pool)

        # Flattened feature count, derived from the layer settings instead of a
        # formula that only holds for kernel 2 / stride 1 / pool 2.
        length = window
        for _ in range(2):
            length = (length - kernel) // conv_stride + 1  # unpadded convolution
            length = length // pool                        # pooling with stride == window
        if length < 1:
            raise ValueError(
                'window of length {} is too short for kernel={}, stride={}, pool={}'.format(
                    window, kernel, conv_stride, pool))
        self.dim1 = length * filters_2

        self.lin1 = nn.Linear(self.dim1, horizon)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Return predictions of shape (batch, horizon) for input of shape (batch, 1, window)."""
        x = self.maxpool(F.relu(self.conv1(x)))
        x = self.maxpool(F.relu(self.conv2(x)))

        # Flatten per sample. Pinning the batch axis makes a wrong input length
        # raise instead of silently folding samples into each other.
        x = x.view(x.size(0), self.dim1)

        x = self.dropout(x)
        return self.lin1(x)

In [10]:
# Build the network from the notebook-level hyperparameters and print its layer summary.
model_A1 = Net()
print(model_A1)

Net(
  (conv1): Conv1d(1, 128, kernel_size=(2,), stride=(1,))
  (conv2): Conv1d(128, 32, kernel_size=(2,), stride=(1,))
  (maxpool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lin1): Linear(in_features=320, out_features=1, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)


In [11]:
# Mean-absolute-error objective, optimised with Adam (small step size, light weight decay).
criterion_scratch = nn.L1Loss()
optimizer_scratch = optim.Adam(
    model_A1.parameters(),
    lr=1e-5,
    weight_decay=1e-6,
)

In [12]:
def train_valid(n_epochs, trainX, trainY, validX, validY, model, optimizer, criterion, save_path, freq=20):
    """Train `model` with full-batch updates and checkpoint the best validation epoch.

    Every epoch performs one optimiser step on the whole training set, then
    evaluates the whole validation set. Whenever the validation loss reaches a
    new minimum, the model's ``state_dict`` is written to `save_path`.

    Args:
        n_epochs: Number of epochs to run.
        trainX, trainY: Training inputs and targets (array-likes convertible to float tensors).
        validX, validY: Validation inputs and targets, same layout as the training pair.
        model: The ``nn.Module`` to optimise (updated in place).
        optimizer: Optimiser already bound to ``model``'s parameters.
        criterion: Loss callable taking ``(prediction, target)``.
        save_path: File that receives the best ``state_dict``.
        freq: Print the train/validation losses every `freq` epochs.

    Returns:
        ``(model, output)``. ``model`` carries the weights of the *final* epoch;
        the best-validation weights exist only in `save_path`. ``output`` is the
        training-mode prediction tensor of the last epoch, or ``None`` when
        ``n_epochs < 1``.
    """
    data_train = torch.as_tensor(trainX, dtype=torch.float32)
    target_train = torch.as_tensor(trainY, dtype=torch.float32)
    data_valid = torch.as_tensor(validX, dtype=torch.float32)
    target_valid = torch.as_tensor(validY, dtype=torch.float32)

    # float('inf') rather than np.Inf: that alias no longer exists in NumPy 2.0.
    valid_loss_min = float('inf')
    last_valid_loss = 0
    output = None  # keeps the return statement valid when no epoch runs

    for epoch in range(1, n_epochs + 1):
        # ---- training step (full batch) ----
        model.train()
        optimizer.zero_grad()
        output = model(data_train)
        loss = criterion(output, target_train)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()

        # ---- validation (no autograd graph needed) ----
        model.eval()
        with torch.no_grad():
            output_valid = model(data_valid)
            valid_loss = criterion(output_valid, target_valid).item()

        # An exactly repeated validation loss is reported as a sign of stalled training.
        if valid_loss == last_valid_loss:
            print('problem')
        last_valid_loss = valid_loss

        if epoch % freq == 0:
            print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
                epoch,
                train_loss,
                valid_loss
                ))

        if valid_loss < valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss

    return model, output


In [13]:
#!pip install -U numpy

In [14]:
# 500 full-batch epochs, logging every 10th; the best-validation weights are written to 'model_A1.pt'.
model_A1, out = train_valid(
    n_epochs=500,
    trainX=trainX,
    trainY=trainY,
    validX=validX,
    validY=validY,
    model=model_A1,
    optimizer=optimizer_scratch,
    criterion=criterion_scratch,
    save_path='model_A1.pt',
    freq=10,
)

Validation loss decreased (inf --> 0.975113).  Saving model ...
Validation loss decreased (0.975113 --> 0.973752).  Saving model ...
Validation loss decreased (0.973752 --> 0.972388).  Saving model ...
Validation loss decreased (0.972388 --> 0.971019).  Saving model ...
Validation loss decreased (0.971019 --> 0.969651).  Saving model ...
Validation loss decreased (0.969651 --> 0.968277).  Saving model ...
Validation loss decreased (0.968277 --> 0.966903).  Saving model ...
Validation loss decreased (0.966903 --> 0.965530).  Saving model ...
Validation loss decreased (0.965530 --> 0.964157).  Saving model ...
Epoch: 10 	Training Loss: 1.018247 	Validation Loss: 0.962784
Validation loss decreased (0.964157 --> 0.962784).  Saving model ...
Validation loss decreased (0.962784 --> 0.961411).  Saving model ...
Validation loss decreased (0.961411 --> 0.960035).  Saving model ...
Validation loss decreased (0.960035 --> 0.958654).  Saving model ...
Validation loss decreased (0.958654 --> 0.9572

Validation loss decreased (0.821682 --> 0.820290).  Saving model ...
Validation loss decreased (0.820290 --> 0.818893).  Saving model ...
Validation loss decreased (0.818893 --> 0.817496).  Saving model ...
Validation loss decreased (0.817496 --> 0.816098).  Saving model ...
Validation loss decreased (0.816098 --> 0.814702).  Saving model ...
Validation loss decreased (0.814702 --> 0.813304).  Saving model ...
Validation loss decreased (0.813304 --> 0.811907).  Saving model ...
Epoch: 120 	Training Loss: 0.875461 	Validation Loss: 0.810508
Validation loss decreased (0.811907 --> 0.810508).  Saving model ...
Validation loss decreased (0.810508 --> 0.809108).  Saving model ...
Validation loss decreased (0.809108 --> 0.807707).  Saving model ...
Validation loss decreased (0.807707 --> 0.806302).  Saving model ...
Validation loss decreased (0.806302 --> 0.804895).  Saving model ...
Validation loss decreased (0.804895 --> 0.803488).  Saving model ...
Validation loss decreased (0.803488 --> 

Validation loss decreased (0.658650 --> 0.657055).  Saving model ...
Validation loss decreased (0.657055 --> 0.655456).  Saving model ...
Validation loss decreased (0.655456 --> 0.653854).  Saving model ...
Validation loss decreased (0.653854 --> 0.652249).  Saving model ...
Validation loss decreased (0.652249 --> 0.650640).  Saving model ...
Validation loss decreased (0.650640 --> 0.649027).  Saving model ...
Validation loss decreased (0.649027 --> 0.647410).  Saving model ...
Epoch: 230 	Training Loss: 0.716848 	Validation Loss: 0.645790
Validation loss decreased (0.647410 --> 0.645790).  Saving model ...
Validation loss decreased (0.645790 --> 0.644166).  Saving model ...
Validation loss decreased (0.644166 --> 0.642543).  Saving model ...
Validation loss decreased (0.642543 --> 0.640915).  Saving model ...
Validation loss decreased (0.640915 --> 0.639282).  Saving model ...
Validation loss decreased (0.639282 --> 0.637645).  Saving model ...
Validation loss decreased (0.637645 --> 

Validation loss decreased (0.482394 --> 0.480812).  Saving model ...
Validation loss decreased (0.480812 --> 0.479232).  Saving model ...
Validation loss decreased (0.479232 --> 0.477654).  Saving model ...
Validation loss decreased (0.477654 --> 0.476078).  Saving model ...
Validation loss decreased (0.476078 --> 0.474502).  Saving model ...
Validation loss decreased (0.474502 --> 0.472925).  Saving model ...
Validation loss decreased (0.472925 --> 0.471350).  Saving model ...
Validation loss decreased (0.471350 --> 0.469775).  Saving model ...
Epoch: 340 	Training Loss: 0.537766 	Validation Loss: 0.468199
Validation loss decreased (0.469775 --> 0.468199).  Saving model ...
Validation loss decreased (0.468199 --> 0.466625).  Saving model ...
Validation loss decreased (0.466625 --> 0.465051).  Saving model ...
Validation loss decreased (0.465051 --> 0.463481).  Saving model ...
Validation loss decreased (0.463481 --> 0.461912).  Saving model ...
Validation loss decreased (0.461912 --> 

Validation loss decreased (0.320576 --> 0.319218).  Saving model ...
Validation loss decreased (0.319218 --> 0.317856).  Saving model ...
Validation loss decreased (0.317856 --> 0.316493).  Saving model ...
Validation loss decreased (0.316493 --> 0.315126).  Saving model ...
Validation loss decreased (0.315126 --> 0.313763).  Saving model ...
Validation loss decreased (0.313763 --> 0.312431).  Saving model ...
Validation loss decreased (0.312431 --> 0.311132).  Saving model ...
Validation loss decreased (0.311132 --> 0.309839).  Saving model ...
Validation loss decreased (0.309839 --> 0.308547).  Saving model ...
Epoch: 450 	Training Loss: 0.346954 	Validation Loss: 0.307257
Validation loss decreased (0.308547 --> 0.307257).  Saving model ...
Validation loss decreased (0.307257 --> 0.305963).  Saving model ...
Validation loss decreased (0.305963 --> 0.304668).  Saving model ...
Validation loss decreased (0.304668 --> 0.303373).  Saving model ...
Validation loss decreased (0.303373 --> 

In [15]:
# Restore the checkpoint written during training (the lowest validation loss seen).
best_state = torch.load('model_A1.pt')
model_A1.load_state_dict(best_state)

<All keys matched successfully>

In [16]:
# Score the test windows with dropout disabled and gradient tracking switched off.
model_A1.eval()
with torch.no_grad():
    test_tensor = torch.tensor(testX, dtype=torch.float32)
    out = model_A1(test_tensor).numpy()


In [17]:
# One row per test window: the first predicted value next to the value that actually followed.
df_out = pd.DataFrame({'pred': out[:, 0], 'actual': testY[:, 0]})

df_out.tail()

Unnamed: 0,pred,actual
826,0.76194,2.354444
827,0.775693,2.680556
828,0.795029,3.063889
829,0.816994,2.462778
830,0.83121,1.616667


In [18]:
# Absolute prediction error and its z-score over the whole test segment.
df_out['error'] = np.abs(df_out['pred'] - df_out['actual'])
df_out['error_n'] = (df_out['error'] - df_out['error'].mean())/df_out['error'].std()

# Label every row with the timestamp of the value it predicts. Row i compares
# against the sample `w` steps after the start of test window i, so the labels
# start at train_percent + valid_percent + w, one per row. Anchoring on
# len(df_out) keeps the labels right for any pred_window; the previous
# "+pred_window-1 ... :-1" slice was only aligned when pred_window == 1.
first_target = train_percent + valid_percent + w
df_out.index = ts_data.index[first_target:first_target + len(df_out)]

In [19]:
# Keep the rows whose standardised error lies more than three standard deviations from the mean.
is_outlier = df_out['error_n'].abs() > 3
thresh = df_out[is_outlier]
thresh

Unnamed: 0_level_0,pred,actual,error,error_n
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
844,0.713803,3.982222,3.268419,6.339405
1208,0.897285,5.646111,4.748826,9.482074
1209,0.891247,10.452778,9.561531,19.698676
1210,0.9268,4.237778,3.310978,6.429751
1211,1.026066,3.192222,2.166157,3.999479
1457,0.775693,2.680556,1.904862,3.444793
1458,0.795029,3.063889,2.26886,4.217502


In [20]:
# Ground-truth labels restricted to the timestamps that received a prediction.
evaluated = ts_data.loc[df_out.index]
positives = evaluated[evaluated['is_anomaly'] == 1].index
negatives = evaluated[evaluated['is_anomaly'] == 0].index

# A timestamp counts as "detected" when it appears among the thresholded rows.
flagged = thresh.index
tp = [stamp for stamp in positives if stamp in flagged]
fn = [stamp for stamp in positives if stamp not in flagged]
fp = [stamp for stamp in negatives if stamp in flagged]
tn = [stamp for stamp in negatives if stamp not in flagged]

In [21]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Point-wise detection quality. Each ratio falls back to 0.0 instead of raising
# ZeroDivisionError when nothing was flagged or no anomalies were evaluated.
recall = _safe_ratio(len(tp), len(tp) + len(fn))
precision = _safe_ratio(len(tp), len(tp) + len(fp))
F_score = _safe_ratio(2 * recall * precision, recall + precision)
F_score

0.7777777777777778