## Applying embedding layer

Let's apply an embedding layer before the linear layers.


In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.utils as torch_utils
import pandas as pd
from torch.utils.data import DataLoader,Dataset,WeightedRandomSampler
from torchvision import transforms
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Set the CUDA device ordering to PCI_BUS_ID and make only device "0" visible to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})
# !nvidia-smi

## Construct custom dataloader

In [3]:
class Customdataset(Dataset):
    '''
    Map-style dataset that pairs pre-built feature rows with their labels.

    Params
    x_data: Indexable collection of per-case feature rows
    y_data: Labels aligned with x_data (stored internally as a NumPy array)
    transform: Optional callable applied to a feature row when it is fetched
    '''

    def __init__(self,x_data,y_data,transform=None):
        # A silent length mismatch would pair features with the wrong labels
        # (or fail much later inside the DataLoader), so reject it up front.
        if len(x_data) != len(y_data):
            raise ValueError(
                "x_data and y_data must have the same length, got {} and {}".format(
                    len(x_data), len(y_data)))

        self.x_data = x_data
        self.y_data = np.array(y_data)
        self.transform = transform

    def __len__(self):
        '''Number of samples (one per feature row).'''
        return len(self.x_data)

    def __getitem__(self,idx):
        '''Return the (features, label) pair stored at position idx.'''
        x = self.x_data[idx]
        y = self.y_data[idx]
        # Previously the transform argument was accepted but never used.
        if self.transform is not None:
            x = self.transform(x)
        return x,y

To equalize the size of each input, delete the columns that don't exist across all cases.

In [4]:
# Which pre-encoded event log to load: encoding scheme and prefix length
encoding_type = 'indexbase'
prefix = 5

# Read the encoded log and discard the case identifier column
input_data = pd.read_csv(
    '../data/bpic2011/' + encoding_type + '_prefix' + str(prefix) + '.csv'
).drop(['Case ID'], axis=1)

# Split off the label column as a list of ints; everything else is the feature table
y_data = list(map(int, input_data['Label']))
input_data = input_data.drop(['Label'], axis=1)

x_data = input_data


So,  
Event log: BPIC 2011  
Prefix length: 5  
Number of event categorical attributes: 8  
Number of event categorical attribute columns in the preprocessed data: 839  
Number of event continuous attributes: 2  
Number of items in a single event: 10 (8 categorical + 2 continuous); once each categorical attribute is embedded with embedding_dim = 6 this becomes 50 (∵ 8\*6+2)  

In [5]:
event_cat =['Activity','Section','Specialism code','Producer code','org:group','Timemonth','Timeweekday','Timehour']
event_con =['Duration','Cumduration']


def _event_position(col):
    """Sort key built from the token after the first '_' in a column name.

    All-digit tokens are compared numerically (so "10" sorts after "2");
    any other token sorts after the numeric ones, in string order.
    """
    token = col.split('_')[1]
    return (0, int(token), '') if token.isdigit() else (1, 0, token)


# A column belongs to an attribute group when one of the group's names occurs in
# the column name. any() keeps a column at most once even if several names match,
# so no column ends up twice in the vocabulary / feature rows.
event_cat_col = [col for col in x_data.columns.values
                 if any(e_col in col for e_col in event_cat)]
event_con_col = [col for col in x_data.columns.values
                 if any(e_col in col for e_col in event_con)]

only_event_cat = x_data.loc[:,event_cat_col]

# Vocabulary: one index per categorical column, with 0 and 1 reserved for markers
vocab = {word: i+2 for i, word in enumerate(event_cat_col)}
vocab['<boc>'] =0
vocab['<eoc>'] =1

embedding_layer = nn.Embedding(num_embeddings = len(vocab), 
                               embedding_dim = 6,
                               padding_idx = 1)

# Group the columns by the position token in their name; sorted() is stable, so the
# original column order is kept within each position.
event_cat_col = sorted(event_cat_col,key=_event_position)
event_con_col = sorted(event_con_col,key=_event_position)

# Pull the values out once as arrays instead of one .loc lookup per (row, column)
cat_is_active = x_data.loc[:,event_cat_col].to_numpy() == 1
con_values = x_data.loc[:,event_con_col].to_numpy()
cat_vocab_ids = [vocab[col] for col in event_cat_col]

# Each row becomes: vocabulary ids of the active categorical columns (in sorted
# column order) followed by the continuous values, all stored as floats.
nx_data = []
for active_row, con_row in zip(cat_is_active, con_values):
    row_train = [vocab_id for vocab_id, is_active in zip(cat_vocab_ids, active_row) if is_active]
    row_train.extend(con_row)
    row_train = torch.tensor(row_train, dtype=torch.float)
    nx_data.append(row_train)
    
x_train, x_test, y_train, y_test = train_test_split(nx_data, y_data, test_size=0.33, random_state=69)

In [6]:
# Wrap the train / test splits so that a DataLoader can index them
trainset = Customdataset(x_data=x_train, y_data=y_train)
testset = Customdataset(x_data=x_test, y_data=y_test)

In [7]:
batch_size = 10
# Shuffled mini-batches for training; the test loader yields one case per batch
train_loader = DataLoader(dataset=trainset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=testset, batch_size=1)

In [8]:
class MLP_prediction(nn.Module):
    """Embedding layer followed by a two-layer MLP that emits one logit per case.

    Each input row is expected to hold ``prefix_len * num_cat_attrs`` vocabulary
    indices (stored as floats) followed by ``prefix_len * num_con_attrs``
    continuous values. The indices are embedded, and for every prefix position
    the embedded categorical attributes are concatenated with that position's
    continuous values before the result is flattened and fed to the MLP.

    Params
    vocab_size: Number of rows in the embedding table
    embedding_dim: Width of each embedding vector
    num_cat_attrs: Categorical attributes per event (default 8)
    num_con_attrs: Continuous attributes per event (default 2)
    prefix_len: Number of events per case (default 5)
    """

    def __init__(self,vocab_size, embedding_dim, num_cat_attrs=8, num_con_attrs=2, prefix_len=5):
        super(MLP_prediction,self).__init__()

        self.num_cat_attrs = num_cat_attrs
        self.num_con_attrs = num_con_attrs
        self.prefix_len = prefix_len

        # Width of one event after embedding, and of the whole flattened prefix.
        # With the defaults and embedding_dim=6 this is 50 and 5*50, as before.
        event_dim = num_cat_attrs*embedding_dim + num_con_attrs
        hidden_dim = prefix_len*event_dim

        # Use the vocab_size argument rather than a global vocabulary object
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, 1)

        # MLP part
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.LeakyReLU()
        self.sig = nn.Sigmoid()  # not used in forward(); the model returns raw logits

    def forward(self, inputs):
        """Return one logit per row of ``inputs``.

        Params
        inputs: 2-D float tensor, one row per case, laid out as described on the class

        Raises
        ValueError: if ``inputs`` is not 2-D or its row width does not match the layout
        """
        n_cat = self.prefix_len*self.num_cat_attrs
        n_con = self.prefix_len*self.num_con_attrs
        if inputs.dim() != 2 or inputs.shape[1] != n_cat + n_con:
            raise ValueError(
                "expected inputs of shape (batch, {}), got {}".format(
                    n_cat + n_con, tuple(inputs.shape)))

        batch = inputs.shape[0]
        cat = inputs[:, :n_cat].long()
        con = inputs[:, n_cat:].reshape(batch, self.prefix_len, self.num_con_attrs)

        # (batch, prefix_len * num_cat_attrs, emb) -> (batch, prefix_len, num_cat_attrs * emb)
        embedded = self.embeddings(cat).reshape(
            batch, self.prefix_len, self.num_cat_attrs*self.embeddings.embedding_dim)

        # Per position: embedded categorical attributes followed by the continuous ones
        hidden = torch.cat((embedded, con), dim=2).reshape(batch, -1)

        hidden = self.linear1(hidden)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        hidden = self.linear2(hidden)
        outputs = hidden.squeeze(1)
        return outputs


# Model: embedding vectors of width 6, network moved to the GPU
embedding_dim = 6
model = MLP_prediction(vocab_size=len(vocab), embedding_dim=embedding_dim).cuda()

# Loss function & optimizer (both may be swapped for alternatives)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-4)  # weight_decay=1e-4 left disabled

# Hyperparameters (tunable)
num_epochs = 100
# NOTE: the DataLoaders are built in an earlier cell, so re-assigning
# batch_size here does not change their batch size.
batch_size = 10

In [9]:
def binary_acc(train_predict, train_y):
    """Accuracy of a batch in percent, rounded to a whole number.

    Params
    train_predict: Raw logits; passed through a sigmoid and rounded to 0/1
    train_y: Target labels, compared element-wise with the rounded predictions

    Returns a scalar tensor.
    """
    predicted_labels = torch.sigmoid(train_predict).round()
    n_correct = predicted_labels.eq(train_y).sum().float()
    return (n_correct / train_y.shape[0] * 100).round()

In [10]:
# Train MLP_prediction; every 5th epoch evaluate on the test set and keep the
# checkpoint with the lowest mean test loss.

accuracy_graph = {'train':[], 'test':[], 'epoch': []}
loss_graph = {'train':[], 'test':[], 'epoch': []}
best_loss = float('inf')
for epoch in range(num_epochs):
    # Dropout must be active while training; evaluation below switches it off
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    # Training
    for train_x, train_y in train_loader:
        train_x, train_y = train_x.cuda(), train_y.cuda().float()
        train_predict = model(train_x).float()
        loss = criterion(train_predict, train_y)

        # Accuracy is only a metric, so keep it out of the autograd graph
        acc = binary_acc(train_predict.detach(), train_y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Evaluation
    if epoch % 5 ==0:
        # eval() disables dropout so the test metrics are deterministic
        model.eval()
        test_acc = 0
        test_loss =0

        with torch.no_grad():
            for test_x, test_y in test_loader:
                test_x, test_y = test_x.cuda(), test_y.cuda().float()
                test_predict = model(test_x).float()

                loss = criterion(test_predict, test_y)
                acc = binary_acc(test_predict, test_y)

                test_loss += loss.item()
                test_acc += acc.item()

        mean_train_acc = epoch_acc/len(train_loader)
        mean_train_loss = epoch_loss/len(train_loader)
        mean_test_acc = test_acc/len(test_loader)
        mean_test_loss = test_loss/len(test_loader)

        print("\ntrain accuracy: {:.2f} %, test accuracy: {:.2f} %".format(mean_train_acc, mean_test_acc))
        print("epoch:{}, train_loss: {:.4f}, test_loss: {:.4f}".format(epoch+1, mean_train_loss, mean_test_loss))

        # Append to the history lists (previously these entries were overwritten
        # with scalars and the test accuracy was never stored)
        accuracy_graph['epoch'].append(epoch+1)
        accuracy_graph['train'].append(mean_train_acc)
        accuracy_graph['test'].append(mean_test_acc)

        loss_graph['epoch'].append(epoch+1)
        loss_graph['train'].append(mean_train_loss)
        loss_graph['test'].append(mean_test_loss)

        if mean_test_loss < best_loss:
            best_loss = mean_test_loss
            print('Saving model')
            torch.save(model.state_dict(), "./embed_predic.pt")


train accuracy: 65.62 %, test accuracy: 69.77 %
epoch:1, train_loss: 0.6234, test_loss: 0.5929
Saving model

train accuracy: 75.78 %, test accuracy: 71.06 %
epoch:6, train_loss: 0.5111, test_loss: 0.5821
Saving model

train accuracy: 81.41 %, test accuracy: 70.74 %
epoch:11, train_loss: 0.4402, test_loss: 0.5908

train accuracy: 85.94 %, test accuracy: 68.49 %
epoch:16, train_loss: 0.3645, test_loss: 0.6002

train accuracy: 90.00 %, test accuracy: 66.56 %
epoch:21, train_loss: 0.2892, test_loss: 0.6317

train accuracy: 92.50 %, test accuracy: 68.81 %
epoch:26, train_loss: 0.2383, test_loss: 0.6695

train accuracy: 96.56 %, test accuracy: 67.20 %
epoch:31, train_loss: 0.1811, test_loss: 0.7052

train accuracy: 97.19 %, test accuracy: 66.56 %
epoch:36, train_loss: 0.1500, test_loss: 0.7434

train accuracy: 98.28 %, test accuracy: 68.17 %
epoch:41, train_loss: 0.1199, test_loss: 0.7929

train accuracy: 99.22 %, test accuracy: 68.17 %
epoch:46, train_loss: 0.0901, test_loss: 0.8289

train

In [11]:
# Reload the best checkpoint saved during training and report per-class metrics
# on the test split.
y_predict_list=[]
device = torch.device("cuda")
model = MLP_prediction(len(vocab),embedding_dim = embedding_dim).to(device)
model.load_state_dict(torch.load("./embed_predic.pt", map_location=device))
model.eval()  # disable dropout for inference

testset = Customdataset(x_test,y_test)
test_loader = DataLoader(testset,batch_size=1)

with torch.no_grad():
    for test_x, _ in test_loader:
        test_predict = model(test_x.to(device)).float()

        # Logit -> probability -> hard 0/1 label
        test_predict_tag = torch.round(torch.sigmoid(test_predict))
        # extend() with plain ints keeps one flat label per sample for any batch
        # size (appending per-batch arrays only lined up when batch_size was 1)
        y_predict_list.extend(test_predict_tag.int().cpu().tolist())

print(classification_report(y_test,y_predict_list))

              precision    recall  f1-score   support

           0       0.74      0.64      0.69       154
           1       0.69      0.78      0.73       157

    accuracy                           0.71       311
   macro avg       0.72      0.71      0.71       311
weighted avg       0.72      0.71      0.71       311

