# Manual single tests

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle
import pandas as pd
import numpy as np

import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset

from src.trainer import CaseDataSet
from src.model import DLModels
from src.trainer import Trainer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import importlib.util
# Select the compute backend. Later checks override earlier ones, so the
# effective priority is CUDA > MPS > CPU.
torch_device, device_package = "cpu", torch.cpu

mps_module_present = importlib.util.find_spec("torch.backends.mps") is not None
if mps_module_present and torch.backends.mps.is_available():
    torch_device, device_package = torch.device("mps"), torch.mps

if torch.cuda.is_available():
    torch_device, device_package = torch.device("cuda"), torch.cuda

# Display the chosen device as the cell output.
torch_device

device(type='cuda')

In [6]:
def eval_model(model, test_set, torch_device, device_package, decision_boundary=0.5,
               weighted=True, print_res=False):
    """Score ``model`` on ``test_set`` and return binary-classification metrics.

    Raw scores and reference labels are obtained from ``evaluate_model``. Scores at or
    above ``decision_boundary`` are labelled 1, all others 0. F1, precision and recall
    are each reported twice: once for class 1 and once with both label vectors flipped
    (``1 - y``), i.e. for class 0 (the "inverse" metrics).

    Args:
        model: Model to evaluate; must expose a ``flatten()`` method, which is called first.
        test_set: Dataset forwarded unchanged to ``evaluate_model``.
        torch_device: Device forwarded unchanged to ``evaluate_model``.
        device_package: Backend module forwarded unchanged to ``evaluate_model``.
        decision_boundary: Threshold applied to the raw scores (``score >= boundary`` -> 1).
        weighted: Currently has no effect. NOTE(review): the ``if weighted:`` branch was
            left empty in the notebook (a SyntaxError); its intended behaviour is not
            recoverable from this file, so both settings return the same metrics.
        print_res: When true, print every metric with its label before returning.

    Returns:
        Tuple ``(roc_auc, f1, f1_inverse, precision, precision_inverse, recall,
        recall_inverse)``.
    """
    model.flatten()
    # NOTE(review): `evaluate_model` is not imported in the visible import cell;
    # presumably it lives in `Trainer` (src.trainer) -- confirm and qualify the call.
    # The third return value was never used by this function.
    res, ref, _ = evaluate_model(model, test_set, torch_device, device_package)

    # detach()/cpu() make the NumPy conversion safe for CUDA or grad-tracking tensors;
    # atleast_1d keeps a single-sample result from collapsing to a 0-d array.
    res_prob = np.atleast_1d(np.squeeze(torch.concat(res).detach().cpu().numpy()))
    ref_class = np.atleast_1d(np.squeeze(torch.concat(ref).detach().cpu().numpy())).astype(int)

    # One vectorised comparison. The previous two-step in-place assignment first wrote 0
    # for scores below the boundary and then re-labelled those zeros as 1 whenever
    # decision_boundary <= 0, so every sample ended up positive.
    res_class = (res_prob >= decision_boundary).astype(int)
    ref_inverse, res_inverse = 1 - ref_class, 1 - res_class

    metrics = (
        ("roc_auc: ", roc_auc_score(ref_class, res_prob)),
        ("f1: ", f1_score(ref_class, res_class)),
        ("f1 inverse: ", f1_score(ref_inverse, res_inverse)),
        ("Precision: ", precision_score(ref_class, res_class)),
        ("Precision inverse: ", precision_score(ref_inverse, res_inverse)),
        ("Recall: ", recall_score(ref_class, res_class)),
        ("Recall inverse: ", recall_score(ref_inverse, res_inverse)),
    )

    if print_res:
        for label, value in metrics:
            print(label, value)

    # TODO: implement the `weighted` variant; until then both modes return this tuple,
    # which matches the outputs recorded in the notebook.
    return tuple(value for _, value in metrics)

## Random split w2v with time feature

In [2]:
# Random-split ("_random_all") w2v datasets: source train/val/test plus the target test split.
# The datasets are built in the same order as the (domain, version) pairs below.
source_train, source_val, source_test, target_test = (
    CaseDataSet.CaseDataset(split_pattern="641620split", input_data=domain, data_version=version,
                            embedding_version="_w2v", earliness_requirement=True)
    for domain, version in (
        ("source", "_train_random_all"),
        ("source", "_val_random_all"),
        ("source", "_test_random_all"),
        ("target", "_test_random_all"),
    )
)

In [3]:
# Hyperparameters
input_size, hidden_size = 51, 128  # expected features in the input x / features in the hidden state h
num_layers, num_classes = 1, 1  # recurrent layers / one output for binary classification
learning_rate, batch_size = 0.001, 1000

# Instantiate the model, move it to the selected device, then create loss and optimizer.
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)
criterion = nn.BCEWithLogitsLoss()  # NOTE(review): never passed to train_model below
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train_model receives the source train and validation splits; the two positional
# slots after the optimizer are left as None.
model, train_loss, val_loss = Trainer.train_model(
    model, optimizer, None, None,
    source_train, source_val, batch_size,
    torch_device, device_package,
    eval_func=Trainer.prefix_weighted_loss,
    max_epoch=100, max_ob_iter=20, score_margin=1,
    print_iter=True,
)

Finished training iteration:  0  with val loss:  996.6249177235654  train loss:  2660.7734584003942
Finished training iteration:  1  with val loss:  1196.3048524362528  train loss:  2589.2944925410757
Finished training iteration:  2  with val loss:  855.5405496474328  train loss:  2562.5796381589416
Finished training iteration:  3  with val loss:  781.9919834990015  train loss:  2532.4193045895504
Finished training iteration:  4  with val loss:  823.1721990579153  train loss:  2592.4140822618606
Finished training iteration:  5  with val loss:  711.3471627569812  train loss:  2525.1250427588975
Finished training iteration:  6  with val loss:  784.2330404241188  train loss:  2521.490577040124
Finished training iteration:  7  with val loss:  700.9396956267996  train loss:  2518.2232184554523
Finished training iteration:  8  with val loss:  695.8953017310303  train loss:  2496.858220715114
Finished training iteration:  9  with val loss:  798.6070163065406  train loss:  2499.077705444793
Fi

In [7]:
# Metrics of the current `model` on the source test split at a 0.5 decision boundary.
eval_model(model=model, test_set=source_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.788782648557422
f1:  0.8300555396120669
f1 inverse:  0.5439637599093998
Precision:  0.8911160245025191
Precision inverse:  0.4594780745389148
Recall:  0.7768263397371082
Recall inverse:  0.66651865008881


(0.788782648557422,
 0.8300555396120669,
 0.5439637599093998,
 0.8911160245025191,
 0.4594780745389148,
 0.7768263397371082,
 0.66651865008881)

In [8]:
# Metrics of the current `model` on the target test split at a 0.5 decision boundary.
eval_model(model=model, test_set=target_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.4230700553058763
f1:  0.6871052202220479
f1 inverse:  0.26430811386816866
Precision:  0.6811446117192795
Precision inverse:  0.2698606120139761
Recall:  0.6931710707138267
Recall inverse:  0.25897949929595954


(0.4230700553058763,
 0.6871052202220479,
 0.26430811386816866,
 0.6811446117192795,
 0.2698606120139761,
 0.6931710707138267,
 0.25897949929595954)

## Temporal split w2v with time feature

In [9]:
# w2v datasets for the "_train" / "_val" / "_test" data versions: source train/val/test
# plus the target test split, built in the order of the (domain, version) pairs below.
source_train, source_val, source_test, target_test = (
    CaseDataSet.CaseDataset(split_pattern="641620split", input_data=domain, data_version=version,
                            embedding_version="_w2v", earliness_requirement=True)
    for domain, version in (
        ("source", "_train"),
        ("source", "_val"),
        ("source", "_test"),
        ("target", "_test"),
    )
)

In [10]:
# Hyperparameters
input_size, hidden_size = 51, 128  # expected features in the input x / features in the hidden state h
num_layers, num_classes = 1, 1  # recurrent layers / one output for binary classification
learning_rate, batch_size = 0.001, 1000

# Instantiate the model, move it to the selected device, then create loss and optimizer.
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)
criterion = nn.BCEWithLogitsLoss()  # NOTE(review): never passed to train_model below
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train_model receives the source train and validation splits; the two positional
# slots after the optimizer are left as None.
model, train_loss, val_loss = Trainer.train_model(
    model, optimizer, None, None,
    source_train, source_val, batch_size,
    torch_device, device_package,
    eval_func=Trainer.prefix_weighted_loss,
    max_epoch=100, max_ob_iter=20, score_margin=1,
    print_iter=True,
)

Finished training iteration:  0  with val loss:  770.7401489121513  train loss:  2111.58595620398
Finished training iteration:  1  with val loss:  639.5801284197825  train loss:  1977.5015798619638
Finished training iteration:  2  with val loss:  704.881448928058  train loss:  1964.4997446317514
Finished training iteration:  3  with val loss:  668.2876379987465  train loss:  1975.0989536021889
Finished training iteration:  4  with val loss:  614.1776659076711  train loss:  1978.8005905498533
Finished training iteration:  5  with val loss:  795.8914965395888  train loss:  1952.002751499723
Finished training iteration:  6  with val loss:  980.0886866191778  train loss:  1976.1910546374884
Finished training iteration:  7  with val loss:  608.0032991130973  train loss:  2033.2781942557572
Finished training iteration:  8  with val loss:  906.8084920839253  train loss:  1987.85975196129
Finished training iteration:  9  with val loss:  669.5728545660515  train loss:  1981.9269394000075
Finish

In [11]:
# Metrics of the current `model` on the source test split at a 0.5 decision boundary.
eval_model(model=model, test_set=source_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.7438235988684694
f1:  0.8171005259090309
f1 inverse:  0.5225136558240243
Precision:  0.8626124333359866
Precision inverse:  0.4592560553633218
Recall:  0.7761504028648165
Recall inverse:  0.6059810523912795


(0.7438235988684694,
 0.8171005259090309,
 0.5225136558240243,
 0.8626124333359866,
 0.4592560553633218,
 0.7761504028648165,
 0.6059810523912795)

In [12]:
# Metrics of the current `model` on the target test split at a 0.5 decision boundary.
eval_model(model=model, test_set=target_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.644895825098191
f1:  0.641076460933387
f1 inverse:  0.45782850727417324
Precision:  0.8065927001042286
Precision inverse:  0.34949514788462055
Recall:  0.531923504358607
Recall inverse:  0.6634916868633798


(0.644895825098191,
 0.641076460933387,
 0.45782850727417324,
 0.8065927001042286,
 0.34949514788462055,
 0.531923504358607,
 0.6634916868633798)

In [18]:
# Target-domain train/val/test datasets with the "_st" embedding version,
# built in the order of the data versions listed below.
target_train, target_val, target_test = (
    CaseDataSet.CaseDataset(split_pattern="641620split", input_data="target", data_version=version,
                            embedding_version="_st", earliness_requirement=True)
    for version in ("_train", "_val", "_test")
)

In [19]:
# Hyperparameters
input_size, hidden_size = 385, 512  # expected features in the input x / features in the hidden state h
num_layers, num_classes = 1, 1  # recurrent layers / one output for binary classification
learning_rate, batch_size = 0.001, 1000

# Instantiate the model, move it to the selected device, then create loss and optimizer.
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)
criterion = nn.BCEWithLogitsLoss()  # NOTE(review): never passed to train_model below
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train_model receives the target train and validation splits; the two positional
# slots after the optimizer are left as None.
model, train_loss, val_loss = Trainer.train_model(
    model, optimizer, None, None,
    target_train, target_val, batch_size,
    torch_device, device_package,
    eval_func=Trainer.prefix_weighted_loss,
    max_epoch=100, max_ob_iter=20, score_margin=1,
    print_iter=True,
)

Finished training iteration:  0  with val loss:  1988.9160514570456  train loss:  4317.641623615202
Finished training iteration:  1  with val loss:  1682.0884687426717  train loss:  4183.40916337637
Finished training iteration:  2  with val loss:  1820.4668025274987  train loss:  4113.818625999076
Finished training iteration:  3  with val loss:  1666.8854319424315  train loss:  4013.5959644560708
Finished training iteration:  4  with val loss:  1683.3796946949776  train loss:  3947.536043122505
Finished training iteration:  5  with val loss:  1641.8601277235052  train loss:  3958.130681605511
Finished training iteration:  6  with val loss:  1682.2979228832246  train loss:  3898.806011466092
Finished training iteration:  7  with val loss:  1495.3503464992973  train loss:  3921.5271092744792
Finished training iteration:  8  with val loss:  1443.6140393632006  train loss:  3919.21339096277
Finished training iteration:  9  with val loss:  1450.4088309332203  train loss:  3852.6280342132545

In [10]:
# Reload the "_st" checkpoint and score it on the target test split.
# The file name is built from the `hidden_size` / `num_layers` currently in scope, so
# they must match the values that were in effect when the checkpoint was written.
# SECURITY: the file holds a fully pickled model object (it is written with
# torch.save(model, ...) elsewhere in this notebook); unpickling executes arbitrary
# code, so only load checkpoints you created yourself.
model_tt = torch.load(
    "../../Model/LSTM/LSTM_TT_h" + str(hidden_size) + "_l" + str(num_layers) + "_st.LSTM",
    map_location=torch_device,  # remap storages so a CUDA-saved model loads on CPU/MPS hosts too
    weights_only=False,  # required for a pickled nn.Module; newer PyTorch defaults to True
)
eval_model(model_tt, target_test, torch_device, device_package, decision_boundary=0.5)

roc_auc:  0.6366768829624394
f1:  0.7722057032078907
f1 inverse:  0.4388894266447262
Precision:  0.7875927670120171
Precision inverse:  0.4187381674336991
Recall:  0.7574083471982219
Recall inverse:  0.46107825025846144


(0.6366768829624394,
 0.7722057032078907,
 0.4388894266447262,
 0.7875927670120171,
 0.4187381674336991,
 0.7574083471982219,
 0.46107825025846144)

In [17]:
# Persist the whole `model` object (not just a state_dict) under a name derived from
# the current `hidden_size` / `num_layers`. Create the output directory first so the
# save does not fail on a fresh checkout.
os.makedirs("../../Model/LSTM", exist_ok=True)
torch.save(model, "../../Model/LSTM/LSTM_TT_h" + str(hidden_size) + "_l" + str(num_layers) + "_st.LSTM")

In [3]:
# "_st"-embedding datasets: source train/val/test plus the target test split,
# built in the order of the (domain, version) pairs below.
source_train, source_val, source_test, target_test = (
    CaseDataSet.CaseDataset(split_pattern="641620split", input_data=domain, data_version=version,
                            embedding_version="_st", earliness_requirement=True)
    for domain, version in (
        ("source", "_train"),
        ("source", "_val"),
        ("source", "_test"),
        ("target", "_test"),
    )
)

In [4]:
# Hyperparameters
input_size, hidden_size = 385, 512  # expected features in the input x / features in the hidden state h
num_layers, num_classes = 1, 1  # recurrent layers / one output for binary classification
learning_rate, batch_size = 0.001, 1000

# Instantiate the model, move it to the selected device, then create loss and optimizer.
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)
criterion = nn.BCEWithLogitsLoss()  # NOTE(review): never passed to train_model below
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train_model receives the source train and validation splits; the two positional
# slots after the optimizer are left as None.
model, train_loss, val_loss = Trainer.train_model(
    model, optimizer, None, None,
    source_train, source_val, batch_size,
    torch_device, device_package,
    eval_func=Trainer.prefix_weighted_loss,
    max_epoch=100, max_ob_iter=20, score_margin=1,
    print_iter=True,
)

Finished training iteration:  0  with val loss:  1039.2492868237623  train loss:  1973.6620985799589
Finished training iteration:  1  with val loss:  730.9840549168484  train loss:  1968.1232016268439
Finished training iteration:  2  with val loss:  1053.200834804691  train loss:  1922.908640485293
Finished training iteration:  3  with val loss:  817.3597401400724  train loss:  1991.4972430789105
Finished training iteration:  4  with val loss:  772.0366282027886  train loss:  1982.9249040802117
Finished training iteration:  5  with val loss:  752.7597583220604  train loss:  1953.9207156335165
Finished training iteration:  6  with val loss:  644.1411491213101  train loss:  1942.0572116555575
Finished training iteration:  7  with val loss:  623.1171479989646  train loss:  1919.0061196916886
Finished training iteration:  8  with val loss:  736.5424007116064  train loss:  1903.243047325431
Finished training iteration:  9  with val loss:  918.3865771184614  train loss:  1915.2791136717524
F

In [7]:
# Metrics of the current `model` on the target test split at a 0.5 decision boundary.
eval_model(model=model, test_set=target_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.5778218223090072
f1:  0.6724835690242856
f1 inverse:  0.37357791692389103
Precision:  0.7507761269680362
Precision inverse:  0.31145649518121354
Recall:  0.6089780238840103
Recall inverse:  0.46665423792010574


(0.5778218223090072,
 0.6724835690242856,
 0.37357791692389103,
 0.7507761269680362,
 0.31145649518121354,
 0.6089780238840103,
 0.46665423792010574)

In [8]:
# Metrics of the current `model` on the source test split at a 0.5 decision boundary.
eval_model(model=model, test_set=source_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.7069402210516883
f1:  0.8501752076700048
f1 inverse:  0.5089488719174489
Precision:  0.8446313706086096
Precision inverse:  0.5201382268827455
Recall:  0.8557923008057297
Recall inverse:  0.4982307955712818


(0.7069402210516883,
 0.8501752076700048,
 0.5089488719174489,
 0.8446313706086096,
 0.5201382268827455,
 0.8557923008057297,
 0.4982307955712818)

## One-hot encoding

In [2]:
# Target-domain train/val/test datasets with the "_onehot" embedding version,
# built in the order of the data versions listed below.
target_train, target_val, target_test = (
    CaseDataSet.CaseDataset(split_pattern="641620split", input_data="target", data_version=version,
                            embedding_version="_onehot", earliness_requirement=True)
    for version in ("_train", "_val", "_test")
)

In [4]:
# Hyperparameters
input_size, hidden_size = 48, 128  # expected features in the input x / features in the hidden state h
num_layers, num_classes = 1, 1  # recurrent layers / one output for binary classification
learning_rate, batch_size = 0.001, 1000

# Instantiate the model, move it to the selected device, then create loss and optimizer.
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)
criterion = nn.BCEWithLogitsLoss()  # NOTE(review): never passed to train_model below
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train_model receives the target train and validation splits; the two positional
# slots after the optimizer are left as None.
model, train_loss, val_loss = Trainer.train_model(
    model, optimizer, None, None,
    target_train, target_val, batch_size,
    torch_device, device_package,
    eval_func=Trainer.prefix_weighted_loss,
    max_epoch=100, max_ob_iter=20, score_margin=1,
    print_iter=True,
)

Finished training iteration:  0  with val loss:  2170.1126981360208  train loss:  4352.60548912944
Finished training iteration:  1  with val loss:  1810.327537026345  train loss:  4428.924114136297
Finished training iteration:  2  with val loss:  1778.3200395160427  train loss:  4340.564020028704
Finished training iteration:  3  with val loss:  1893.6346498465555  train loss:  4355.280440942198
Finished training iteration:  4  with val loss:  1539.5768292573628  train loss:  4387.240193306612
Finished training iteration:  5  with val loss:  2661.6335987820103  train loss:  4314.496769521407
Finished training iteration:  6  with val loss:  1591.3304440736624  train loss:  4425.871598136476
Finished training iteration:  7  with val loss:  1752.8828915616607  train loss:  4347.596576816226
Finished training iteration:  8  with val loss:  1605.7442369268829  train loss:  4283.1803461094805
Finished training iteration:  9  with val loss:  2547.0448565741112  train loss:  4293.4640021719515


In [7]:
# Metrics of the current `model` on the target test split at a 0.5 decision boundary.
eval_model(model=model, test_set=target_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.6500524453131455
f1:  0.7800741615553607
f1 inverse:  0.3942250515932702
Precision:  0.7711384078484048
Precision inverse:  0.40722273408848664
Recall:  0.7892194335416813
Recall inverse:  0.3820314221310781


(0.6500524453131455,
 0.7800741615553607,
 0.3942250515932702,
 0.7711384078484048,
 0.40722273408848664,
 0.7892194335416813,
 0.3820314221310781)

In [8]:
# Persist the whole `model` object (not just a state_dict) under a name derived from
# the current `hidden_size` / `num_layers`. Create the output directory first so the
# save does not fail on a fresh checkout.
os.makedirs("../../Model/LSTM", exist_ok=True)
torch.save(model, "../../Model/LSTM/LSTM_TT_h" + str(hidden_size) + "_l" + str(num_layers) + "_onehot.LSTM")

In [9]:
# "_onehot"-embedding datasets: source train/val/test plus the target test split,
# built in the order of the (domain, version) pairs below.
source_train, source_val, source_test, target_test = (
    CaseDataSet.CaseDataset(split_pattern="641620split", input_data=domain, data_version=version,
                            embedding_version="_onehot", earliness_requirement=True)
    for domain, version in (
        ("source", "_train"),
        ("source", "_val"),
        ("source", "_test"),
        ("target", "_test"),
    )
)

In [10]:
# Hyperparameters
input_size, hidden_size = 48, 128  # expected features in the input x / features in the hidden state h
num_layers, num_classes = 1, 1  # recurrent layers / one output for binary classification
learning_rate, batch_size = 0.001, 1000

# Instantiate the model, move it to the selected device, then create loss and optimizer.
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)
criterion = nn.BCEWithLogitsLoss()  # NOTE(review): never passed to train_model below
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train_model receives the source train and validation splits; the two positional
# slots after the optimizer are left as None.
model, train_loss, val_loss = Trainer.train_model(
    model, optimizer, None, None,
    source_train, source_val, batch_size,
    torch_device, device_package,
    eval_func=Trainer.prefix_weighted_loss,
    max_epoch=100, max_ob_iter=20, score_margin=1,
    print_iter=True,
)

Finished training iteration:  0  with val loss:  678.9203514885486  train loss:  2234.2490897290368
Finished training iteration:  1  with val loss:  920.4566444249734  train loss:  2112.3043458266047
Finished training iteration:  2  with val loss:  662.4384308643331  train loss:  2339.2357575995634
Finished training iteration:  3  with val loss:  608.767094538989  train loss:  2131.1151220662746
Finished training iteration:  4  with val loss:  625.6503372921335  train loss:  2071.4123074762074
Finished training iteration:  5  with val loss:  579.6395537601221  train loss:  2063.8861011068657
Finished training iteration:  6  with val loss:  637.8031599819273  train loss:  2029.531731900665
Finished training iteration:  7  with val loss:  722.5074808455742  train loss:  2093.4012284026417
Finished training iteration:  8  with val loss:  580.7969552823214  train loss:  2133.9583338149396
Finished training iteration:  9  with val loss:  818.0411145042092  train loss:  1994.4771580714441
Fi

In [11]:
# Metrics of the current `model` on the target test split at a 0.5 decision boundary.
eval_model(model=model, test_set=target_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.5109444243307237
f1:  0.8370700333882969
f1 inverse:  0.06385218470627052
Precision:  0.7287456910507171
Precision inverse:  0.43755383290267014
Recall:  0.9832208082534095
Recall inverse:  0.03443892683422877


(0.5109444243307237,
 0.8370700333882969,
 0.06385218470627052,
 0.7287456910507171,
 0.43755383290267014,
 0.9832208082534095,
 0.03443892683422877)

In [12]:
# Metrics of the current `model` on the source test split at a 0.5 decision boundary.
eval_model(model=model, test_set=source_test, torch_device=torch_device,
           device_package=device_package, decision_boundary=0.5)

roc_auc:  0.7322679200988101
f1:  0.8396849362414276
f1 inverse:  0.521006576444372
Precision:  0.8532978408754807
Precision inverse:  0.4973023448848309
Recall:  0.8264995523724261
Recall inverse:  0.5470836662481452


(0.7322679200988101,
 0.8396849362414276,
 0.521006576444372,
 0.8532978408754807,
 0.4973023448848309,
 0.8264995523724261,
 0.5470836662481452)