# Deep learning model trainer

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle
import pandas as pd
import numpy as np

import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset

from src.trainer import CaseDataSet
from src.model import DLModels

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import importlib.util
torch_device = "cpu"
device_package = torch.cpu
if importlib.util.find_spec("torch.backends.mps") is not None:
    if torch.backends.mps.is_available():
        torch_device = torch.device("mps")
        device_package = torch.mps
if torch.cuda.is_available():
    torch_device = torch.device("cuda")
    device_package = torch.cuda
    
torch_device

device(type='cuda')

In [2]:
def prefix_weighted_loss(loss_list, sample_num):
    return np.sum(loss_list * sample_num) / np.sum(sample_num)

def BCE_counter_imblance(y):
    class_weight = 1
    if (1-y).sum() * y.sum() > 0:
        class_weight = ((1-y).sum()/y.sum())
    pos_weight = torch.ones(1).to(torch_device) * class_weight
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    return criterion


def forward_model_batch(model, x, y, optimizer, criterion, loss_prefix, training=True):
    if criterion is None:
        criterion = BCE_counter_imblance(y)
    outputs = model(x)
    # Backward and optimize
    if training:
        loss = criterion(outputs, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    else:
        loss = criterion(outputs, y)
    loss_prefix = loss_prefix + loss.item()*x.shape[0]
    return loss_prefix
    
    
def train_model_epoch(model, training_set, optimizer, criterion, torch_device, batch_size=50, training=True):
    training_data_set = training_set
    batch_size = batch_size
    loss_prefix_list = []
    sample_num_list = []
    for prefix_len in range(1, training_data_set.max_case_len):
        loss_prefix = 0
        training_data_set.set_prefix_length(prefix_len)
        training_data_set.shuffle_data()
        input_data = training_data_set[:]
        if input_data is None:
            # print("Max length reached, abort")
            break
        sample_num = input_data[0].shape[0]
        # print("Starting training at prefix length: ", prefix_len, " with sample num: ", sample_num)
        sample_num_list.append(sample_num)

        batch_num = int(sample_num / batch_size)
        for i in range(batch_num):
            x = input_data[0][int(batch_size * i) : int(batch_size * (i+1))].float().to(torch_device)
            y = input_data[1][int(batch_size * i) : int(batch_size * (i+1))].float().to(torch_device)
            loss_prefix = forward_model_batch(model, x, y, optimizer, criterion, loss_prefix, training=True)

        if sample_num > batch_size * batch_num:
            x = input_data[0][batch_size * batch_num :].float().to(torch_device)
            y = input_data[1][batch_size * batch_num :].float().to(torch_device)
            loss_prefix = forward_model_batch(model, x, y, optimizer, criterion, loss_prefix, training=True)

        loss_prefix_list.append(loss_prefix)
    return np.array(loss_prefix_list), np.array(sample_num_list)   


def train_model(model, optimizer, criterion, criterion_eval, training_set, val_set,
                batch_size, torch_device, device_package, eval_func=prefix_weighted_loss,
                max_epoch=100, max_ob_iter=20, score_margin=1e-4, print_iter=False):
    train_loss_list = []
    val_loss_list = []
    score = 1e5
    best_iter = 0
    best_model = None
    for iter_epoch in range(max_epoch):
        device_package.empty_cache()
        loss_train, sample_num_train = train_model_epoch(model, training_set, batch_size=batch_size, optimizer=optimizer,
                                                         criterion=criterion, torch_device=torch_device)
        device_package.empty_cache()
        loss_val, sample_num_val = train_model_epoch(model, val_set, batch_size=batch_size, optimizer=optimizer,
                                                     criterion=criterion_eval, torch_device=torch_device, training=False)

        score_train = eval_func(loss_train, sample_num_train)
        score_val = eval_func(loss_val, sample_num_val)
        train_loss_list.append(score_train)
        val_loss_list.append(score_val)

        if score_val < (score - score_margin):
            score = score_val
            best_model = copy.deepcopy(model)
            best_iter = iter_epoch

        if iter_epoch > best_iter + max_ob_iter:
            break
        if print_iter:
            print("Finished training iteration: ", iter_epoch, " with val loss: ", score_val, " train loss: ", score_train)
    device_package.empty_cache()
    return best_model, np.array(train_loss_list), np.array(val_loss_list)

In [3]:
def evaluate_model(model, test_set, torch_device, device_package):
    res_list = []
    ref_list = []
    sample_num_list = []
    device_package.empty_cache()
    for prefix_len in range(1, test_set.max_case_len):
        test_set.set_prefix_length(prefix_len)
        input_data = test_set[:]
        if input_data is None:
            # print("Max length reached, abort")
            break
        sample_num = input_data[0].shape[0]
        sample_num_list.append(sample_num)
        x = input_data[0].float().to(torch_device)
        y = input_data[1].float().to(torch_device)
        ref_list.append(y.cpu())
        outputs = model(x)
        prob = torch.sigmoid(outputs).detach().cpu()
        res_list.append(prob)
        
    device_package.empty_cache()
    return res_list, ref_list, sample_num_list

In [4]:
source_train = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="source", data_version="_train_random_all",
                                       embedding_version="_w2v", earliness_requirement=True)
source_val = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="source", data_version="_val_random_all",
                                       embedding_version="_w2v", earliness_requirement=True)
source_test = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="source", data_version="_test_random_all",
                                       embedding_version="_w2v", earliness_requirement=True)
target_test = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="target", data_version="_test_random_all",
                                       embedding_version="_w2v", earliness_requirement=True)

In [9]:
# Hyperparameters
input_size = 51  # The number of expected features in the input x
hidden_size = 128  # The number of features in the hidden state h
num_layers = 1  # Number of recurrent layers
num_classes = 1  # For binary classification
learning_rate = 0.001
batch_size = 1000

# Instantiate the model
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes).to(torch_device)
# model = TransformerEncoderModel(43, 64, 8, 12, True).to(torch_device)
# Define the loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [9]:
model, train_loss, val_loss = train_model(model, optimizer, None, None, source_train, source_val, batch_size,
                                          torch_device, device_package, eval_func=prefix_weighted_loss,
                                          max_epoch=50, max_ob_iter=20, score_margin=1e-4, print_iter=True)

Finished training iteration:  0  with val loss:  1454.1883904547942  train loss:  2663.568985870707
Finished training iteration:  1  with val loss:  1165.9653913483737  train loss:  2547.429804242685
Finished training iteration:  2  with val loss:  1298.7271898505048  train loss:  2661.9006989695536
Finished training iteration:  3  with val loss:  1183.2093943849352  train loss:  2807.2031783791126
Finished training iteration:  4  with val loss:  1375.2468248252094  train loss:  2730.1771333482816
Finished training iteration:  5  with val loss:  1286.4815297279433  train loss:  2781.6547341560045
Finished training iteration:  6  with val loss:  1300.7628156539072  train loss:  2760.6526457973646
Finished training iteration:  7  with val loss:  1458.2020619033835  train loss:  2747.6933279300806
Finished training iteration:  8  with val loss:  1418.5225154881746  train loss:  2653.8402600322347
Finished training iteration:  9  with val loss:  1350.0336661204046  train loss:  2763.353545

In [14]:
model.flatten()
res, ref, num = evaluate_model(model, source_test, torch_device, device_package)
res_p = np.squeeze(torch.concat(res).numpy())
res_c = copy.copy(res_p)
res_c[res_c < 0.5] = 0
res_c[res_c >= 0.5] = 1
ref_p = np.squeeze(torch.concat(ref).numpy()).astype(int)
print("roc_auc: ", roc_auc_score(ref_p, res_p))
print("roc_auc inverse: ", roc_auc_score(1-ref_p, 1-res_p))
print("f1: ", f1_score(ref_p, res_c))
print("f1 inverse: ", f1_score(1-ref_p, 1-res_c))
print("Precision: ", precision_score(ref_p, res_c))
print("Precision inverse: ", precision_score(1-ref_p, 1-res_c))
print("Recall: ", recall_score(ref_p, res_c))
print("Recall inverse: ", recall_score(1-ref_p, 1-res_c))

roc_auc:  0.7885650802533464
roc_auc inverse:  0.7885650802533464
f1:  0.8829895112833165
f1 inverse:  0.49274981956564534
Precision:  0.8473985416727188
Precision inverse:  0.602438633081983
Recall:  0.921701213346815
Recall inverse:  0.4168516873889876


In [13]:
model.flatten()
res, ref, num = evaluate_model(model, target_test, torch_device, device_package)
res_p = np.squeeze(torch.concat(res).numpy())
res_c = copy.copy(res_p)
res_c[res_c < 0.5] = 0
res_c[res_c >= 0.5] = 1
ref_p = np.squeeze(torch.concat(ref).numpy()).astype(int)
print("roc_auc: ", roc_auc_score(ref_p, res_p))
print("roc_auc inverse: ", roc_auc_score(1-ref_p, 1-res_p))
print("f1: ", f1_score(ref_p, res_c))
print("f1 inverse: ", f1_score(1-ref_p, 1-res_c))
print("Precision: ", precision_score(ref_p, res_c))
print("Precision inverse: ", precision_score(1-ref_p, 1-res_c))
print("Recall: ", recall_score(ref_p, res_c))
print("Recall inverse: ", recall_score(1-ref_p, 1-res_c))

roc_auc:  0.5657107246633903
roc_auc inverse:  0.5657108244902032
f1:  0.7263002197052436
f1 inverse:  0.3176587817896788
Precision:  0.7081897707301089
Precision inverse:  0.3392898301178193
Recall:  0.7453612503069856
Recall inverse:  0.29862055798190973


In [10]:
source_train_st = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="source", data_version="_train",
                                       embedding_version="_st", time_feature=False, earliness_requirement=True)
source_val_st = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="source", data_version="_val",
                                       embedding_version="_st", time_feature=False, earliness_requirement=True)
source_test_st = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="source", data_version="_test",
                                       embedding_version="_st", time_feature=False, earliness_requirement=True)
target_test_st = CaseDataSet.CaseDataset(split_pattern="641620split", input_data="target", data_version="_test",
                                       embedding_version="_st", time_feature=False, earliness_requirement=True)

In [6]:
# Hyperparameters
input_size = 384  # The number of expected features in the input x
hidden_size = 512  # The number of features in the hidden state h
num_layers = 1  # Number of recurrent layers
num_classes = 1  # For binary classification
learning_rate = 0.001
batch_size = 1000

# Instantiate the model
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes).to(torch_device)
# model = TransformerEncoderModel(43, 64, 8, 12, True).to(torch_device)
# Define the loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model, train_loss, val_loss = train_model(model, optimizer, None, None, source_train_st, source_val_st, batch_size,
                                          torch_device, device_package, eval_func=prefix_weighted_loss,
                                          max_epoch=100, max_ob_iter=20, score_margin=1, print_iter=True)

Finished training iteration:  0  with val loss:  994.882828364117  train loss:  1988.270744518225
Finished training iteration:  1  with val loss:  1009.9968720959209  train loss:  2012.4854504956372
Finished training iteration:  2  with val loss:  1126.104799346354  train loss:  2105.8544062760798
Finished training iteration:  3  with val loss:  986.4861330248463  train loss:  2062.4674202035994
Finished training iteration:  4  with val loss:  1007.3421818100113  train loss:  2116.763931458579
Finished training iteration:  5  with val loss:  992.2425909738512  train loss:  2138.943605089617
Finished training iteration:  6  with val loss:  955.7647128416767  train loss:  2050.9211195979947
Finished training iteration:  7  with val loss:  942.04925483356  train loss:  2017.2175402620571
Finished training iteration:  8  with val loss:  973.3445737172824  train loss:  2056.1870024174227
Finished training iteration:  9  with val loss:  955.067083382819  train loss:  2162.64045098369
Finishe

In [7]:
model.flatten()
res, ref, num = evaluate_model(model, source_test_st, torch_device, device_package)
res_p = np.squeeze(torch.concat(res).numpy())
res_c = copy.copy(res_p)
res_c[res_c < 0.5] = 0
res_c[res_c >= 0.5] = 1
ref_p = np.squeeze(torch.concat(ref).numpy()).astype(int)
print("roc_auc: ", roc_auc_score(ref_p, res_p))
print("f1: ", f1_score(ref_p, res_c))
print("f1 inverse: ", f1_score(1-ref_p, 1-res_c))
print("Precision: ", precision_score(ref_p, res_c))
print("Precision inverse: ", precision_score(1-ref_p, 1-res_c))
print("Recall: ", recall_score(ref_p, res_c))
print("Recall inverse: ", recall_score(1-ref_p, 1-res_c))

roc_auc:  0.6996676141731326
f1:  0.866321506040311
f1 inverse:  0.1733085283462254
Precision:  0.7764659136061759
Precision inverse:  0.609504132231405
Recall:  0.9796956132497762
Recall inverse:  0.10101586576874785


In [8]:
model.flatten()
res, ref, num = evaluate_model(model, target_test_st, torch_device, device_package)
res_p = np.squeeze(torch.concat(res).numpy())
res_c = copy.copy(res_p)
res_c[res_c < 0.5] = 0
res_c[res_c >= 0.5] = 1
ref_p = np.squeeze(torch.concat(ref).numpy()).astype(int)
print("roc_auc: ", roc_auc_score(ref_p, res_p))
print("f1: ", f1_score(ref_p, res_c))
print("f1 inverse: ", f1_score(1-ref_p, 1-res_c))
print("Precision: ", precision_score(ref_p, res_c))
print("Precision inverse: ", precision_score(1-ref_p, 1-res_c))
print("Recall: ", recall_score(ref_p, res_c))
print("Recall inverse: ", recall_score(1-ref_p, 1-res_c))

roc_auc:  0.4622537910723212
f1:  0.8357323795131602
f1 inverse:  0.015988277434060567
Precision:  0.7243315822948811
Precision inverse:  0.20314439387670666
Recall:  0.9876275944471924
Recall inverse:  0.008321610765554294


In [47]:
res_c[res_c == 0].shape

(7199,)

In [48]:
ref_p[ref_p == 0].shape

(59003,)

In [23]:
res_c.shape

(275179,)