# Evaluation

In [3]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle
import pandas as pd
import numpy as np

import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset

from src.trainer import CaseDataSet
from src.model import DLModels
from src.trainer import Trainer
from src.utils import TorchUtils

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [15]:
def eval_model(model, test_set, torch_device, device_package, decision_boundary=0.5,
               weighted=True, print_res=False):
    """Evaluate a binary classifier on ``test_set`` and report classification metrics.

    Scores returned by ``Trainer.evaluate_model`` are thresholded at
    ``decision_boundary`` (score >= boundary -> class 1, otherwise class 0) and
    compared with the reference labels. Metrics are computed once for the
    positive class and once for the "inverse" problem in which both labels and
    predictions are flipped (i.e. class 0 treated as the positive class).

    Args:
        model: Model object exposing ``flatten()`` and accepted by
            ``Trainer.evaluate_model``.
        test_set: Dataset passed through to ``Trainer.evaluate_model``.
        torch_device: Device handle passed through to ``Trainer.evaluate_model``.
        device_package: Passed through to ``Trainer.evaluate_model``.
        decision_boundary: Score threshold for predicting class 1.
        weighted: If True, combine the per-class metrics into support-weighted
            averages (weights are the number of reference samples per class)
            and return those instead of the per-class values.
        print_res: If True, print the per-class metrics.

    Returns:
        list: ``[roc_auc, weighted_f1, weighted_precision, weighted_recall]``
        when ``weighted`` is True, otherwise
        ``[roc_auc, f1, f1_inverse, precision, precision_inverse, recall,
        recall_inverse]``.

    Note:
        When ``weighted`` is True the weighted summary is always printed,
        independent of ``print_res``; callers in this notebook rely on that
        output.
    """
    # NOTE(review): `flatten` is a project-defined method; presumably it compacts
    # the RNN weights before inference -- confirm in src.model.
    model.flatten()
    # `res` / `ref` are sequences of tensors (they are concatenated below);
    # the third return value is not used here.
    res, ref, _ = Trainer.evaluate_model(model, test_set, torch_device, device_package)

    # detach()/cpu() make the NumPy conversion safe for CUDA or grad-tracking
    # tensors; atleast_1d keeps a single-sample result indexable after squeeze.
    res_prob = np.atleast_1d(np.squeeze(torch.concat(res).detach().cpu().numpy()))
    ref_class = np.atleast_1d(np.squeeze(torch.concat(ref).detach().cpu().numpy())).astype(int)

    # Threshold in a single pass. Two sequential in-place assignments would
    # re-classify the freshly written zeros as 1 whenever decision_boundary <= 0.
    res_class = (res_prob >= decision_boundary).astype(int)

    # Flipped labels / predictions: class 0 becomes the positive class.
    ref_inverse = 1 - ref_class
    res_inverse = 1 - res_class

    roc_auc = roc_auc_score(ref_class, res_prob)
    f1 = f1_score(ref_class, res_class)
    f1_inverse = f1_score(ref_inverse, res_inverse)
    precision = precision_score(ref_class, res_class)
    precision_inverse = precision_score(ref_inverse, res_inverse)
    recall = recall_score(ref_class, res_class)
    recall_inverse = recall_score(ref_inverse, res_inverse)

    if print_res:
        print("roc_auc: ", roc_auc)
        print("f1: ", f1)
        print("f1 inverse: ", f1_inverse)
        print("Precision: ", precision)
        print("Precision inverse: ", precision_inverse)
        print("Recall: ", recall)
        print("Recall inverse: ", recall_inverse)

    if not weighted:
        return [roc_auc, f1, f1_inverse, precision, precision_inverse, recall, recall_inverse]

    # Support-weighted average of the two per-class scores.
    total_class_num = ref_class.shape[0]
    pos_class_num = ref_class.sum()
    neg_class_num = total_class_num - pos_class_num
    weighted_f1 = (f1 * pos_class_num + f1_inverse * neg_class_num) / total_class_num
    weighted_precision = (precision * pos_class_num
                          + precision_inverse * neg_class_num) / total_class_num
    weighted_recall = (recall * pos_class_num + recall_inverse * neg_class_num) / total_class_num
    print("roc_auc: ", roc_auc)
    print("weighted f1: ", weighted_f1)
    print("weighted_precision: ", weighted_precision)
    print("weighted_recall: ", weighted_recall)
    return [roc_auc, weighted_f1, weighted_precision, weighted_recall]

In [168]:
# Build the "641620split" test sets (sorted test data, `_st` embeddings) for the
# target and the source input data; the two calls differ only in `input_data`.
target_test_set, source_test_set = (
    CaseDataSet.CaseDataset(
        split_pattern="641620split",
        input_data=input_name,
        data_version="_test_sorted",
        embedding_version="_st",
        earliness_requirement=True,
    )
    for input_name in ("target", "source")
)

In [180]:
# Load the saved model object for the 641620split experiment into `model`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model = torch.load("../../Model/641620split/LSTM/LSTM_T_h512_l1_st_b3.LSTM")

In [201]:
# Re-create the "641620split" target test set (sorted test data, `_st` embeddings).
# The arguments are the same as in the earlier cell that first defines
# `target_test_set`.
target_test_set = CaseDataSet.CaseDataset(
    split_pattern="641620split",
    input_data="target",
    data_version="_test_sorted",
    embedding_version="_st",
    earliness_requirement=True,
)


In [203]:
# Replace `model` with a checkpoint from the "scale" model folder.
# NOTE(review): this file name ends in "0.01_.LSTM" with no run index, unlike the
# "..._0.01_1.LSTM" style used by the scale-test cells further down -- confirm
# the file exists under this name.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model = torch.load("../../Model/scale/LSTM/LSTM_T_h512_l1_st_0.01_.LSTM")

In [186]:
# Evaluate the currently loaded `model` on the source test set and show the
# per-class (non-weighted) metrics.
# NOTE(review): the recorded output of this cell is a RuntimeError about the
# input size (expected 48, got 385); presumably the `model` in the kernel at the
# time did not match this dataset's embedding -- confirm which checkpoint
# belongs here.
torch_device, device_package = TorchUtils.get_torch_device()
eval_model(
    model=model,
    test_set=source_test_set,
    torch_device=torch_device,
    device_package=device_package,
    decision_boundary=0.5,
    weighted=False,
    print_res=True,
)

RuntimeError: input.size(-1) must be equal to input_size. Expected 48, got 385

In [204]:
# Evaluate the currently loaded `model` on the target test set and show the
# per-class (non-weighted) metrics.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error,
# so no metrics were produced by this run.
torch_device, device_package = TorchUtils.get_torch_device()
eval_model(
    model=model,
    test_set=target_test_set,
    torch_device=torch_device,
    device_package=device_package,
    decision_boundary=0.5,
    weighted=False,
    print_res=True,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 514.00 MiB (GPU 0; 23.66 GiB total capacity; 1.55 GiB already allocated; 438.44 MiB free; 2.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [36]:
# "811split" experiment: build the target test set with `_w2v` embeddings, load
# the matching LSTM checkpoint and show the per-class (non-weighted) metrics.
test_set = CaseDataSet.CaseDataset(
    split_pattern="811split",
    input_data="target",
    data_version="_test",
    embedding_version="_w2v",
    earliness_requirement=True,
)

# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model = torch.load("../../Model/811split/LSTM/LSTM_S_h128_l1_w2v.LSTM")

torch_device, device_package = TorchUtils.get_torch_device()
eval_model(
    model=model,
    test_set=test_set,
    torch_device=torch_device,
    device_package=device_package,
    decision_boundary=0.5,
    weighted=False,
    print_res=True,
)

roc_auc:  0.5922810364410246
f1:  0.4702839183558161
f1 inverse:  0.4069542183917134
Precision:  0.8654471016376397
Precision inverse:  0.2692941607367099
Recall:  0.322864138867339
Recall inverse:  0.8325380135878356


(0.5922810364410246,
 0.4702839183558161,
 0.4069542183917134,
 0.8654471016376397,
 0.2692941607367099,
 0.322864138867339,
 0.8325380135878356)

## Scale Test Evaluation

In [8]:
# Shared settings for the scale-test cells below.
earliness_requirement = True
folder_path = "../../Data/Training/scale/"
torch_device, device_package = TorchUtils.get_torch_device()

# One "_test_0.1" target test set per embedding variant; the three datasets
# differ only in `embedding_version` and are built in the order
# one-hot, w2v, st.
onehot_test_set, w2v_test_set, st_test_set = (
    CaseDataSet.CaseDataset(
        project_data_path=folder_path,
        split_pattern="",
        input_data="target",
        data_version="_test_0.1",
        embedding_version=embedding_suffix,
        earliness_requirement=earliness_requirement,
    )
    for embedding_suffix in ("_onehot", "_w2v", "_st")
)

In [21]:
# Evaluate the three saved runs of the "0.01" scale checkpoint on the `_st` test set.
# NOTE(review): the variable is named `onehot_model`, but the checkpoint path and
# the test set both carry the `st` tag.
for i in ("1", "2", "3"):
    onehot_model = torch.load(f"../../Model/scale/LSTM/LSTM_T_h512_l1_st_0.01_{i}.LSTM")

    # With weighted=True, eval_model prints the weighted summary itself.
    eval_model(onehot_model, st_test_set, torch_device, device_package,
               decision_boundary=0.5,
               weighted=True,
               print_res=False)
    print("===============================")

roc_auc:  0.5175080285380986
weighted f1:  0.312198051705805
weighted_precision:  0.7264766759561924
weighted_recall:  0.37061929908785407
roc_auc:  0.5512496656698703
weighted f1:  0.39574774242639305
weighted_precision:  0.6918208893604325
weighted_recall:  0.41569058816400917
roc_auc:  0.5914379810321482
weighted f1:  0.369838378144881
weighted_precision:  0.7091945116261374
weighted_recall:  0.4023323343385427


In [22]:
# Evaluate the three saved runs of the "0.05" scale checkpoint on the `_st` test set.
# NOTE(review): the variable is named `onehot_model`, but the checkpoint path and
# the test set both carry the `st` tag.
for i in ("1", "2", "3"):
    onehot_model = torch.load(f"../../Model/scale/LSTM/LSTM_T_h512_l1_st_0.05_{i}.LSTM")

    # With weighted=True, eval_model prints the weighted summary itself.
    eval_model(onehot_model, st_test_set, torch_device, device_package,
               decision_boundary=0.5,
               weighted=True,
               print_res=False)
    print("===============================")

roc_auc:  0.5753998742838414
weighted f1:  0.6227065363623507
weighted_precision:  0.601067508094824
weighted_recall:  0.6795137707470088
roc_auc:  0.578643172315416
weighted f1:  0.618504811563308
weighted_precision:  0.597446957601968
weighted_recall:  0.7011312101198328
roc_auc:  0.5442579729771531
weighted f1:  0.6247718903482914
weighted_precision:  0.6041613393570541
weighted_recall:  0.6752350278957254


In [23]:
# Evaluate the three saved runs of the "0.1" scale checkpoint on the `_st` test set.
# NOTE(review): the variable is named `onehot_model`, but the checkpoint path and
# the test set both carry the `st` tag.
for i in ("1", "2", "3"):
    onehot_model = torch.load(f"../../Model/scale/LSTM/LSTM_T_h512_l1_st_0.1_{i}.LSTM")

    # With weighted=True, eval_model prints the weighted summary itself.
    eval_model(onehot_model, st_test_set, torch_device, device_package,
               decision_boundary=0.5,
               weighted=True,
               print_res=False)
    print("===============================")

roc_auc:  0.6242836105797538
weighted f1:  0.45274668460436357
weighted_precision:  0.7102840687353728
weighted_recall:  0.457513202112338
roc_auc:  0.6294831762752706
weighted f1:  0.4526514017268689
weighted_precision:  0.7272652874133252
weighted_recall:  0.460337731706976
roc_auc:  0.61830098603866
weighted f1:  0.47726935907768736
weighted_precision:  0.7068466721410711
weighted_recall:  0.47480528923462706


In [24]:
# Evaluate the three saved runs of the "0.5" scale checkpoint on the `_st` test set.
# NOTE(review): the variable is named `onehot_model`, but the checkpoint path and
# the test set both carry the `st` tag.
for i in ("1", "2", "3"):
    onehot_model = torch.load(f"../../Model/scale/LSTM/LSTM_T_h512_l1_st_0.5_{i}.LSTM")

    # With weighted=True, eval_model prints the weighted summary itself.
    eval_model(onehot_model, st_test_set, torch_device, device_package,
               decision_boundary=0.5,
               weighted=True,
               print_res=False)
    print("===============================")

roc_auc:  0.6140875915613313
weighted f1:  0.5302700443381574
weighted_precision:  0.6797786663222483
weighted_recall:  0.5122652634130199
roc_auc:  0.6301521290353005
weighted f1:  0.5756143749195455
weighted_precision:  0.6746347592443788
weighted_recall:  0.5527129000834309
roc_auc:  0.6441502602298125
weighted f1:  0.6513411741836981
weighted_precision:  0.6586725547586753
weighted_recall:  0.6453304373359932


In [25]:
# Sweep every scale suffix and, for each, the three saved runs; this covers the
# same checkpoints as the four per-scale cells above in a single loop.
# NOTE(review): the variable is named `onehot_model`, but the checkpoint path and
# the test set both carry the `st` tag.
for scale in ("0.01", "0.05", "0.1", "0.5"):
    for i in ("1", "2", "3"):
        onehot_model = torch.load(f"../../Model/scale/LSTM/LSTM_T_h512_l1_st_{scale}_{i}.LSTM")

        # With weighted=True, eval_model prints the weighted summary itself.
        eval_model(onehot_model, st_test_set, torch_device, device_package,
                   decision_boundary=0.5,
                   weighted=True,
                   print_res=False)
        print("===============================")
    print("finished test with scale: ", scale)

roc_auc:  0.5175080285380986
weighted f1:  0.312198051705805
weighted_precision:  0.7264766759561924
weighted_recall:  0.37061929908785407
roc_auc:  0.5512496656698703
weighted f1:  0.39574774242639305
weighted_precision:  0.6918208893604325
weighted_recall:  0.41569058816400917
roc_auc:  0.5914379810321482
weighted f1:  0.369838378144881
weighted_precision:  0.7091945116261374
weighted_recall:  0.4023323343385427
finished test with scale:  0.01
roc_auc:  0.5753998742838414
weighted f1:  0.6227065363623507
weighted_precision:  0.601067508094824
weighted_recall:  0.6795137707470088
roc_auc:  0.578643172315416
weighted f1:  0.618504811563308
weighted_precision:  0.597446957601968
weighted_recall:  0.7011312101198328
roc_auc:  0.5442579729771531
weighted f1:  0.6247718903482914
weighted_precision:  0.6041613393570541
weighted_recall:  0.6752350278957254
finished test with scale:  0.05
roc_auc:  0.6242836105797538
weighted f1:  0.45274668460436357
weighted_precision:  0.7102840687353728
we