# Experiments with temporally sorted data split

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle
import pandas as pd
import numpy as np

import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset

from src.trainer import CaseDataSet
from src.model import DLModels
from src.trainer import Trainer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import importlib.util
# Choose the compute backend. Priority: CUDA, then Apple MPS, then plain CPU.
# `device_package` is the matching torch sub-module (torch.cuda / torch.mps / torch.cpu).
torch_device, device_package = "cpu", torch.cpu

# Only touch torch.backends.mps when this torch build actually ships it.
mps_module_present = importlib.util.find_spec("torch.backends.mps") is not None
if mps_module_present and torch.backends.mps.is_available():
    torch_device, device_package = torch.device("mps"), torch.mps

# CUDA is checked last so that it overrides any earlier choice.
if torch.cuda.is_available():
    torch_device, device_package = torch.device("cuda"), torch.cuda

torch_device

device(type='cuda')

In [2]:
def train_scale_test_model(datasets_dict, model_hyper_para, trainer_hyper_para,
                           scale, embedding, print_iter=False, file_suffix="",
                           split=None):
    """Train a SimpleLSTM on the target data of one scale/embedding pair and save it.

    The trained model is written to
    ``../../Model/<split>/LSTM/<model_name>_<file_suffix>.LSTM`` and the
    training/validation loss history to the matching ``..._stat.pkl`` file.

    Parameters
    ----------
    datasets_dict : dict
        Nested mapping indexed as ``datasets_dict[scale][embedding]["target"]``,
        whose value holds the ``"train"`` and ``"val"`` data sets.
    model_hyper_para : dict
        Per-embedding model settings; ``model_hyper_para[embedding]`` must
        provide ``"input_size"``, ``"hidden_size"`` and ``"num_layers"``.
    trainer_hyper_para : dict
        Must provide ``"learning_rate"``, ``"batch_size"``, ``"max_epoch"``,
        ``"max_ob_iter"`` and ``"score_margin"``.
    scale : str
        Key of the data scale to train on.
    embedding : str
        Key of the embedding to train on.
    print_iter : bool, optional
        Forwarded to ``Trainer.train_model``.
    file_suffix : str, optional
        Appended to the output file names.
    split : str, optional
        Name of the sub-folder of ``../../Model/`` to write into. When omitted,
        a module-level variable called ``split`` is used, which is what the
        previous version of this function read implicitly.

    Raises
    ------
    NameError
        If ``split`` is neither passed nor defined globally. This is raised
        before training starts rather than after it has finished.
    """
    # Resolve the output folder first: the original only touched `split` when
    # saving, so a missing value was discovered after the full training run.
    if split is None:
        split = globals().get("split")
    if split is None:
        raise NameError("name 'split' is not defined: pass split=... "
                        "or define a global `split` before calling")

    current_model_para = model_hyper_para[embedding]
    target_data_sets = datasets_dict[scale][embedding]["target"]

    # Backend priority: CUDA, then MPS, then CPU.
    torch_device = "cpu"
    device_package = torch.cpu
    if importlib.util.find_spec("torch.backends.mps") is not None:
        if torch.backends.mps.is_available():
            torch_device = torch.device("mps")
            device_package = torch.mps
    if torch.cuda.is_available():
        torch_device = torch.device("cuda")
        device_package = torch.cuda

    # Instantiate the model (last argument 1: single output unit).
    model = DLModels.SimpleLSTM(current_model_para["input_size"],
                                current_model_para["hidden_size"],
                                current_model_para["num_layers"],
                                1).to(torch_device)

    optimizer = torch.optim.Adam(model.parameters(), lr=trainer_hyper_para["learning_rate"])

    model_name = ("LSTM_T_h" + str(current_model_para["hidden_size"])
                  + "_l" + str(current_model_para["num_layers"])
                  + "_" + embedding + "_" + scale)
    output_dir = "../../Model/" + split + "/LSTM/"
    # Create the folder up front so that saving cannot fail on a missing path
    # once training is done.
    os.makedirs(output_dir, exist_ok=True)

    print(torch_device)
    print("training model with target data")
    trained_model, train_loss, val_loss = Trainer.train_model(model, optimizer,
                                                              None, None,
                                                              target_data_sets["train"],
                                                              target_data_sets["val"],
                                                              trainer_hyper_para["batch_size"],
                                                              torch_device,
                                                              device_package,
                                                              Trainer.prefix_weighted_loss,
                                                              trainer_hyper_para["max_epoch"],
                                                              trainer_hyper_para["max_ob_iter"],
                                                              trainer_hyper_para["score_margin"],
                                                              print_iter=print_iter)

    torch.save(trained_model, output_dir + model_name + "_" + file_suffix + ".LSTM")

    # One row per recorded loss value; presumably one per epoch — TODO confirm
    # against Trainer.train_model.
    training_stat = pd.DataFrame(columns=["TrainingLoss", "ValidationLoss"],
                                 data=np.hstack([train_loss.reshape((-1, 1)),
                                                 val_loss.reshape((-1, 1))]))
    training_stat.to_pickle(output_dir + model_name + "_" + file_suffix + "_stat.pkl")

    print("Finished training model with target data")

In [3]:
# Settings consumed by the training loop.
trainer_hyper_para = {"max_epoch": 100,
                      "max_ob_iter": 20,
                      "score_margin": 1,
                      "num_class": 1,
                      "num_layers": 1,
                      "learning_rate": 1e-3,
                      "batch_size": 4000,
                      "training_loss_func": "CCM",
                      "eval_loss_func": "CCM"}

# LSTM dimensions, keyed by embedding type.
model_hyper_para = {"w2v": {"input_size": 51,
                            "hidden_size": 128,
                            "num_layers": 1},
                    "st": {"input_size": 385,
                           "hidden_size": 512,
                           "num_layers": 1},
                    "onehot": {"input_size": 48,
                               "hidden_size": 128,
                               "num_layers": 1}}


# Axes of the nested data-set dictionary, outermost first.
scale_ratio = ["0.01", "0.05", "0.1", "0.5"]
data_set_type = ["source", "target"]
data_set_cat = ["train", "val", "test"]
embedding_type = ["w2v", "st", "onehot"]
# The outermost axis is the scale ratio; the previous `split_pattern` name was
# never defined in this notebook and raised NameError.
dataset_list_index = [scale_ratio, embedding_type, data_set_type, data_set_cat]

earliness_requirement = True
folder_path = "../../Data/Training/scale/"

In [None]:
# Build the nested lookup data_sets_list[scale][embedding][type][category],
# holding one CaseDataset per combination.
data_sets_list = {}
for scale_num in scale_ratio:
    data_scale_list = {}
    for embedding in embedding_type:
        data_embedding_list = {}
        for d_type in data_set_type:
            data_type_list = {}
            for d_cat in data_set_cat:
                # Version tags are built as "_<category><scale>" and "_<embedding>".
                data_version = "_" + d_cat + scale_num
                embedding_version = "_" + embedding
                case_data_set = CaseDataSet.CaseDataset(split_pattern="",
                                                        input_data=d_type,
                                                        data_version=data_version,
                                                        embedding_version=embedding_version,
                                                        earliness_requirement=earliness_requirement)

                data_type_list[d_cat] = case_data_set
            data_embedding_list[d_type] = data_type_list
        data_scale_list[embedding] = data_embedding_list
    # Store the dict built for this scale; the previous `data_split_list` name
    # was never defined and raised NameError.
    data_sets_list[scale_num] = data_scale_list

In [63]:
# NOTE(review): `train_source_model` is not defined in any cell visible in this
# notebook (only `train_scale_test_model` is) — confirm where it comes from
# before relying on Restart & Run All.
train_source_model(
    data_sets_list,
    model_hyper_para,
    trainer_hyper_para,
    split="641620split",
    embedding="w2v",
    print_iter=True,
    file_suffix="b4",
)

cuda
training model with source data


KeyboardInterrupt: 

In [None]:
# NOTE(review): `train_target_model` is not defined in any cell visible in this
# notebook (only `train_scale_test_model` is) — confirm where it comes from
# before relying on Restart & Run All.
train_target_model(
    data_sets_list,
    model_hyper_para,
    trainer_hyper_para,
    split="641620split",
    embedding="onehot",
    print_iter=True,
    file_suffix="b2",
)

cuda
training model with target data
Finished training iteration:  0  with val loss:  1605.7518292367765  train loss:  4641.883738829023
Finished training iteration:  1  with val loss:  1933.3408428704852  train loss:  4621.871299030701
Finished training iteration:  2  with val loss:  2882.726259362032  train loss:  4748.084772688445
Finished training iteration:  3  with val loss:  2797.034042922885  train loss:  4961.521449720645
Finished training iteration:  4  with val loss:  1851.7911332158924  train loss:  5427.397107133168
Finished training iteration:  5  with val loss:  1483.8515063440937  train loss:  4834.286608756785
Finished training iteration:  6  with val loss:  1919.6636668705642  train loss:  4720.191851888414
Finished training iteration:  7  with val loss:  1605.63913459526  train loss:  4804.7747296750495
Finished training iteration:  8  with val loss:  1535.8343948187937  train loss:  4532.64257838454
Finished training iteration:  9  with val loss:  1645.7882266732217