# Experiments with a random split of the data

In [7]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pickle as pickle
import pandas as pd
import numpy as np

import copy

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset

from src.trainer import CaseDataSet
from src.model import DLModels
from src.trainer import Trainer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import importlib.util
# Select the compute device together with the torch backend module that
# belongs to it. Order of precedence (lowest to highest): CPU, MPS, CUDA.
torch_device, device_package = "cpu", torch.cpu

# MPS is only considered when the backend module can be located at all.
if (
    importlib.util.find_spec("torch.backends.mps") is not None
    and torch.backends.mps.is_available()
):
    torch_device, device_package = torch.device("mps"), torch.mps

# CUDA is checked last so that it overrides an earlier MPS/CPU choice.
if torch.cuda.is_available():
    torch_device, device_package = torch.device("cuda"), torch.cuda

# Bare expression: displays the chosen device as the notebook cell output.
torch_device

device(type='cuda')

In [8]:
# Axes of the experiment grid. The loading cell below iterates over every
# combination of these four lists and builds one CaseDataset per combination.

# Forwarded to CaseDataset as `split_pattern`.
# NOTE(review): presumably train/val/test proportions encoded in the name — confirm.
split_pattern = [
    "811split",
    "641620split",
]

# Forwarded to CaseDataset as `input_data`.
data_set_type = [
    "source",
    "target",
]

# Partition name; used to build the `data_version` suffix ("_<cat>_random").
data_set_cat = [
    "train",
    "val",
    "test",
]

# Embedding name; used to build the `embedding_version` suffix ("_<embedding>").
embedding_type = [
    "w2v",
    "st",
    "onehot",
]

In [9]:
# Load one CaseDataset for every (split pattern, embedding, data set type,
# partition) combination. The result is a nested dict addressed as
#     data_sets_list[s_pattern][embedding][d_type][d_cat]
earliness_requirement = True
folder_path = "../../Data/Training/"  # NOTE(review): not referenced within this cell

data_sets_list = {}
for s_pattern in split_pattern:
    data_split_list = {}  # embedding -> {d_type -> {d_cat -> dataset}}

    for embedding in embedding_type:
        # The suffix depends only on the embedding, so build it once here.
        embedding_version = f"_{embedding}"
        data_embedding_list = {}  # d_type -> {d_cat -> dataset}

        for d_type in data_set_type:
            data_type_list = {}  # d_cat -> dataset

            for d_cat in data_set_cat:
                data_version = f"_{d_cat}_random"
                case_data_set = CaseDataSet.CaseDataset(
                    split_pattern=s_pattern,
                    input_data=d_type,
                    data_version=data_version,
                    embedding_version=embedding_version,
                    earliness_requirement=earliness_requirement,
                )
                data_type_list[d_cat] = case_data_set

            data_embedding_list[d_type] = data_type_list

        data_split_list[embedding] = data_embedding_list

    data_sets_list[s_pattern] = data_split_list

In [10]:
# NOTE(review): the loop body only rebinds two names, so once the loop ends
# `data_set_embedding` holds the "w2v" entry of the LAST item in
# `split_pattern`; nothing is collected per split. Looks unfinished — confirm.
for s_pattern in split_pattern:
    # Everything loaded for this split pattern, keyed by embedding name.
    data_set_split = data_sets_list[s_pattern] 
    # Narrow down to the "w2v" embedding (a dict keyed by data set type).
    data_set_embedding = data_set_split["w2v"]
        
        


In [11]:
# ---- Model and optimisation settings ----
input_size = 51        # number of features expected in each input x
hidden_size = 128      # number of features in the hidden state h
num_layers = 1         # number of recurrent layers
num_classes = 1        # one output unit for binary classification
learning_rate = 0.001
batch_size = 1000

# ---- Model, loss and optimiser ----
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)

# NOTE(review): `criterion` is created here but is not among the arguments
# passed to `Trainer.train_model` below — confirm whether it is still needed.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# ---- Training ----
# NOTE(review): `source_train`, `source_val` and `prefix_weighted_loss` are not
# assigned anywhere in the visible part of this notebook; make sure they are
# defined before running this cell. The meaning of the two positional `None`
# arguments is determined by `Trainer.train_model`, which is not shown here.
model, train_loss, val_loss = Trainer.train_model(
    model,
    optimizer,
    None,
    None,
    source_train,
    source_val,
    batch_size,
    torch_device,
    device_package,
    eval_func=prefix_weighted_loss,
    max_epoch=50,
    max_ob_iter=20,
    score_margin=1e-4,
    print_iter=True,
)

{'source': {'train': <src.trainer.CaseDataSet.CaseDataset at 0x15eb041ffa0>,
  'val': <src.trainer.CaseDataSet.CaseDataset at 0x15eb0437310>,
  'test': <src.trainer.CaseDataSet.CaseDataset at 0x15eb0437490>},
 'target': {'train': <src.trainer.CaseDataSet.CaseDataset at 0x15ed0f0b1c0>,
  'val': <src.trainer.CaseDataSet.CaseDataset at 0x15ead266fa0>,
  'test': <src.trainer.CaseDataSet.CaseDataset at 0x15ead27fbe0>}}