In [1]:
root_dir = '/home2/glee/dissertation/1_tech_gen_impact/class2class/Tech_Gen/'
master_dir = '/home2/glee/dissertation/1_tech_gen_impact/master/Tech_Gen/'
import sys
sys.path.append(root_dir)

import copy
import gc
import os
import argparse
import math
import time
import pickle
import re
import multiprocess as mp
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
sys.path.append("/share/tml_package")
from tml import utils
from scipy import io
from tqdm import tqdm
from collections import OrderedDict

import torch
from torch.nn import functional as F
from torch.nn import DataParallel as DP
from torch.utils.data import TensorDataset, DataLoader, Subset, Dataset
from accelerate import Accelerator
import pytorch_model_summary

import optuna
from optuna.samplers import RandomSampler, TPESampler
from optuna.integration import SkoptSampler

import numpy as np
import pandas as pd
import scipy.stats
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from data import TechDataset, CVSampler
from models import Transformer, Predictor
from train_utils import EarlyStopping, perf_eval, objective_cv, build_model, train_model, validate_model_mp
from utils import token2class, DotDict, to_device

from cleantext.sklearn import CleanTransformer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Turn off the Hugging Face tokenizers' internal parallelism.
# NOTE(review): presumably to avoid fork-related warnings with the DataLoader workers used below — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Make CUDA kernel launches synchronous so a failing kernel is reported at the call that caused it.
# NOTE(review): this is a debugging aid and slows GPU execution — consider removing it for real training runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# PART 1: Configuration

In [2]:
# Stand-in for the command-line arguments of the script version of this notebook.
# Fields left as None are dropped further down in this cell, so the value from
# the JSON config file is used for them; non-None fields override the file.
args = argparse.Namespace(
    data_type="class",
    data_file = "collection_[H01L,H10][2017].csv",
    target_ipc="H01L",
    pred_type="classification",
    n_TC = 5,
    use_pretrained_tokenizer=False,
    do_train=None,
    do_tune=None,
    n_folds=None,
    batch_size=16,
    max_epochs=50,
    use_accelerator=None,
    do_save=True,
    n_gpus=4,
    light=True,
#     config_file=os.path.join(root_dir, "configs", "USED_configs", "[CONFIGS]2023-04-11_00:39.json"),
    # None selects configs_light.json / configs.json (depending on `light`) below.
    config_file=None,
    eval_train_set=False)

# Project directories: data lives under the master checkout, everything else under root_dir.
data_dir = os.path.join(master_dir, "data")
model_dir = os.path.join(root_dir, "models")
result_dir = os.path.join(root_dir, "results")
config_dir = os.path.join(root_dir, "configs")

# Pick the config file: an explicit path wins, otherwise the light/full default.
if args.config_file is None:
    default_config_name = "configs_light.json" if args.light else "configs.json"
    config_file = os.path.join(config_dir, default_config_name)
else:
    config_file = args.config_file
configs = DotDict().load(config_file)

# Remember which keys each config section had in the file, before anything is injected.
org_config_keys = {}
for section_name in configs.keys():
    org_config_keys[section_name] = list(configs[section_name].keys())

# Arguments that were actually set (non-None), i.e. those passed when main.py is executed.
instant_configs = {}
for arg_name, arg_value in vars(args).items():
    if arg_value is not None:
        instant_configs[arg_name] = arg_value

# Route every set argument to the section(s) whose file config already contains that key.
instant_configs_for_update = {}
for section_name, section_keys in org_config_keys.items():
    instant_configs_for_update[section_name] = {
        arg_name: arg_value
        for arg_name, arg_value in instant_configs.items()
        if arg_name in section_keys
    }

# Apply the overrides section by section.
for section_name, section_config in configs.items():
    section_config.update(instant_configs_for_update[section_name])

# A valid target starts with a capital letter that is not followed by a non-digit
# (i.e. it is followed by a digit or by the end of the string).
regex_ipc = re.compile('[A-Z](?![\\D])')
requested_ipc = configs.data.target_ipc
if not regex_ipc.match(requested_ipc):
    # Anything that does not look like an IPC code selects the whole collection.
    configs.data.update({"target_ipc": "ALL"})
elif len(requested_ipc) > 5:
    # Codes longer than five characters are cut down to their first four characters.
    configs.data.update({"target_ipc": requested_ipc[:4]})

# Loss weights per architecture: only "enc-pred-dec" mixes both terms.
fixed_loss_weights = {
    "enc-pred": {"recon": 0, "y": 1},
    "enc-dec": {"recon": 1, "y": 0},
}
model_type = configs.model.model_type
if model_type == "enc-pred-dec":
    mixed_weights = configs.train.loss_weights
    # Normalise the reconstruction weight by the total, then give the remainder to "y".
    mixed_weights["recon"] = mixed_weights["recon"] / sum(mixed_weights.values())
    mixed_weights["y"] = 1 - mixed_weights["recon"]
elif model_type in fixed_loss_weights:
    configs.train.loss_weights = fixed_loss_weights[model_type]

# Choose the compute device(s). Produces `device` (main device) and `device_ids`.
if configs.train.use_accelerator:
    # Accelerate decides the main device; every visible GPU index is listed.
    accelerator = Accelerator()
    device_ids = list(range(torch.cuda.device_count()))
    device = accelerator.device

    configs.train.update({"accelerator": accelerator})
else:
    if torch.cuda.is_available():
        device_ids = list(range(torch.cuda.device_count()))
        # Per-GPU memory in use, parsed from the text report of torch.cuda.list_gpu_processes:
        # the report is split on "GPU memory", and for each non-empty chunk that does not say
        # "no processes are running" the number after the last "uses" (with "MB" stripped) is summed.
        # NOTE(review): this depends on the exact wording of that report — verify after torch upgrades.
        gpu_usages = [np.sum([float(usage.split("uses")[-1].replace(" ","").replace("MB","")) for usage in torch.cuda.list_gpu_processes(id).split("GPU memory") if not usage=="" and "no processes are running" not in usage]) for id in device_ids]
        # Keep the n_gpus devices with the lowest usage (argsort is ascending).
        device_ids = np.argsort(gpu_usages)[:configs.train.n_gpus]
        device_ids = list(map(lambda x: torch.device('cuda', x),list(device_ids)))
        device = device_ids[0] # main device
        torch.cuda.set_device(device)
    else:
        # CPU fallback: no device ids.
        device = torch.device('cpu')
        device_ids = []

# Entries shared between sections, declared once.
path_entries = {"root_dir": root_dir, "data_dir": data_dir, "model_dir": model_dir}
device_entries = {"device": device, "device_ids": device_ids}

# Data section: project paths plus the results directory.
configs.data.update({**path_entries, "result_dir": result_dir})

# Train section: devices, paths, and values derived from other settings.
# Early stopping patience is 30% of the epoch budget, truncated to an int.
configs.train.update({
    **device_entries,
    **path_entries,
    "use_keywords": configs.data.use_keywords,
    "early_stop_patience": int(0.3*configs.train.max_epochs),
})

# Model section: devices, direction count derived from `bidirec`, and the accelerator flag.
configs.model.update({
    **device_entries,
    "n_directions": 2 if configs.model.bidirec else 1,
    "use_accelerator": configs.train.use_accelerator,
})

## Set hyperparameters for model training (To be TUNED)
if not (configs.train.do_train and configs.train.do_tune):
    # Fixed-hyperparameter run: take the values from the config as they are.
    n_layers = configs.model.n_layers
    d_embedding = configs.model.d_embedding
    d_hidden = configs.model.d_hidden
    d_latent = 64

    # The run name concatenates "[<value><key>]" for each listed config entry, in this order.
    key_components = {"data": ["target_ipc", "pred_type", "max_seq_len", "vocab_size"], "model": ["n_layers", "d_hidden", "d_embedding", "d_ff", "n_head", "d_head", "take_last_h"], "train": ["learning_rate", "batch_size", "max_epochs"]}
    config_name = "".join(
        "[" + str(configs[section][component]) + component + "]"
        for section, components in key_components.items()
        for component in components
    )
    final_model_path = os.path.join(model_dir, f"[Final_model]{config_name}.ckpt")
else:
    # Tuning run: blank out everything the tuner is going to choose.
    configs.model.n_layers = configs.model.d_embedding = configs.model.d_hidden = None
    n_layers = d_embedding = d_hidden = d_latent = None
    configs.train.learning_rate = configs.train.batch_size = None
    learning_rate = batch_size = None
    config_name = "HPARAM_TUNING"
    final_model_path = None

configs.model.update({"d_latent": d_latent})
configs.train.update({
    "config_name": config_name,
    "final_model_path": final_model_path,
})


# PART 2: Dataset setting

In [3]:
tstart = time.time()
# The cache file name encodes every original data-config entry except
# "use_pretrained_tokenizer" and "data_file".
org_config_keys_temp = copy.copy(org_config_keys["data"])
org_config_keys_temp.remove("use_pretrained_tokenizer")
org_config_keys_temp.remove("data_file")
dataset_config_name = "-".join([str(key)+"="+str(value) for (key,value) in configs.data.items() if key in org_config_keys_temp])
dataset_path = os.path.join(data_dir, "pickled_dataset", "[tech_dataset]"+dataset_config_name+".pickle")
# The cache is only read when do_save is explicitly False; otherwise the dataset
# is rebuilt and the cache file is (over)written.
if os.path.exists(dataset_path) and args.do_save is False:
    print("Load pickled dataset...")
    # NOTE: pickle.load can execute arbitrary code — only load cache files created by this project.
    with open(dataset_path, "rb") as f:
        tech_dataset = pickle.load(f)   # Load pickled dataset if dataset with same configuration already saved
    print("Pickled dataset loaded")
else:
    print("Make dataset...")
    # Create the cache directory up front so a missing folder cannot make the
    # write fail after the (expensive) dataset construction.
    os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
    tech_dataset = TechDataset(configs.data)
    # The raw data is dropped before pickling; this also clears it on the in-memory object.
    tech_dataset.rawdata = None
    with open(dataset_path, "wb") as f:
        pickle.dump(tech_dataset, f)
tend = time.time()
print(f"{np.round(tend-tstart,4)} sec elapsed for loading patents for class [{configs.data.target_ipc}]")

# Copy the dataset-derived sizes into the model config.
dataset_tokenizers = tech_dataset.tokenizers
enc_tokenizer = dataset_tokenizers["enc"]
dec_tokenizer = dataset_tokenizers["dec"]
is_regression = configs.data.pred_type=="regression"

configs.model.update({
    "tokenizers": dataset_tokenizers,
    "n_enc_vocab": enc_tokenizer.vocab_size,
    "n_dec_vocab": dec_tokenizer.vocab_size,
    # Encoder and decoder use the same maximum sequence length.
    "n_enc_seq": tech_dataset.max_seq_len,
    "n_dec_seq": tech_dataset.max_seq_len,
    # Regression predicts a single value; otherwise the dataset defines the output count.
    "n_outputs": 1 if is_regression else tech_dataset.n_outputs,
    "i_padding": enc_tokenizer.token_to_id("<PAD>"),
})

Make dataset...
19.1998 sec elapsed for loading patents for class [H01L]


In [136]:
configs.data.data_file

'collection_[H01L,H10][2017].csv'

In [137]:
rawdata = pd.read_csv(os.path.join(data_dir, configs.data.data_file))

In [140]:
# Keep only patents that have a main IPC, sub IPCs and claims, restricted to the columns used below.
required_columns = ['main ipc', 'sub ipc', 'claims']
rawdata_dropna = rawdata.dropna(axis=0, subset=required_columns)[['number'] + required_columns]

In [144]:
# Alias the data config as `self` so the next cell can be written with `self.<attr>` syntax.
# NOTE(review): presumably that cell is pasted from a method body — confirm.
# Writes through this alias (e.g. `self.max_seq_len = ...`) modify configs.data itself.
self = configs.data

In [145]:
data = rawdata_dropna[["number"]].copy(deep=True)

assert self.ipc_level in [1,2,3], f"Not implemented for an IPC level {self.ipc_level}"
# Leading characters kept per IPC level: 3 for level 1, 4 for level 2, the full code (None) for level 3.
ipc_prefix_len = {1: 3, 2: 4, 3: None}[self.ipc_level]

data['main_ipc'] = rawdata_dropna['main ipc'].apply(lambda code: code[:ipc_prefix_len])
# Sub IPCs arrive as one ";"-separated string; truncate each code and keep the sorted unique ones.
data['sub_ipc'] = rawdata_dropna['sub ipc'].apply(
    lambda codes: list(np.unique([code[:ipc_prefix_len] for code in codes.split(";")]))
)
# Full sequence per patent: the main IPC first, then the sub IPCs.
data["ipcs"] = data.apply(lambda row: [row["main_ipc"]] + row["sub_ipc"], axis=1)

seq_len = data['sub_ipc'].apply(len).max() + 3 # SOS - main ipc - sub ipcs - EOS
# Only ever grow the configured maximum sequence length.
self.max_seq_len = max(self.max_seq_len, seq_len)

In [147]:
# Attach the claim text, aligned on the row labels of `data`.
data["claims"] = rawdata_dropna.loc[data.index, "claims"]

In [148]:
data

Unnamed: 0,number,main_ipc,sub_ipc,ipcs,claims
0,9853235,H01L51/52,"[H01L27/32, H05B33/04]","[H01L51/52, H01L27/32, H05B33/04]",1. A display device comprising: a light emitti...
1,9854199,H04N5/76,"[G11B27/00, G11B27/024, G11B27/032, G11B27/034...","[H04N5/76, G11B27/00, G11B27/024, G11B27/032, ...","1. A method for a digital video recorder, comp..."
4,9851599,G02F1/1335,"[G02B5/20, G02F1/1343, G09G3/34, G09G3/36, G09...","[G02F1/1335, G02B5/20, G02F1/1343, G09G3/34, G...",1. A color display device for displaying an n-...
5,9851864,G06F17/21,"[G06F17/30, G06F3/041, G06F3/0481, G06F3/0485,...","[G06F17/21, G06F17/30, G06F3/041, G06F3/0481, ...",1. A method comprising: identifying content to...
6,9852488,A63F9/24,"[A63F13/00, G06F17/00, G06F19/00, G06Q50/34, G...","[A63F9/24, A63F13/00, G06F17/00, G06F19/00, G0...",1. A computer implemented method of managing b...
...,...,...,...,...,...
37425,9537605,H04K3/00,[H04B1/04],"[H04K3/00, H04B1/04]","1. An ultra-wideband, high-power, solid-state ..."
37426,9538636,H05K1/02,"[H05K1/03, H05K1/14, H05K1/18, H05K3/00, H05K3...","[H05K1/02, H05K1/03, H05K1/14, H05K1/18, H05K3...",1. An apparatus comprising: a substrate compri...
37427,9536977,H01L29/66,"[H01L21/332, H01L21/336, H01L21/8238, H01L29/739]","[H01L29/66, H01L21/332, H01L21/336, H01L21/823...",1. A semiconductor device comprising: a precur...
37428,9534772,H01L33/62,"[F21K99/00, F21V19/00, F21V23/00, F21V23/04, F...","[H01L33/62, F21K99/00, F21V19/00, F21V23/00, F...",1. A lighting apparatus comprising: a pluralit...


# PART 3: Training

In [36]:
# Build the splitter and fetch its index dictionary (indexed by fold number below).
sampler = CVSampler(
    tech_dataset,
    n_folds=configs.train.n_folds,
    test_ratio=0.1,
    stratify=True,
)
cv_idx = sampler.get_idx_dict()

# Report the split sizes of the first fold.
n_train, n_val, n_test = (len(cv_idx[0][split]) for split in ("train", "val", "test"))
print(f"#Samples\nTrain: {n_train}, Validation: {n_val}, Test: {n_test}")

#Samples
Train: 24389, Validation: 6098, Test: 3388


## PART 3-2: Dataset construction and model training

In [None]:
# import importlib
# import models, train_utils, parallel
# importlib.reload(models)
# importlib.reload(train_utils)
# importlib.reload(parallel)
# from train_utils import build_model
# from models import SEQ2SEQ

In [37]:
## Construct datasets
first_fold = cv_idx[0]
train_idx, val_idx, test_idx = first_fold['train'], first_fold['val'], first_fold['test']
# "whole" is train + validation (the test indices stay held out).
whole_idx = np.concatenate([train_idx, val_idx])

train_dataset, val_dataset, test_dataset, whole_dataset = (
    Subset(tech_dataset, split_idx)
    for split_idx in (train_idx, val_idx, test_idx, whole_idx)
)

loader_batch_size = configs.train.batch_size
# Validation/test batches are capped at the split size when the split is smaller than one batch.
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True, num_workers=4, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=min(loader_batch_size, len(val_idx)), shuffle=True, num_workers=4, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=min(loader_batch_size, len(test_idx)), shuffle=False, num_workers=4)
whole_loader = DataLoader(whole_dataset, batch_size=loader_batch_size, shuffle=False, num_workers=4)

## Load best model or build model
final_model = build_model(configs.model, tokenizers=tech_dataset.tokenizers)

In [39]:
model_params=configs.model
from parallel import DataParallelModel, DataParallelCriterion
from utils import loss_KLD, KLDLoss

In [41]:
# Reconstruction loss: token-level cross entropy that skips padded target positions.
loss_recon = torch.nn.CrossEntropyLoss(ignore_index=model_params['i_padding'])

# Target loss: MSE for a single output, NLL otherwise. Earlier alternatives kept for reference:
# loss_y = torch.nn.MSELoss() if model_params['n_outputs']==1 else torch.nn.CrossEntropyLoss()
# loss_y = torch.nn.MSELoss() if model_params['n_outputs']==1 else torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)
# loss_y = torch.nn.MSELoss() if model_params['n_outputs']==1 else torch.nn.BCELoss()
if model_params['n_outputs']==1:
    loss_y = torch.nn.MSELoss()
else:
    loss_y = torch.nn.NLLLoss()

loss_kld = KLDLoss()

# Wrap every criterion for multi-device evaluation on the configured devices.
criterion_devices = model_params['device_ids']
loss_recon, loss_y, loss_kld = [
    DataParallelCriterion(criterion, device_ids=criterion_devices)
    for criterion in (loss_recon, loss_y, loss_kld)
]

In [43]:
loss_kld.module

KLDLoss()

In [None]:
# Loss functions that apply to each architecture; an unknown model type yields None.
# Note that the "KLD" entries hold the plain `loss_KLD` callable imported from utils,
# not the DataParallelCriterion-wrapped `loss_kld` built above.
loss_f = {
    "enc-pred-dec": {"recon": loss_recon, "y": loss_y, "KLD": loss_KLD},
    "enc-dec": {"recon": loss_recon, "KLD": loss_KLD},
    "enc-pred": {"y": loss_y},
}.get(model_params["model_type"])

In [None]:
loss_f

In [44]:
batch_data = next(iter(train_loader))

In [45]:
# Move the three fields of the batch that the model and the losses use onto the main device.
batch_data = {
    field: to_device(batch_data[field], device)
    for field in ("text_inputs", "text_outputs", "targets")
}

In [46]:
outputs = final_model(batch_data["text_inputs"], batch_data["text_outputs"]) # omit <eos> from target sequence

# `outputs` holds one dict per replica; regroup it into one list per output field.
outputs_recon, outputs_z, outputs_y, outputs_mu, outputs_logvar = [], [], [], [], []
for replica_output in outputs:
    # Swap the last two axes of the decoder output (the layout handed to the reconstruction loss).
    outputs_recon.append(replica_output["dec_outputs"].permute(0,2,1))
    outputs_z.append(replica_output["z"]) # outputs_z: n_gpus * (minibatch, d_hidden)
    outputs_y.append(replica_output["pred_outputs"]) # outputs_y: n_gpus * (minibatch, n_outputs)
    outputs_mu.append(replica_output["mu"])
    outputs_logvar.append(replica_output["logvar"])

# dict_outputs = {"recon": outputs_recon, "y": outputs_y, "z": outputs_z}
dict_outputs = {
    "recon": outputs_recon,
    "y": outputs_y,
    "z": outputs_z,
    "mu": outputs_mu,
    "logvar": outputs_logvar,
}

In [51]:
loss_kld.module(dict_outputs["mu"][0], dict_outputs["logvar"][0])

tensor(3.1641, device='cuda:0', grad_fn=<MulBackward0>)

In [106]:
inputs = dict_outputs["mu"]
targets = dict_outputs["logvar"]

In [109]:
targets_ = [torch.cat([target.to(device) for target in targets])]

In [93]:
targets[0].shape

torch.Size([4, 128])

In [112]:
targets_.shape

torch.Size([16, 128])

In [122]:
targets, kwargs = loss_kld.scatter([targets_], kwargs=None, device_ids=loss_kld.device_ids)

In [127]:
len(targets[0])

1

In [121]:
# Demonstration: a tensor with more than one element has no single truth value.
# The error is caught and shown so that "Run All" is not aborted by this cell.
try:
    if torch.tensor([1.2, 0.3], device=device): print("A")
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [117]:
targets_

tensor([[-1.2333e-01,  5.3464e-02, -4.2547e-02,  ...,  4.6852e-02,
         -7.5298e-02, -1.7926e-02],
        [-5.3685e-02,  7.9206e-02, -7.2477e-02,  ...,  6.3355e-04,
         -9.3319e-02, -1.5820e-02],
        [-1.3475e-01,  9.7913e-02, -2.4637e-02,  ..., -1.4057e-02,
          1.1413e-04, -9.9982e-03],
        ...,
        [-9.2506e-02,  9.0090e-02, -2.3219e-02,  ...,  3.0975e-02,
         -9.5019e-02,  3.2163e-02],
        [-4.9186e-02,  1.5572e-01, -8.0533e-02,  ...,  3.5850e-02,
         -1.1785e-01, -7.3595e-02],
        [-1.5770e-01,  7.0231e-02, -4.7816e-02,  ...,  1.4363e-02,
         -3.7096e-02,  1.3177e-02]], device='cuda:0', grad_fn=<CatBackward0>)

In [114]:
# Same truthiness check on `targets_`; a multi-element tensor raises here.
# The error is caught and shown so that "Run All" is not aborted by this cell.
try:
    if targets_: print("A")
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [101]:
inputs[0].shape

torch.Size([4, 128])

In [103]:
batch_data["text_outputs"]

torch.Size([16, 100])

In [100]:
torch.cat(targets[0]).shape

torch.Size([4, 128])

In [52]:
replicas = loss_kld.replicate(loss_kld.module, loss_kld.device_ids)

In [131]:
targets_[0].shape

torch.Size([128])

In [128]:
replicas[0](inputs[0], targets_[0])

tensor(2.9502, device='cuda:0', grad_fn=<MulBackward0>)

In [87]:
inputs[0].shape

torch.Size([4, 128])

In [89]:
targets[0][0]

tensor([[-0.1233,  0.0535, -0.0425, -0.0633, -0.0475, -0.0671,  0.0361, -0.1124,
         -0.0715,  0.0257,  0.0482, -0.1333, -0.0455, -0.0790,  0.0204, -0.1194,
          0.0841,  0.0734, -0.1319, -0.0379, -0.0823, -0.1566, -0.0455,  0.1286,
         -0.0811, -0.0679,  0.2031, -0.0143,  0.1159,  0.0118,  0.0786,  0.0560,
          0.0005, -0.0017,  0.0430, -0.0288,  0.0041, -0.0293, -0.0495, -0.0807,
         -0.0561, -0.1178, -0.0184,  0.0392,  0.0717, -0.0468,  0.0250, -0.1086,
         -0.0104, -0.0262,  0.0104,  0.0147,  0.0989, -0.0621, -0.0231, -0.1607,
          0.0064, -0.0489, -0.1360,  0.0032, -0.0452, -0.0911, -0.0026,  0.1276,
         -0.0055,  0.0242,  0.0226, -0.0165,  0.1711, -0.0196,  0.0282,  0.0876,
         -0.1013,  0.0638, -0.0384, -0.0071, -0.0916, -0.0182, -0.0700, -0.0340,
          0.0382, -0.0007, -0.0089, -0.1037,  0.0080,  0.0901,  0.1261,  0.0390,
         -0.1373, -0.2139, -0.1331,  0.1462, -0.0198, -0.0724, -0.0046, -0.0109,
          0.0306, -0.0428,  

In [None]:
replicas[0]

In [None]:
# Shallow copy of the per-replica reconstruction outputs, paired with the target token batch.
preds_recon = list(dict_outputs["recon"])
trues_recon = batch_data["text_outputs"]

In [None]:
loss_f["KLD"]

In [None]:
logvar = dict_outputs["logvar"][0]

In [None]:
mu = dict_outputs["mu"][0]

In [None]:
# Closed-form KL divergence between N(mu, exp(logvar)) and the standard normal N(0, I),
# summed over every element of the tensors (no averaging over the batch).
-0.5 * torch.sum((1 + logvar - mu.pow(2) - logvar.exp()))

In [None]:
# Total KL term over all replicas.
# - Iterate over the outputs that actually exist instead of range(n_gpus), which
#   over-indexes when fewer replicas than configs.train.n_gpus were produced.
# - torch.stack keeps the autograd graph; torch.tensor([...]) would copy the values
#   out and detach them. Each term is moved to the main device first because the
#   replicas may live on different GPUs.
kld_terms = [
    loss_f["KLD"](mu=replica_mu, logvar=replica_logvar).to(device)
    for replica_mu, replica_logvar in zip(dict_outputs["mu"], dict_outputs["logvar"])
]
torch.sum(torch.stack(kld_terms))

In [None]:
# Deliberate breakpoint for "Run All": stop here instead of running the cells below.
raise RuntimeError("Intentional stop: execution halted here on purpose; run the cells below manually.")

In [None]:
loss_KLD

In [None]:
from torch.nn.modules.loss import _Loss

In [None]:
class KLDLoss(_Loss):
    """Module wrapper around the `loss_KLD` function.

    Wrapping the function in a `_Loss` subclass gives it the module interface,
    e.g. so it can be handed to `DataParallelCriterion`.

    Args:
        size_average, reduce, reduction: passed through to `_Loss`.
        log_target: stored on the instance; not used by `forward`.
    """

    def __init__(self, size_average=None, reduce=None, reduction: str = "mean", log_target: bool = False) -> None:
        super().__init__(size_average=size_average, reduce=reduce, reduction=reduction)
        self.log_target = log_target

    def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """Return `loss_KLD(input, target)`; both tensors are passed positionally."""
        kld = loss_KLD(input, target)
        return kld

In [None]:
loss_f_kld = KLDLoss()

In [None]:
DataParallelCriterion(loss_f_kld)

In [None]:
preds_y = dict_outputs["y"]
trues_y = batch_data["targets"]
if model_params["n_outputs"]==1:
    # Single-output case: cast the targets to the dtype of the predictions.
    trues_y = trues_y.to(dtype=preds_y[0].dtype)

In [None]:
loss_f["y"](preds_y, trues_y)

In [None]:
loss_f["recon"](preds_recon, trues_recon)

In [None]:
# Deliberate breakpoint for "Run All": stop here instead of starting the training cell below.
raise RuntimeError("Intentional stop: execution halted here on purpose; run the cells below manually.")

In [None]:
# Per-class sample counts over train + validation (np.unique returns them in sorted label order).
label_counts = np.unique(tech_dataset.Y[whole_idx], return_counts=True)[1]
class_weights = torch.tensor(label_counts)
# Ratio of the first class count to the second one; only the first two classes are used.
pos_weight = class_weights[0]/class_weights[1]
final_model = train_model(
    final_model,
    train_loader,
    val_loader,
    configs.model,
    configs.train,
    class_weights=pos_weight,
)

In [None]:
# Deliberate breakpoint for "Run All": stop here instead of running the evaluation below.
raise RuntimeError("Intentional stop: execution halted here on purpose; run the cells below manually.")

## PART 3-3: Training evaluation

In [None]:
# Train-set results default to None so the pd.concat calls at the end also work when
# the train-set evaluation is skipped (pd.concat silently drops None entries).
# Without this default, `confmat_y_train` was undefined in that case and the final
# classification concat raised NameError.
eval_recon_train = eval_y_train = confmat_y_train = None
if args.eval_train_set:
    ## Evaluation on train dataset
    print("Validate model on train dataset")
    # trues_recon_train, preds_recon_train, trues_y_train, preds_y_train = validate_model(final_model, whole_loader, configs.model, configs.train)
    val_res_train = validate_model_mp(final_model, whole_dataset, mp=mp, model_params=configs.model, train_params=configs.train)
    # Merge the per-entry results into single arrays.
    trues_recon_train = np.concatenate([res["recon"]["true"] for res in val_res_train.values()])
    preds_recon_train = np.concatenate([res["recon"]["pred"] for res in val_res_train.values()])
    trues_y_train = np.concatenate([res["y"]["true"] for res in val_res_train.values()])
    preds_y_train = np.concatenate([res["y"]["pred"] for res in val_res_train.values()])

    # (The reconstruction evaluation used to be called twice with identical arguments.)
    eval_recon_train = perf_eval("TRAIN_SET", trues_recon_train, preds_recon_train, configs=configs, pred_type='generative', tokenizer=final_model.module.tokenizer)
    eval_y_train = perf_eval("TRAIN_SET", trues_y_train, preds_y_train, configs=configs, pred_type=configs.data.pred_type)
    if configs.data.pred_type == "classification":
        # For classification perf_eval returns (metrics, confusion matrix).
        eval_y_train, confmat_y_train = eval_y_train

## Evaluation on test dataset
print("Validate model on test dataset")
# trues_recon_test, preds_recon_test, trues_y_test, preds_y_test = validate_model(final_model, test_loader, configs.model, configs.train)
val_res_test = validate_model_mp(final_model, test_dataset, mp=mp, batch_size=64, model_params=configs.model, train_params=configs.train)
trues_recon_test = np.concatenate([res["recon"]["true"] for res in val_res_test.values()])
preds_recon_test = np.concatenate([res["recon"]["pred"] for res in val_res_test.values()])
trues_y_test = np.concatenate([res["y"]["true"] for res in val_res_test.values()])
preds_y_test = np.concatenate([res["y"]["pred"] for res in val_res_test.values()])

eval_recon_test = perf_eval("TEST_SET", trues_recon_test, preds_recon_test, configs=configs,  pred_type='generative', tokenizer=final_model.module.tokenizer)
eval_y_test = perf_eval("TEST_SET", trues_y_test, preds_y_test, configs=configs, pred_type=configs.data.pred_type)
if configs.data.pred_type == "classification":
    eval_y_test, confmat_y_test = eval_y_test

# Stack train (if evaluated) and test results.
eval_recon_res = pd.concat([eval_recon_train, eval_recon_test], axis=0)
eval_y_res = pd.concat([eval_y_train, eval_y_test], axis=0)
if configs.data.pred_type == "classification":
    confmat_y_res = pd.concat([confmat_y_train, confmat_y_test], axis=0)


In [None]:
# Deliberate breakpoint for "Run All": stop here instead of running the experimental cells below.
raise RuntimeError("Intentional stop: execution halted here on purpose; run the cells below manually.")

# (temp) Pre-trained model experiment

In [None]:
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig

In [None]:
class BERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a two-layer classification head.

    Args:
        pretrained_name: checkpoint passed to `DistilBertModel.from_pretrained`.
        n_classes: number of output logits.
        dropout: dropout probability applied before the final linear layer.

    The defaults reproduce the previously hard-coded setup
    ("distilbert-base-uncased", 2 classes, dropout 0.3). The head width is read
    from the loaded checkpoint's config instead of being fixed to 768.
    """

    def __init__(self, pretrained_name="distilbert-base-uncased", n_classes=2, dropout=0.3):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained(pretrained_name)
        d_model = self.l1.config.dim  # hidden size of the loaded checkpoint
        self.pre_classifier = torch.nn.Linear(d_model, d_model)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(d_model, n_classes)

    def forward(self, input_ids, mask):
        """Return `(logits, hidden_state)`.

        `hidden_state` is the first element of the encoder output; the logits are
        computed from its position-0 vector along the second axis.
        """
        x = self.l1(input_ids=input_ids, attention_mask=mask)
        hidden_state = x[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU: no need to build a new module on every forward pass.
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)

        return output, hidden_state

In [None]:
m = BERTClass().to(device)

In [None]:
# `tokenizer` was never defined in this notebook, so this cell failed on a fresh kernel.
# Load the tokenizer of the same checkpoint that BERTClass uses by default.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Tokenize the first 100 claims, padded/truncated to 256 tokens.
out = tokenizer(tech_dataset.data['claims'][:100].tolist(), add_special_tokens=True, max_length=256, padding="max_length", truncation=True)

In [None]:
inputs = {"input_ids": torch.tensor(out.input_ids, dtype=torch.long, device=device),
         "mask": torch.tensor(out.attention_mask, dtype=torch.long, device=device)}

In [None]:
temp = m(inputs["input_ids"], inputs["mask"])

In [None]:
temp[1].shape

In [None]:
device = torch.device("cuda", 0)

In [None]:
print(pytorch_model_summary.summary(m.to(device), torch.zeros(inputs["input_ids"].shape, device=device, dtype=torch.long), torch.zeros(inputs["input_ids"].shape, device=device, dtype=torch.long), show_input=True, max_depth=None, show_parent_layers=True))

## Inference

In [None]:
final_model_path

In [None]:
# Build the model with the same call used in the training section of this notebook
# (`tokenizers=tech_dataset.tokenizers`).
final_model = build_model(configs.model, tokenizers=tech_dataset.tokenizers)
# final_model_path is None for hyperparameter-tuning runs; treat that like a missing checkpoint.
if final_model_path is None or not os.path.exists(final_model_path):
    raise FileNotFoundError("Model need to be trained first")
# map_location keeps the load working when the checkpoint was saved from a different device.
best_states = torch.load(final_model_path, map_location=device)

# Rename checkpoint keys to the wrapped-model layout expected by final_model.
converted_states = OrderedDict()
for k, v in best_states.items():
    if 'module' not in k:
        # Keys without any "module" part get the wrapper prefix.
        k = 'module.'+k
    else:
        # Keys saved as "features.module.*" are moved to "module.features.*".
        k = k.replace('features.module.', 'module.features.')
    converted_states[k] = v
final_model.load_state_dict(converted_states)

In [None]:
# Copy predictor
# The weights are copied through a temporary checkpoint file; make sure its folder
# exists so torch.save cannot fail on a fresh checkout.
temp_path = os.path.join(model_dir, "temp", "temp.ckpt")
os.makedirs(os.path.dirname(temp_path), exist_ok=True)
predictor_device = final_model.module.device
predictor = Predictor(final_model.module.config).to(predictor_device)
torch.save(final_model.module.predictor.state_dict(), temp_path)
predictor.load_state_dict(torch.load(temp_path, map_location=predictor_device))

In [None]:
result_path = os.path.join(root_dir, "results")

In [None]:
# Read both sheets in a single pass over the workbook (it used to be opened and parsed twice).
used_dataset_file = os.path.join(result_path, "[DATASET]2023-04-11_00:39.xlsx")
used_sheets = pd.read_excel(used_dataset_file, sheet_name=["TRAIN_dataset", "TEST_dataset"])
used_train_data = used_sheets["TRAIN_dataset"]
used_test_data = used_sheets["TEST_dataset"]

# Positions of the recorded patents in the current dataset.
# NOTE(review): get_indexer returns -1 for labels it cannot find, and a later
# .iloc[-1] would silently select the last row — verify that no -1 is present.
used_train_index = tech_dataset.data.index.get_indexer(pd.Index(used_train_data["number"]))
used_test_index = tech_dataset.data.index.get_indexer(pd.Index(used_test_data["number"]))

In [None]:
tech_dataset.data.iloc[used_test_index]

In [None]:
input_claims = tech_dataset.X[used_test_index]
# Encode the first claim once and reuse the encoding (it used to be encoded three times).
text_inputs = tech_dataset.tokenizer.encode(input_claims[0])
# NOTE(review): input_ids is wrapped in a list (adds a leading batch axis) while
# attention_mask is not — confirm that the encoder expects this shape difference.
batch_inf = {"input_ids": torch.tensor([text_inputs.ids]), "attention_mask": torch.tensor(text_inputs.attention_mask)}

In [None]:
input_inf = to_device(batch_inf, final_model.module.device)

In [None]:
def get_tensor_info(tensor):
    """Return a one-line, space-separated summary of a tensor's autograd state.

    Each of `requires_grad`, `is_leaf`, `retains_grad`, `grad_fn` and `grad` is
    rendered as `name(value)`; an attribute the object does not have is shown as
    None. The string form of the object itself is appended as `tensor(...)`.
    """
    autograd_fields = ('requires_grad', 'is_leaf', 'retains_grad', 'grad_fn', 'grad')
    parts = [f'{field}({getattr(tensor, field, None)})' for field in autograd_fields]
    parts.append(f'tensor({tensor!s})')
    return ' '.join(parts)

In [None]:
enc_outputs = final_model.module.encode(input_inf)

In [None]:
pred_outputs = predictor(enc_outputs)

In [None]:
enc_outputs.retain_grad()

In [None]:
pred_outputs[0,1].backward()

In [None]:
enc_outputs.grad

In [None]:
enc_outputs

In [None]:
step_size = 1e-2

In [None]:
enc_outputs_ = enc_outputs + step_size * enc_outputs.grad

In [None]:
new_pred_outputs = predictor(enc_outputs_)

In [None]:
new_pred_outputs

In [None]:
dec_outputs = final_model.module.decode(input_inf, enc_outputs)

In [None]:
final_model.module.tokenizer.decode_batch(dec_outputs.cpu().detach().numpy())

In [None]:
input_claims[0]

In [None]:
inputs.retain_grad()

In [None]:
print(get_tensor_info(inputs))

In [None]:
predictor(inputs)

In [None]:
predictor(inputs)[0,-1,-1].backward()

In [None]:
print(get_tensor_info(inputs))

In [None]:
# Draw 1000 random positions from the dataset (np.random.choice samples with
# replacement by default, and no seed is set here) and wrap them in a loader.
all_positions = np.arange(len(tech_dataset))
sampled_positions = np.random.choice(all_positions, 1000)
instant_dataset = Subset(tech_dataset, sampled_positions)
data_loader = DataLoader(instant_dataset, batch_size=16)

In [None]:
class_weights = np.unique(tech_dataset.Y[whole_idx], return_counts=True)[1]

In [None]:
batch_data = next(iter(data_loader))

In [None]:
text_inputs = to_device(batch_data["text_inputs"], device)

In [None]:
enc_outputs, *_ = final_model.module.encoder(**text_inputs)

In [None]:
enc_outputs.shape

In [None]:
z = enc_outputs[:,-1,:]

In [None]:
preds = final_model.module.predictor(z)

In [None]:
preds.shape

In [None]:
torch.tensor(1).item()

In [None]:
torch.cuda.empty_cache()

In [None]:
preds_recon = []

# Look up the <SOS> id with the API of whichever library the tokenizer comes from
# (identified by the top-level package in the class path).
tokenizer_lib = str(configs.model["tokenizer"].__class__).split("\'")[1].split(".")[0]
if tokenizer_lib == "transformers":
    sos_id = configs.model['tokenizer'].convert_tokens_to_ids("<SOS>")
elif tokenizer_lib == "tokenizers":
    sos_id = configs.model['tokenizer'].token_to_id("<SOS>")
else:
    # Previously an unknown tokenizer left preds_recon_batch undefined (or stale from an earlier run).
    raise TypeError(f"Unsupported tokenizer library: {tokenizer_lib}")
# One <SOS> token per sample as the initial decoder input, shape (batch, 1).
preds_recon_batch = torch.tile(torch.tensor(sos_id, device=device), dims=(text_inputs["input_ids"].shape[0],1))

# Greedy decoding: feed the tokens generated so far and append the arg-max token of the
# last position. Only token ids are kept, so gradients are not needed; no_grad stops the
# autograd graph from growing with every step.
with torch.no_grad():
    for i in range(configs.model['n_dec_seq']-1):
        dec_outputs, *_ = final_model.module.decoder(preds_recon_batch, text_inputs["input_ids"], enc_outputs)
        pred_tokens = dec_outputs.argmax(2)[:,-1].unsqueeze(1)
        preds_recon_batch = torch.cat([preds_recon_batch, pred_tokens], axis=1)
        torch.cuda.empty_cache()
# Drop the leading <SOS> column before storing the result.
preds_recon.append(preds_recon_batch[:,1:].cpu().detach().numpy())
preds_recon = np.concatenate(preds_recon)

In [None]:
display("TRUE(keywords): "+pd.Series(configs.model.tokenizer.decode_batch(batch_data["text_inputs"]["input_ids"].cpu().detach().numpy()))[0])

display("TRUE(org): "+pd.Series(configs.model.tokenizer.decode_batch(batch_data["text_outputs"]["input_ids"].cpu().detach().numpy()))[0])

display("PRED: "+pd.Series(configs.model.tokenizer.decode_batch(preds_recon, skip_special_tokens=False)).apply(lambda x: x.split("<EOS>")[0])[0])

In [None]:
# Perturb the encoder outputs with element-wise Gaussian noise (mean = enc_outputs, std = 2).
# A scalar std replaces the integer std tensor that was tiled to the full shape of enc_outputs.
LATENT_NOISE_STD = 2.0
enc_outputs_ = torch.normal(mean=enc_outputs, std=LATENT_NOISE_STD)

In [None]:
preds_recon_ = []

# Look up the <SOS> id with the API of whichever library the tokenizer comes from
# (identified by the top-level package in the class path).
tokenizer_lib = str(configs.model["tokenizer"].__class__).split("\'")[1].split(".")[0]
if tokenizer_lib == "transformers":
    sos_id = configs.model['tokenizer'].convert_tokens_to_ids("<SOS>")
elif tokenizer_lib == "tokenizers":
    sos_id = configs.model['tokenizer'].token_to_id("<SOS>")
else:
    # Previously an unknown tokenizer silently reused the full-length preds_recon_batch
    # left over from the unperturbed decoding cell.
    raise TypeError(f"Unsupported tokenizer library: {tokenizer_lib}")
# One <SOS> token per sample as the initial decoder input, shape (batch, 1).
preds_recon_batch = torch.tile(torch.tensor(sos_id, device=device), dims=(text_inputs["input_ids"].shape[0],1))

# Greedy decoding from the perturbed encoder outputs (enc_outputs_). Only token ids are
# kept, so gradients are not needed; no_grad stops the autograd graph from growing.
with torch.no_grad():
    for i in range(configs.model['n_dec_seq']-1):
        dec_outputs, *_ = final_model.module.decoder(preds_recon_batch, text_inputs["input_ids"], enc_outputs_)
        pred_tokens = dec_outputs.argmax(2)[:,-1].unsqueeze(1)
        preds_recon_batch = torch.cat([preds_recon_batch, pred_tokens], axis=1)
        torch.cuda.empty_cache()
# Drop the leading <SOS> column before storing the result.
preds_recon_.append(preds_recon_batch[:,1:].cpu().detach().numpy())
preds_recon_ = np.concatenate(preds_recon_)

In [None]:
i = 10

print("TRUE(keywords): "+pd.Series(configs.model.tokenizer.decode_batch(batch_data["text_inputs"]["input_ids"].cpu().detach().numpy()))[i])

print("TRUE(org): "+pd.Series(configs.model.tokenizer.decode_batch(batch_data["text_outputs"]["input_ids"].cpu().detach().numpy()))[i])

print("PRED: "+pd.Series(configs.model.tokenizer.decode_batch(preds_recon, skip_special_tokens=False)).apply(lambda x: x.split("<EOS>")[0])[i])

print("PRED(modified): "+pd.Series(configs.model.tokenizer.decode_batch(preds_recon_, skip_special_tokens=False)).apply(lambda x: x.split("<EOS>")[0])[i])