## Libraries

In [1]:
# utils
import os
import random
import warnings
import re
import jieba
import math
import string as s
import sys
import logging
import inspect
import argparse
!pip install coloredlogs
import coloredlogs

# numpy
import numpy as np

# pytorch
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import TensorDataset, Dataset
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.autograd import Variable
from torch.optim.optimizer import Optimizer

# transformers
import transformers
from transformers import DistilBertPreTrainedModel
from transformers import BertModel, BertTokenizer, DistilBertTokenizer, DistilBertModel
from transformers import BertPreTrainedModel
from transformers.models.bert.configuration_bert import BertConfig
from transformers.models.distilbert.configuration_distilbert import DistilBertConfig

# sklearn
import sklearn.metrics as sk_metrics

# type hint
from typing import List, Optional, Union

# progress bar
from tqdm.notebook import tqdm, trange
from functools import partial
tqdm = partial(tqdm, position=0, leave=True) # tqdm for colab cell

Collecting coloredlogs
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting humanfriendly>=9.1 (from coloredlogs)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: humanfriendly, coloredlogs
Successfully installed coloredlogs-15.0.1 humanfriendly-10.0


In [2]:
"""
Python == 3.10.12
Pytorch == 2.1.0+cu121
Transformers == 4.37.2
"""
!python --version
print(torch.__version__)
print(transformers.__version__)

Python 3.10.12
2.1.0+cu121
4.37.2


## Modeling

Original Paper References -

A Co-Interactive Transformer for Joint Slot Filling and Intent Detection by Qin et al., 2020 -
https://doi.org/10.48550/arXiv.2010.03880

<br>

Conditional random fields: Probabilistic models for segmenting and labeling sequence data by Lafferty et al., 2001

<br>

Huggingface for pre-trained bert -  
https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py

https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/distilbert/modeling_distilbert.py

#### Layer Norm

In [3]:
class LayerNorm(nn.Module):
  '''
  Layer normalisation over the last dimension with a learnable per-feature
  gain (``weight``, initialised to ones) and shift (``bias``, initialised to zeros).

  The variance is the biased (population) variance and ``eps`` is added to it
  before taking the square root.

  REFERENCES -
  # https://github.com/CyberZHG/torch-layer-normalization/blob/89f405b60f53f85da6f03fe685c190ef394ce50c/torch_layer_normalization/layer_normalization.py#L8
  # https://www.ncl.ac.uk/webtemplate/ask-assets/external/maths-resources/statistics/descriptive-statistics/variance-and-standard-deviation.html#:~:text=Given%20a%20discrete%20random%20variable,x%202%20%7D%20%E2%88%92%20%CE%BC%202%20.

  :param hidden_size: size of the last (normalised) dimension
  :param eps: constant added to the variance for numerical stability
  '''
  def __init__(self, hidden_size, eps=1e-12):
    super().__init__()

    self.weight = nn.Parameter(torch.ones(hidden_size))
    self.bias = nn.Parameter(torch.zeros(hidden_size))
    self.variance_epsilon = eps

  def forward(self, x):
    # Centre once and reuse the centred values for both variance and output.
    centered = x - x.mean(-1, keepdim=True)
    variance = centered.pow(2).mean(-1, keepdim=True)
    normalized = centered / torch.sqrt(variance + self.variance_epsilon)
    return self.weight * normalized + self.bias

#### Conditional Random Field

In [4]:
class CRF(nn.Module):
    """Conditional random field.

    This module implements a conditional random field [LMP]. The forward computation
    of this class computes the log likelihood of the given sequence of tags and
    emission score tensor. This class also has ``decode`` method which finds the
    best tag sequence given an emission score tensor using `Viterbi algorithm`_.

    All tensors handled by this class are time-major, i.e. their leading
    dimensions are ``(seq_length, batch_size)``.

    Arguments
    ---------
    num_tags : int
        Number of tags.

    Attributes
    ----------
    num_tags : int
        Number of tags passed to ``__init__``.
    start_transitions : :class:`~torch.nn.Parameter`
        Start transition score tensor of size ``(num_tags,)``.
    end_transitions : :class:`~torch.nn.Parameter`
        End transition score tensor of size ``(num_tags,)``.
    transitions : :class:`~torch.nn.Parameter`
        Transition score tensor of size ``(num_tags, num_tags)``.

    References
    ----------
    .. [LMP] Lafferty, J., McCallum, A., Pereira, F. (2001).
             "Conditional random fields: Probabilistic models for segmenting and
             labeling sequence data". *Proc. 18th International Conf. on Machine
             Learning*. Morgan Kaufmann. pp. 282–289.

    .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
    """
    def __init__(self, num_tags: int) -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        self.num_tags = num_tags
        # Allocated uninitialised; reset_parameters() fills them in.
        self.start_transitions = nn.Parameter(torch.empty(num_tags))
        self.end_transitions = nn.Parameter(torch.empty(num_tags))
        self.transitions = nn.Parameter(torch.empty(num_tags, num_tags))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Initialize the transition parameters.

        The parameters will be initialized randomly from a uniform distribution
        between -0.1 and 0.1.
        """
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        nn.init.uniform_(self.transitions, -0.1, 0.1)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(num_tags={self.num_tags})'

    def forward(self,
                emissions: torch.Tensor,
                tags: torch.Tensor,
                mask: Optional[torch.Tensor] = None,
                reduce: bool = True,
                ) -> torch.Tensor:
        """Compute the log likelihood of the given sequence of tags and emission score.

        Note that the returned value is the log likelihood itself (a quantity to
        maximise); negate it to obtain a loss.

        Arguments
        ---------
        emissions : :class:`~torch.Tensor`
            Emission score tensor of size ``(seq_length, batch_size, num_tags)``.
        tags : :class:`~torch.Tensor`
            Sequence of tags as ``LongTensor`` of size ``(seq_length, batch_size)``.
        mask : :class:`~torch.Tensor`, optional
            Mask tensor (byte/bool, 1 = real token) of size ``(seq_length, batch_size)``.
            The first timestep must be on for every sequence. Defaults to all ones.
        reduce : bool
            Whether to sum the log likelihood over the batch.

        Returns
        -------
        :class:`~torch.Tensor`
            The log likelihood: a scalar if ``reduce=True``, of size ``(batch_size,)``
            otherwise.

        Raises
        ------
        ValueError
            If the shapes of ``emissions``, ``tags`` and ``mask`` are inconsistent, or
            if the mask is off at the first timestep of any sequence.
        """
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if tags.dim() != 2:
            raise ValueError(f'tags must have dimension of 2, got {tags.dim()}')
        if emissions.size()[:2] != tags.size():
            raise ValueError(
                'the first two dimensions of emissions and tags must match, '
                f'got {tuple(emissions.size()[:2])} and {tuple(tags.size())}'
            )
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}'
            )
        if mask is None:
            mask = self._new(tags.size()).fill_(1).byte()
        else:
            if tags.size() != mask.size():
                raise ValueError(
                    f'size of tags and mask must match, got {tuple(tags.size())} '
                    f'and {tuple(mask.size())}'
                )
            # One tensor reduction instead of a Python loop over the batch.
            if not bool(mask[0].bool().all()):
                raise ValueError('mask of the first timestep must all be on')

        numerator = self._compute_joint_llh(emissions, tags, mask)
        denominator = self._compute_log_partition_function(emissions, mask)
        llh = numerator - denominator  # log likelihood per sequence, (batch_size,)
        return torch.sum(llh) if reduce else llh

    def decode(self,
               emissions: torch.Tensor,
               mask: Optional[torch.Tensor] = None) -> List[List[int]]:
        """Find the most likely tag sequence using Viterbi algorithm.

        Arguments
        ---------
        emissions : :class:`~torch.Tensor`
            Emission score tensor of size ``(seq_length, batch_size, num_tags)``.
        mask : :class:`~torch.Tensor`, optional
            Mask tensor of size ``(seq_length, batch_size)``. The number of ones in
            each column is taken as that sequence's length. Defaults to all ones.

        Returns
        -------
        list
            List of list containing the best tag sequence for each batch. Each tag
            is a 0-dim ``LongTensor`` (as produced by ``Tensor.max``); use
            ``int(tag)`` / ``tag.item()`` to obtain a Python int. A sequence whose
            mask is entirely off yields an empty list.

        Raises
        ------
        ValueError
            If ``emissions`` is not 3-dimensional, its last dimension is not
            ``num_tags``, or its leading dimensions do not match ``mask``.
        """
        if emissions.dim() != 3:
            raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}')
        if emissions.size(2) != self.num_tags:
            raise ValueError(
                f'expected last dimension of emissions is {self.num_tags}, '
                f'got {emissions.size(2)}'
            )
        if mask is not None and emissions.size()[:2] != mask.size():
            raise ValueError(
                'the first two dimensions of emissions and mask must match, '
                f'got {tuple(emissions.size()[:2])} and {tuple(mask.size())}'
            )

        # Decoding never needs gradients.
        emissions = emissions.detach()
        if mask is None:
            mask = self._new(emissions.size()[:2]).fill_(1).byte()
        else:
            mask = mask.detach()

        # Transpose batch_size and seq_length so we can iterate per sequence
        emissions = emissions.transpose(0, 1)
        mask = mask.transpose(0, 1)

        best_tags = []
        for emission, mask_ in zip(emissions, mask):
            seq_length = int(mask_.long().sum())
            best_tags.append(self._viterbi_decode(emission[:seq_length]))
        return best_tags

    def _compute_joint_llh(self,
                           emissions: torch.Tensor,
                           tags: torch.Tensor,
                           mask: torch.Tensor) -> torch.Tensor:
        """Unnormalised score of the given tag sequences, size ``(batch_size,)``."""
        # emissions: (seq_length, batch_size, num_tags)
        # tags: (seq_length, batch_size)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and tags.dim() == 2
        assert emissions.size()[:2] == tags.size()
        assert emissions.size(2) == self.num_tags
        assert mask.size() == tags.size()
        assert bool(mask[0].bool().all())

        seq_length = emissions.size(0)
        mask = mask.float()

        # Start transition score
        llh = self.start_transitions[tags[0]]  # (batch_size,)

        # https://pytorch.org/docs/stable/generated/torch.gather.html
        for i in range(seq_length - 1):
            cur_tag, next_tag = tags[i], tags[i + 1]

            # Emission score for current tag
            llh = llh + emissions[i].gather(1, cur_tag.view(-1, 1)).squeeze(1) * mask[i]

            # Transition score to next tag, only counted if the next tag is not
            # masked (mask == 1)
            llh = llh + self.transitions[cur_tag, next_tag] * mask[i + 1]

        # Find last tag index
        last_tag_indices = mask.long().sum(0) - 1  # (batch_size,)
        last_tags = tags.gather(0, last_tag_indices.view(1, -1)).squeeze(0)

        # End transition score
        llh = llh + self.end_transitions[last_tags]
        # Emission score of the final timestep; for shorter sequences mask[-1] is 0
        # and their last emission was already added inside the loop.
        llh = llh + emissions[-1].gather(1, last_tags.view(-1, 1)).squeeze(1) * mask[-1]

        return llh

    def _compute_log_partition_function(self,
                                        emissions: torch.Tensor,
                                        mask: torch.Tensor) -> torch.Tensor:
        """Log of the summed scores of all tag sequences, size ``(batch_size,)``."""
        # emissions: (seq_length, batch_size, num_tags)
        # mask: (seq_length, batch_size)
        assert emissions.dim() == 3 and mask.dim() == 2
        assert emissions.size()[:2] == mask.size()
        assert emissions.size(2) == self.num_tags
        assert bool(mask[0].bool().all())

        seq_length = emissions.size(0)
        mask = mask.float()

        # Start transition score and first emission
        log_prob = self.start_transitions.view(1, -1) + emissions[0]
        # Here, log_prob has size (batch_size, num_tags) where for each batch,
        # the j-th column stores the log probability that the current timestep has tag j
        for i in range(1, seq_length):
            # Broadcast log_prob over all possible next tags
            broadcast_log_prob = log_prob.unsqueeze(2)  # (batch_size, num_tags, 1)
            # Broadcast transition score over all instances in the batch
            broadcast_transitions = self.transitions.unsqueeze(0)  # (1, num_tags, num_tags)
            # Broadcast emission score over all possible current tags
            broadcast_emissions = emissions[i].unsqueeze(1)  # (batch_size, 1, num_tags)
            # Sum current log probability, transition, and emission scores
            score = broadcast_log_prob + broadcast_transitions \
                + broadcast_emissions  # (batch_size, num_tags, num_tags)
            # Sum over all possible current tags, but we're in log prob space, so a sum
            # becomes a log-sum-exp
            score = self._log_sum_exp(score, 1)  # (batch_size, num_tags)
            # Set log_prob to the score if this timestep is valid (mask == 1), otherwise
            # leave it alone
            step_mask = mask[i].unsqueeze(1)
            log_prob = score * step_mask + log_prob * (1. - step_mask)

        # End transition score
        log_prob = log_prob + self.end_transitions.view(1, -1)
        # Sum (log-sum-exp) over all possible tags
        return self._log_sum_exp(log_prob, 1)  # (batch_size,)

    def _viterbi_decode(self, emission: torch.Tensor) -> List[int]:
        """Best tag path for one sequence of emissions, size ``(seq_length, num_tags)``.

        Returns an empty list for an empty sequence. Entries are 0-dim tensors.
        """
        # emission: (seq_length, num_tags)
        assert emission.size(1) == self.num_tags

        seq_length = emission.size(0)
        if seq_length == 0:
            # Fully masked sequence: nothing to decode.
            return []

        start_transitions = self.start_transitions.detach()
        end_transitions = self.end_transitions.detach()
        transitions = self.transitions.detach()

        # Start transition
        viterbi_score = start_transitions + emission[0]
        viterbi_path = []
        # Here, viterbi_score has shape of (num_tags,) where value at index i stores
        # the score of the best tag sequence so far that ends with tag i
        # viterbi_path saves where the best tags candidate transitioned from; this is used
        # when we trace back the best tag sequence

        # Viterbi algorithm recursive case: we compute the score of the best tag sequence
        # for every possible next tag
        for i in range(1, seq_length):
            # Broadcast viterbi score for every possible next tag
            broadcast_score = viterbi_score.view(-1, 1)
            # Broadcast emission score for every possible current tag
            broadcast_emission = emission[i].view(1, -1)
            # Compute the score matrix of shape (num_tags, num_tags) where each entry at
            # row i and column j stores the score of transitioning from tag i to tag j
            # and emitting
            score = broadcast_score + transitions + broadcast_emission
            # Find the maximum score over all possible current tag
            viterbi_score, best_path = score.max(0)  # (num_tags,)
            viterbi_path.append(best_path)

        # End transition
        viterbi_score = viterbi_score + end_transitions

        # Find the tag which maximizes the score at the last timestep; this is our best tag
        # for the last timestep
        _, best_last_tag = viterbi_score.max(0)
        best_tags = [best_last_tag]

        # We trace back where the best last tag comes from, append that to our best tag
        # sequence, and trace it back again, and so on
        for path in reversed(viterbi_path):
            best_tags.append(path[best_tags[-1]])

        # Reverse the order because we start from the last timestep
        best_tags.reverse()
        return best_tags

    @staticmethod
    def _log_sum_exp(tensor: torch.Tensor, dim: int) -> torch.Tensor:
        """Numerically stable ``log(sum(exp(tensor)))`` along ``dim``."""
        return torch.logsumexp(tensor, dim)

    def _new(self, *args, **kwargs) -> torch.Tensor:
        """Allocate an uninitialised tensor with the dtype/device of the parameters."""
        param = next(self.parameters())
        return param.data.new(*args, **kwargs)


#### Encoder Layers

###### Bert Models

In [5]:
"""
This model uses the pre-trained distilbert model to extract word embedding.
Then apply a linear transformation and dropout on the last hidden state
from the pre-trained distilbert.
"""

class DistilBertForFeatureExtraction(nn.Module):
  """
  Frozen encoder wrapped as a feature extractor.

  Every parameter of the wrapped model has ``requires_grad`` switched off; only
  the linear projection created here is trainable. ``forward`` applies dropout
  and then the projection to the encoder's ``last_hidden_state``.

  :param distilbert_model: encoder module; called as ``model(input_ids, attention_mask=...)``
                           and expected to return a mapping with a ``'last_hidden_state'`` entry
  :param dropout_p: dropout probability applied before the projection
  :param emb_dim: size of the encoder's hidden states (input and output size of the projection)
  """
  def __init__(self, distilbert_model, dropout_p, emb_dim):
    super().__init__()
    self.model = distilbert_model
    # Freeze the backbone so that only the projection below receives gradients.
    for backbone_param in self.model.parameters():
      backbone_param.requires_grad = False
    self.dropout = nn.Dropout(dropout_p)
    self.out = nn.Linear(emb_dim, emb_dim)

  def forward(self, input_ids, attention_masks):
    encoder_output = self.model(input_ids, attention_mask=attention_masks)
    token_features = encoder_output['last_hidden_state'] # [batch_size, seq len, emb dim]
    return self.out(self.dropout(token_features))



In [6]:
class BertForFeatureExtraction(nn.Module):
  """
  Frozen encoder wrapped as a feature extractor.

  Every parameter of the wrapped model has ``requires_grad`` switched off; only
  the linear projection created here is trainable. ``forward`` applies dropout
  and then the projection to the encoder's ``last_hidden_state``.

  :param bert_model: encoder module; called as ``model(input_ids, attention_mask=...)``
                     and expected to return a mapping with a ``'last_hidden_state'`` entry
  :param dropout_p: dropout probability applied before the projection
  :param emb_dim: size of the encoder's hidden states (input and output size of the projection)
  """
  def __init__(self, bert_model, dropout_p, emb_dim):
    super().__init__()

    self.model = bert_model
    # Freeze the backbone so that only the projection below receives gradients.
    for param in self.model.parameters():
      param.requires_grad = False
    self.dropout = nn.Dropout(dropout_p)
    self.out = nn.Linear(emb_dim, emb_dim)

  def forward(self, input_ids, attention_masks):
    out = self.model(input_ids, attention_mask=attention_masks)
    last_hidden_state = out['last_hidden_state'] # [batch_size, seq len, emb dim]
    # The dropout layer is registered as `self.dropout`; the previous
    # `self.bert_dropout` lookup raised AttributeError on every call.
    last_hidden_state = self.out(self.dropout(last_hidden_state))

    return last_hidden_state

######  Recurrent Layer

In [7]:
class RecurNetLayer(nn.Module):
  def __init__(self, input_size, hidden_size, max_length, num_layers=1, bias=True, batch_first=True, dropout=0,
                 bidirectional=False, only_use_last_hidden_state=False, rnn_type = 'LSTM'):
    """
    Recurrent layer (LSTM / GRU / RNN) that handles padded, variable-length batches.

    :param input_size:The number of expected features in the input x
    :param hidden_size:The number of features in the hidden state h
    :param max_length: Maximum sequence length; the output is padded to this length
    :param num_layers:Number of recurrent layers.
    :param bias:If False, then the layer does not use bias weights b_ih and b_hh. Default: True
    :param batch_first:If True, then the input and output tensors are provided as (batch, seq, feature)
    :param dropout:If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer
    :param bidirectional:If True, becomes a bidirectional RNN. Default: False
    :param only_use_last_hidden_state: If True, forward returns only the last hidden state
    :param rnn_type: {LSTM, GRU, RNN}
    :raises ValueError: if rnn_type is not one of the supported names
    """
    super(RecurNetLayer, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.max_length = max_length
    self.num_layers = num_layers
    self.bias = bias
    self.batch_first = batch_first
    self.dropout = dropout
    self.bidirectional = bidirectional
    self.only_use_last_hidden_state = only_use_last_hidden_state
    self.rnn_type = rnn_type

    rnn_classes = {'LSTM': nn.LSTM, 'GRU': nn.GRU, 'RNN': nn.RNN}
    if rnn_type not in rnn_classes:
      # Fail here rather than with an AttributeError on the first forward call.
      raise ValueError(f"rnn_type must be one of {sorted(rnn_classes)}, got {rnn_type!r}")
    self.RNN = rnn_classes[rnn_type](
        input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
        bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)

  def forward(self, x, x_len):
    """
    :param x: padded sequence embedding vectors; (batch, seq, feature) when
              batch_first is True, (seq, batch, feature) otherwise
    :param x_len: tensor / numpy array / list with the true length of each sequence in the batch
    : return ht: only the last hidden state, when only_use_last_hidden_state is True
    : return out, (ht, ct): otherwise; out is padded to max_length along the time
              axis, ht is the last hidden state and ct the last cell state
              (None for GRU / RNN). All results are in the original batch order.
    REFERENCES -
    https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html
    https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_packed_sequence.html
    https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
    """
    # pack_padded_sequence requires the lengths on the CPU.
    lengths = torch.as_tensor(x_len, dtype=torch.int64).cpu()

    # enforce_sorted=False makes PyTorch sort by length internally and restore the
    # caller's batch order for both the outputs and the final states. Unlike a manual
    # x[sort_idx] on dim 0, this is also correct when batch_first is False.
    x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(
        x, lengths, batch_first=self.batch_first, enforce_sorted=False)

    if self.rnn_type == 'LSTM':
      out_pack, (ht, ct) = self.RNN(x_emb_p, None)
    else:
      out_pack, ht = self.RNN(x_emb_p, None)
      ct = None

    if self.only_use_last_hidden_state:
      return ht

    # unpack - out, padded with zeros up to max_length
    out, _ = torch.nn.utils.rnn.pad_packed_sequence(
        out_pack, batch_first=self.batch_first, total_length=self.max_length)

    return out, (ht, ct) # ht - last hidden state, ct - last memory cell

#### Co-Interactive Layer

In [8]:
class CILayer(nn.Module):
  '''
  Co-interactive layer: label attention followed by the co-interactive attention
  block, applied N times in sequence with the same (shared) weights.

  :param intent_fc: intent classifier layer, forwarded to Label_Attention and CI_Attention
  :param slot_fc: slot classifier layer, forwarded to Label_Attention and CI_Attention
  :param hidden_dim: hidden size forwarded to CI_Attention
  :param attention_dropout: dropout probability forwarded to CI_Attention
  :param use_gpu: device flag forwarded to CI_Attention
  '''
  def __init__(self, intent_fc, slot_fc, hidden_dim, attention_dropout, use_gpu):
    super().__init__()
    self.I_S_Emb = Label_Attention(intent_fc, slot_fc)
    self.ci_attention = CI_Attention(intent_fc, slot_fc, hidden_dim, attention_dropout, use_gpu)

  def forward(self, H, mask, N = 2):
    '''
    :param H: shared encoder output used as the starting point of both streams
    :param mask: padding mask passed through to both sub-modules
    :param N: number of times the block is applied
    :return: (intent representation, slot representation); (H, H) when N is 0
    '''
    # Both task streams start from the same encoder representation.
    intent_state, slot_state = H, H
    for _ in range(N):
      # label attention
      intent_label, slot_label = self.I_S_Emb(intent_state, slot_state, mask)
      # co interactive + feed forward - H + H_I = H + AW^v representing Hi (explicit intent representation)
      intent_state, slot_state = self.ci_attention(
          intent_state + intent_label, slot_state + slot_label, mask)

    return intent_state, slot_state


class Label_Attention(nn.Module):
  '''
  Inspired by: "Hierarchically-Refined Label Attention Network For Sequence Labelling" by Leyang Cui & Yue Zhang

  The weight matrices of the intent and slot classifier layers are reused as label
  embeddings W^v (v being either the intent or the slot label set); they remain
  tunable and are shared with those layers.

  The hidden states H play the role of the query and the label embeddings play the
  role of both key and value:
      A = softmax(H W^v.T)
      output = A W^v

  NOTE. The caller adds the output to H (the Bi-LSTM's output) to obtain the final label
  representation with semantic information (Hv)
  '''
  def __init__(self, intent_emb, slot_emb):
    super().__init__()

    # Assigning the Parameters registers them on this module under these names.
    self.W_intent_emb = intent_emb.weight
    self.W_slot_emb = slot_emb.weight

  def forward(self, input_intent, input_slot, mask):
    '''
    :param input_intent: hidden states of the intent stream
    :param input_slot: hidden states of the slot stream
    :param mask: accepted for a uniform call signature; not used here
    :return: (intent label representation, slot label representation)
    '''
    def attend_to_labels(hidden, label_emb):
      # A = softmax(H W^T) over the labels, then the weighted label mix A W.
      label_probs = torch.softmax(torch.matmul(hidden, label_emb.t()), dim=-1)
      return torch.matmul(label_probs, label_emb)

    intent_res = attend_to_labels(input_intent, self.W_intent_emb)
    slot_res = attend_to_labels(input_slot, self.W_slot_emb)
    return intent_res, slot_res


class CI_Attention(nn.Module): # I_S_Block
  '''
  References to "A CO-INTERACTIVE TRANSFORMER FOR JOINT SLOT FILLING AND INTENT DETECTION" by
  Libo Qin, Tailu Liu, Wanxiang Che, Bingbing Kang, Sendong Zhao, and Ting Liu

  One co-interactive block:
  1. co-interactive self-attention between the intent and slot streams (joint aware)
  2. add & norm of each attended stream with its own input
  3. joint feed-forward layer

  :param intent_emb: accepted for interface compatibility; not used here
  :param slot_emb: accepted for interface compatibility; not used here
  :param hidden_size: hidden size of both streams
  :param attention_dropout: dropout probability used by every sub-layer
  :param use_gpu: device flag forwarded to the feed-forward layer
  '''
  def __init__(self, intent_emb, slot_emb, hidden_size, attention_dropout, use_gpu):
    super().__init__()

    self.I_S_Attention = I_S_SelfAttention(hidden_size, 2 * hidden_size, hidden_size, attention_dropout)
    self.I_Out = TaskCI_Output(hidden_size, attention_dropout)
    self.S_Out = TaskCI_Output(hidden_size, attention_dropout)
    self.I_S_Feed_forward = FFL_I_S(hidden_size, hidden_size, attention_dropout, use_gpu)

  def forward(self, H_intent_input, H_slot_input, mask):
    # The attention module returns the slot context first, then the intent context.
    slot_context, intent_context = self.I_S_Attention(H_intent_input, H_slot_input, mask)
    # Add & norm each stream against its own input.
    intent_normed = self.I_Out(intent_context, H_intent_input)
    slot_normed = self.S_Out(slot_context, H_slot_input)
    # The feed-forward layer yields the (intent, slot) pair.
    H_intent, H_slot = self.I_S_Feed_forward(intent_normed, slot_normed)
    return H_intent, H_slot


class FFL_I_S(nn.Module):
  '''
  Joint feed-forward layer over the intent and slot streams.

  The two streams are concatenated on the feature axis; every position is then
  extended with the features of its left and right neighbour (zeros beyond the
  sequence boundaries), passed through Linear -> ReLU -> Linear -> Dropout, and
  the shared result is added to each stream and layer-normalised separately.

  :param ffl_size: width of the inner feed-forward layer
  :param hidden_size: feature size of each input stream
  :param attention_dropout: dropout probability applied to the feed-forward output
  :param use_gpu: kept for interface compatibility; the zero padding now follows the
                  device and dtype of the inputs, so the flag no longer affects the computation
  '''
  def __init__(self, ffl_size, hidden_size, attention_dropout, use_gpu):
    super(FFL_I_S, self).__init__()

    # 2 streams x 3 positions (self, left, right) = 6 * hidden_size input features
    self.dense_in = nn.Linear(hidden_size * 6, ffl_size)
    self.ffl_act_fn = nn.ReLU()
    self.dense_out = nn.Linear(ffl_size, hidden_size)
    self.LayerNorm_I = LayerNorm(hidden_size, eps=1e-12)
    self.LayerNorm_S = LayerNorm(hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(attention_dropout)
    self.use_gpu = use_gpu

  def forward(self, hidden_states_I, hidden_states_S):
    '''
    :param hidden_states_I: intent stream, (batch, seq, hidden)
    :param hidden_states_S: slot stream, (batch, seq, hidden)
    :return: (new intent stream, new slot stream), same shapes as the inputs
    '''
    hidden_states_in = torch.cat([hidden_states_I, hidden_states_S], dim=2)
    batch_size, max_length, joint_size = hidden_states_in.size()
    # Allocate the boundary padding next to the data (same device and dtype), so the
    # layer works wherever the model lives, independent of the use_gpu flag.
    h_pad = hidden_states_in.new_zeros(batch_size, 1, joint_size)
    h_left = torch.cat([h_pad, hidden_states_in[:, :max_length - 1, :]], dim=1)  # neighbour at t-1
    h_right = torch.cat([hidden_states_in[:, 1:, :], h_pad], dim=1)  # neighbour at t+1
    hidden_states_in = torch.cat([hidden_states_in, h_left, h_right], dim=2)

    hidden_states = self.dense_in(hidden_states_in)
    hidden_states = self.ffl_act_fn(hidden_states)
    hidden_states = self.dense_out(hidden_states)
    hidden_states = self.dropout(hidden_states)
    # Residual connection + layer norm, separately for each task stream.
    hidden_states_I_NEW = self.LayerNorm_I(hidden_states + hidden_states_I)
    hidden_states_S_NEW = self.LayerNorm_S(hidden_states + hidden_states_S)
    return hidden_states_I_NEW, hidden_states_S_NEW


class TaskCI_Output(nn.Module):
  '''
  Perform ADD & NORM: project the attended states, apply dropout, add the
  residual input and layer-normalise the sum.

  :param hidden_size: feature size of both the attended states and the residual
  :param hidden_dropout_prob: dropout probability applied after the projection
  '''
  def __init__(self, hidden_size, hidden_dropout_prob):
    super().__init__()

    self.dense = nn.Linear(hidden_size, hidden_size)
    self.LayerNorm = LayerNorm(hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(hidden_dropout_prob)

  def forward(self, hidden_states, input_tensor):
    '''
    :param hidden_states: output of the attention layer
    :param input_tensor: residual input that went into the attention layer
    :return: LayerNorm(dropout(dense(hidden_states)) + input_tensor)
    '''
    projected = self.dropout(self.dense(hidden_states))
    return self.LayerNorm(projected + input_tensor)


class I_S_SelfAttention(nn.Module):
  '''
  References to "A CO-INTERACTIVE TRANSFORMER FOR JOINT SLOT FILLING AND INTENT DETECTION" by
  Libo Qin, Tailu Liu, Wanxiang Che, Bingbing Kang, Sendong Zhao, and Ting Liu

  Multi-head co-interactive attention. Each of Q, K and V is a linear layer, with a
  separate set of projections per direction:

  - intent context: queries come from the intent stream, keys and values from the slot stream
  - slot context:   queries come from the slot stream, keys and values from the intent stream

  The attention itself is scaled dot-product attention ("Attention Is All You Need",
  Vaswani et al., 2017). The caller applies Add & Norm to the results and feeds them
  into the feed-forward layer.

  :param input_size: feature size of both input streams
  :param hidden_size: total size of the query/key projections (split across the heads)
  :param out_size: total size of the value projections and of each returned context
  :param attention_dropout: dropout probability applied to the attention weights
  :param num_attention_heads: number of attention heads (default 8)
  :raises ValueError: if num_attention_heads is not positive or does not divide out_size
  '''
  def __init__(self, input_size, hidden_size, out_size, attention_dropout, num_attention_heads=8):
    super(I_S_SelfAttention, self).__init__()

    if num_attention_heads <= 0:
      raise ValueError(f'num_attention_heads must be positive, got {num_attention_heads}')
    if out_size % num_attention_heads != 0:
      # The value projection is reshaped into (heads, out_size / heads) in forward.
      raise ValueError(
          f'out_size ({out_size}) must be divisible by num_attention_heads ({num_attention_heads})')

    self.num_attention_heads = num_attention_heads
    self.attention_head_size = int(hidden_size // self.num_attention_heads)

    self.all_head_size = self.num_attention_heads * self.attention_head_size
    self.out_size = out_size
    self.query = nn.Linear(input_size, self.all_head_size)
    self.query_slot = nn.Linear(input_size, self.all_head_size)
    self.key = nn.Linear(input_size, self.all_head_size)
    self.key_slot = nn.Linear(input_size, self.all_head_size)
    self.value = nn.Linear(input_size, self.out_size)
    self.value_slot = nn.Linear(input_size, self.out_size)
    self.dropout = nn.Dropout(attention_dropout)

  def transpose_for_scores(self, x):
    '''Split the last dimension into heads: (batch, seq, dim) -> (batch, heads, seq, dim / heads).'''
    last_dim = x.size()[-1] // self.num_attention_heads
    new_x_shape = x.size()[:-1] + (self.num_attention_heads, last_dim)
    x = x.view(*new_x_shape)
    return x.permute(0, 2, 1, 3)

  def _attend(self, query, key, value, attention_mask):
    '''Scaled dot-product attention for one direction; returns (batch, seq, out_size).'''
    query_layer = self.transpose_for_scores(query)
    key_layer = self.transpose_for_scores(key)
    value_layer = self.transpose_for_scores(value)

    # Q . K^T / sqrt(d_k)
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
    # Padded key positions receive a very large negative score before the softmax.
    attention_probs = torch.softmax(attention_scores + attention_mask, dim=-1)
    attention_probs = self.dropout(attention_probs)

    # Weighted sum of the values, then merge the heads back together.
    context_layer = torch.matmul(attention_probs, value_layer)
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
    return context_layer.view(*context_layer.size()[:-2], self.out_size)

  def forward(self, intent, slot, mask):
    '''
    :param intent: intent stream, (batch, seq, input_size)
    :param slot: slot stream, (batch, seq, input_size)
    :param mask: padding mask, (batch, seq), 1 for real tokens and 0 for padding
    :return: (slot context, intent context), each (batch, seq, out_size)
    '''
    extended_attention_mask = mask.unsqueeze(1).unsqueeze(2)  # (batch, 1, 1, seq)
    extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
    attention_mask = (1.0 - extended_attention_mask) * -10000.0 # did not use -inf because "0.0 * -inf is not 0.0"

    # Intent-Aware Slot Representation: slot queries look at the intent stream.
    # Computed first so that dropout draws its random mask for the slot direction
    # before the intent direction.
    context_layer_slot = self._attend(
        self.query_slot(slot), self.key_slot(intent), self.value_slot(intent), attention_mask)

    # Slot-Aware Intent Representation: intent queries look at the slot stream.
    context_layer_intent = self._attend(
        self.query(intent), self.key(slot), self.value(slot), attention_mask)

    return context_layer_slot, context_layer_intent

#### CoinBert Transformer Model

In [9]:
class CBTModel(nn.Module):
  '''
  Joint intent detection and slot filling model.

  Pipeline, as wired in the forward methods: an embedding front end (BERT /
  DistilBERT feature extractor or a GloVe lookup table) -> shared recurrent
  encoder -> co-interactive attention layer -> linear decoders. A CRF layer
  sits on top of the slot logits for both the loss and the decoding.
  '''
  def __init__(self, hidden_dim, batch_size, max_length, n_class, n_tag, dropout1, dropout2, dropout3, dropout4, emb_dim, pretrainedModel=None, pretrainedType="distil", gloveEmbedding=None, use_gpu=False):
    '''
    Parameters
    ----------
    hidden_dim: int
      Size of the encoder output; the recurrent layer gets hidden_dim // 2 per direction

    batch_size, max_length: int
      Stored on the instance; max_length is also forwarded to the recurrent layer

    n_class, n_tag: int
      Number of intent classes / slot tags (output sizes of the two decoders)

    dropout1, dropout2, dropout3, dropout4: float
      Dropout probabilities for the embedding output, the BERT wrapper,
      the recurrent layer and the co-interactive attention layer respectively

    emb_dim: int
      Embedding size fed into the recurrent layer

    pretrainedModel:
      When truthy, a (Distil)BERT feature extractor is built from it

    pretrainedType: str
      "distil" selects the DistilBERT wrapper, anything else the BERT wrapper

    gloveEmbedding: array-like or None
      Pre-trained embedding matrix used to build the GloVe lookup table

    use_gpu: bool
      Forwarded to the co-interactive attention layer
    '''
    super(CBTModel, self).__init__()
    self.hidden_dim = hidden_dim
    self.batch_size = batch_size
    self.max_length = max_length
    self.n_class = n_class
    self.n_tag = n_tag
    self.emb_dropout_p = dropout1
    self.bert_dropout_p = dropout2
    self.lstm_dropout_p = dropout3
    self.attention_dropout_p = dropout4
    self.LayerNorm = LayerNorm(self.hidden_dim, eps=1e-12)
    self.emb_dropout = nn.Dropout(self.emb_dropout_p)
    if pretrainedModel:
      if pretrainedType=="distil":
        self.bertLayer = DistilBertForFeatureExtraction(pretrainedModel, dropout_p=self.bert_dropout_p, emb_dim=emb_dim)
      else:
        self.bertLayer = BertForFeatureExtraction(pretrainedModel, dropout_p=self.bert_dropout_p, emb_dim=emb_dim)
    # The embedding matrix is normally a numpy array; evaluating its truth
    # value directly raises "truth value of an array ... is ambiguous", so
    # test for presence / emptiness explicitly.
    if gloveEmbedding is not None and len(gloveEmbedding) > 0:
      self.gloveLayer = nn.Embedding.from_pretrained(torch.tensor(gloveEmbedding, dtype=torch.float), padding_idx=0)
      self.gloveLayer.weight.requires_grad = True # fine-tune the pre-trained vectors
    self.recurNetLayer = RecurNetLayer(emb_dim, hidden_dim // 2, self.max_length, bidirectional=True, batch_first=True, dropout=self.lstm_dropout_p, num_layers=1)
    self.intent_fc = nn.Linear(self.hidden_dim, self.n_class)
    self.slot_fc = nn.Linear(self.hidden_dim, self.n_tag)
    self.ciLayer = CILayer(self.intent_fc, self.slot_fc, self.hidden_dim, self.attention_dropout_p, use_gpu)
    self.crflayer = CRF(self.n_tag) # number of slot tags
    self.loss_fnc = nn.CrossEntropyLoss()

  def _encode_interact_decode(self, x_emb, x_len, mask):
    '''
    Shared tail of both forward paths: recurrent encoder -> co-interactive
    attention -> intent / slot decoders.

    Return
    ------
    logits_intent, logits_slot: outputs of the intent and slot linear decoders
    '''
    # Shared Encoder Layer
    H, (_, _) = self.recurNetLayer(x_emb, x_len)

    # Co-Interactive Attention Layer
    H_I, H_S = self.ciLayer(H, mask)

    # Decoder Layer
    '''
    The max pooled followed "Convolutional Neural Networks for Sentence Classification"
    by Yoon Kim 2014, to obtain the sent representation c
    '''
    intent_input = F.max_pool1d((H_I + H).transpose(1, 2), H_I.size(1)).squeeze(2)
    logits_intent = self.intent_fc(intent_input)
    logits_slot = self.slot_fc(H_S + H)

    return logits_intent, logits_slot

  def forward_bert(self, input_ids, attention_masks, mask):
    '''
    Forward pass for the BERT / DistilBERT front end.

    Parameters
    ----------
    input_ids: presumably shape (batch_size, seq_length)
      The idx of input sequence, passed straight to the BERT wrapper

    attention_masks: shape (batch_size, seq_length)
      The attention masks for the input sequence; non-zero entries are
      counted per row to obtain the sequence lengths

    mask: shape (batch_size, seq_length)
      The mask for the input sequence, handed to the co-interactive layer

    Return
    ------
    logits_intent: shape (batch_size, n_intent_label)
      Unnormalised intent scores

    logits_slot: shape (batch_size, seq_length, n_slot_label)
      Unnormalised slot scores (CRF emissions)
    '''
    x_len = torch.sum(attention_masks !=0, dim=-1)

    # Bert Layer
    last_hidden_state = self.bertLayer(input_ids, attention_masks)
    x_emb = self.emb_dropout(last_hidden_state)

    return self._encode_interact_decode(x_emb, x_len, mask)

  def forward_glove(self, x, mask):
    '''
    Forward pass for the GloVe front end.

    Parameters
    ----------
    x: pair (token_ids, char_ids)
      token_ids has shape (batch_size, seq_length) with 0 as padding;
      char_ids is unpacked but not used by this method

    mask: shape (batch_size, seq_length)
      The mask for input sequence

    Return
    ------
    logits_intent: shape (batch_size, n_intent_label)
      Unnormalised intent scores

    logits_slot: shape (batch_size, seq_length, n_slot_label)
      Unnormalised slot scores (CRF emissions)
    '''
    x, _ = x
    x_len = torch.sum(x != 0, dim=-1) # number of non-padding tokens per sample
    x_emb = self.emb_dropout(self.gloveLayer(x))

    return self._encode_interact_decode(x_emb, x_len, mask)


  def loss1(self, logits_intent, logits_slot, intent_label, slot_label, mask):
    '''
    The intent detection used crossentropy on the max pooled representation
    where y^predict = softmax(W^Ic + bS), o^I = argmax(y^predict)

    The slot filling Followed "A Novel Bi-directional Interrelated Model for Joint Intent Detection and Slot Filling"
    by Haihong, Peiqing, Zhongfu, and Meina, 2019

    Positions labelled -100 (BERT sub-tokens) are masked out and moved to the
    end of every sequence so the CRF only scores whole-word positions.

    Parameters
    ----------
    logits_intent: shape (batch_size, n_intent_label)
      Unnormalised intent scores

    logits_slot: shape (batch_size, seq_length, n_slot_label)
      Unnormalised slot scores

    intent_label: shape (batch_size,)
      The actual intent label

    slot_label: shape (batch_size, seq_length)
      The actual slot label, -100 marks sub-token positions

    mask: shape (batch_size, seq_length)
      The mask for the input sequence

    Return
    ------
    loss_intent: scalar tensor
      intent loss

    loss_slot: scalar tensor
      slot loss (negated CRF output divided by the batch size)
    '''
    mask = mask[:, 0:logits_slot.size(1)]
    slot_label = slot_label[:, 0:logits_slot.size(1)]

    # Shifting bert subtoken
    subtoken_mask = (slot_label[:,:] == -100)
    zero = torch.tensor(0, device=slot_label.device)
    slot_label = torch.where(subtoken_mask, zero, slot_label)
    mask = torch.where(subtoken_mask, zero, mask)
    # Move subtoken to behind
    # NOTE(review): labels are reordered on "!= 0", which matches the logits
    # order below only if index 0 never labels a real, unmasked position
    # (presumably 0 is <PAD>) - verify against slot_label.txt.
    slot_label = torch.gather(slot_label, 1, slot_label.ne(0).argsort(dim=1, descending=True, stable=True))
    mask = torch.gather(mask, 1, mask.ne(0).argsort(dim=1, descending=True, stable=True))

    # Shift logits_slot to match shifted label: a stable sort on the sub-token
    # flag keeps the valid positions first (original order) and the sub-token
    # positions last. One gather replaces the per-element Python loop and also
    # copes with rows that contain no valid position at all.
    order = subtoken_mask.long().argsort(dim=1, stable=True)
    gather_index = order.unsqueeze(-1).expand(-1, -1, logits_slot.size(-1))
    logits_slot = torch.gather(logits_slot, 1, gather_index)

    logits_slot = logits_slot.transpose(1, 0) # transpose to seq, batch, tag
    slot_label = slot_label.transpose(1, 0)
    mask = mask.transpose(1, 0)

    loss_intent = self.loss_fnc(logits_intent, intent_label)
    loss_slot = -self.crflayer(logits_slot, slot_label, mask) / logits_intent.size()[0]
    return loss_intent, loss_slot


  def pred_intent_slot(self, logits_intent, logits_slot, mask):
    """
    Obtain the predicted intent and slot label

    Parameter
    --------
    logits_intent: shape (batch_size, n_intent_label)
      intent label scores

    logits_slot: shape (batch_size, seq_length, n_slot_label)
      slot label scores

    mask: shape (batch_size, seq_length)
      Attention Mask, cut to the length of logits_slot before decoding

    Return
    ------
    pred_intent: shape (batch_size,)
      Index of the highest scoring intent for every sample

    pred_slot:
      Output of the CRF layer's decode() for the masked sequences
    """
    mask = mask[:, 0:logits_slot.size(1)]
    mask = mask.transpose(1, 0)
    logits_slot = logits_slot.transpose(1, 0) # CRF expects seq, batch, tag
    pred_intent = torch.max(logits_intent, 1)[1]
    pred_slot = self.crflayer.decode(logits_slot, mask=mask)
    return pred_intent, pred_slot


## Main

#### Data Loader

###### Utils

In [10]:
def prepare_labels(file_path):
  """
  Load the intent and slot label dictionaries found under `file_path`.

  `file_path` is used as a plain string prefix for "intent_label.txt" and
  "slot_label.txt", so it has to end with a path separator.

  Returns (idx2intent, intent2idx, idx2slot, slot2idx, n_slot_tag, n_intent_class).
  """
  intent_maps = load_label_dict(file_path + "intent_label.txt")
  slot_maps = load_label_dict(file_path + "slot_label.txt")
  idx2intent, intent2idx = intent_maps
  idx2slot, slot2idx = slot_maps
  # number of entries of each dictionary = number of slot tags / intent classes
  return idx2intent, intent2idx, idx2slot, slot2idx, len(idx2slot), len(idx2intent)

def load_label_dict(file_path):
  """
  Load Intent and Slot Dictionary

  Every non-blank line of the file holds an integer id and a label separated
  by a single tab. Blank lines (e.g. a trailing empty line) are skipped.

  Returns
  -------
  id2label: dict mapping int id -> label
  label2id: dict mapping label -> int id

  Raises
  ------
  ValueError when a non-blank line does not consist of exactly two
  tab-separated fields or the id is not an integer.
  """
  label2id = {}
  id2label = {}
  # `with` closes the handle even when a malformed line raises
  with open(file_path, 'r', encoding='UTF-8') as f:
    for item in f:
      item = item.strip()
      if not item:
        continue
      idx, label = item.split("\t")
      label2id[label] = int(idx)
      id2label[int(idx)] = label
  return id2label, label2id

def remove_punc(seqs):
  """
  Second stage of punctuation removal, mainly to remove punctuation in
  date, type data.

  Each sequence is split on single spaces; apostrophes (curly and straight)
  are dropped so contractions are joined, every ASCII punctuation character
  is deleted, and tokens that end up empty are discarded.
  """
  strip_table = str.maketrans("", "", s.punctuation)

  def _clean(token):
    # join apostrophe characters (e.g it’s / it's -> its)
    token = token.replace("’", "").replace("'", "")
    # delete punctuation, then collapse any remaining white space
    return ' '.join(token.translate(strip_table).split())

  cleaned_seqs = []
  for seq in seqs:
    kept = [tok for tok in map(_clean, seq.split(" ")) if tok != "" and tok != " "]
    cleaned_seqs.append(' '.join(kept))
  return cleaned_seqs

def create_dataset(**kwargs):
  """
  Build the dataset matching the supplied keyword arguments.

  Without `input_ids` (or with `input_ids=None`) a GloveDataset is created
  from token_lists / char_lists; otherwise a BertDataset is created from
  input_ids / attention_masks / subtoken_mask_lists. slot_lists, intent_lists
  and mask_lists are shared by both. Missing keywords default to None.
  """
  fetch = kwargs.get
  # labels and mask are common to both dataset flavours
  shared = (fetch('slot_lists', None), fetch('intent_lists', None), fetch('mask_lists', None))

  input_ids = fetch('input_ids', None)
  if input_ids is None:
    # Glove Dataset
    return GloveDataset(fetch('token_lists', None), fetch('char_lists', None), *shared)
  # Bert Dataset
  return BertDataset(input_ids, fetch('attention_masks', None), *shared, fetch('subtoken_mask_lists', None))

# ---Context Dependent Embedding Processing Functions---
def generate_subword_mask(slot_list):
  """
  Return a 0/1 list parallel to `slot_list`: 0 where the entry is -100 or 0,
  1 everywhere else.
  """
  flags = []
  for sl in slot_list:
    is_ignored = sl == -100 or sl == 0
    flags.append(0 if is_ignored else 1)
  return flags

def align_labels(input_ids, max_length, slot_labels, tokenizer):
  '''
  Since bert uses subword pieces, the slot tags need to be re-aligned. According
  to the Hugging Face documentation, one solution is to give every continuation
  piece ("##...") a placeholder label (mapped to -100 by the caller) so that only
  the first piece of a word carries the true label.

  https://huggingface.co/docs/transformers/tasks/token_classification

  Parameters
  ----------
  input_ids:
    Token ids of ONE encoded sequence, including the leading special token
    ([CLS]) added by the tokenizer

  max_length: int
    Maximum encoded length; the result is cut to max_length - 2 entries so the
    caller can still add the <start> / <end> labels

  slot_labels: list of str
    Word level slot labels, without <start> / <end>; not modified

  tokenizer:
    Object providing convert_ids_to_tokens(input_ids)

  Return
  ------
  A new list of labels with "I-fake_label" inserted at every sub-token position
  '''
  fake_label = "I-fake_label"
  ids2tokens = tokenizer.convert_ids_to_tokens(input_ids)

  # Work on a copy: inserting into the caller's list would silently change it
  aligned_labels = list(slot_labels)
  for i, token in enumerate(ids2tokens):
    if token[:2] == "##":
      # Token position i includes the leading [CLS], while the label list has
      # no <start> entry yet, so the matching label slot is i - 1.
      aligned_labels.insert(max(i - 1, 0), fake_label)

  if len(aligned_labels) > max_length - 2: # For [CLS] and [SEP]
    aligned_labels = aligned_labels[0 : (max_length - 2)]
  return aligned_labels

def process_ctx_dp_emb(special_token, emb_dim):
  '''
  BERT handles its special tokens by itself, but their weights can be
  customised to follow the original paper settings.

  A fresh uniform(-0.01, 0.01) vector of size `emb_dim` is drawn for each of
  [UNK] (bert id = 100), [CLS] (bert id = 101) and [SEP] (bert id = 102) on
  every call; the vector for `special_token` is returned, or None for any other
  token ([PAD] and [MASK] are deliberately not customised).
  '''
  drawn = {
      name: np.random.uniform(-0.01, 0.01, size = emb_dim)
      for name in ("[UNK]", "[CLS]", "[SEP]")
  }
  if special_token in drawn:
    return drawn[special_token]
  return None

def load_data_ctx_dp(file_path, max_length, intent2idx, slot2idx, tokenizer):
  '''
  Load one data split for the context dependent (BERT / DistilBERT) pipeline.

  Reads seq.txt, intent.txt and slot.txt from `file_path` (line i of each file
  describes sample i), strips punctuation from the sequences, encodes them
  with `tokenizer` padded / truncated to `max_length`, and builds the slot
  label ids aligned with the sub-word pieces.

  REFERENCES -
  https://huggingface.co/docs/transformers/v4.37.0/en/main_classes/tokenizer#transformers.BatchEncoding
  https://huggingface.co/docs/transformers/main_classes/tokenizer

  Parameters
  ----------
  file_path: directory containing seq.txt, intent.txt and slot.txt
  max_length: encoded sequence length (including the special tokens)
  intent2idx: dict mapping intent label -> id
  slot2idx: dict mapping slot label -> id, must contain <start>, <end>, <PAD>
  tokenizer: tokenizer providing batch_encode_plus and convert_ids_to_tokens

  Return
  ------
  input_ids, attention_mask: tensors returned by the tokenizer
  slot_lists: per sample list of slot ids (-100 on sub-token positions)
  intent_lists: per sample intent id
  mask_lists: per sample attention mask as numpy array
  subtoken_mask_lists: per sample 0/1 list from generate_subword_mask
  '''
  def _read_lines(name):
    # the context manager closes the handle even if reading fails
    with open(os.path.join(file_path, name), encoding='utf-8') as handle:
      return handle.readlines()

  content_seq = _read_lines('seq.txt')
  content_intent = _read_lines('intent.txt')
  content_slot = _read_lines('slot.txt')
  slot_lists, intent_lists, mask_lists, subtoken_mask_lists = [], [], [], []

  # Best-effort sanity check: report, but keep going (explicit test instead
  # of assert so it is not stripped when Python runs with -O)
  if not (len(content_intent)==len(content_seq) and len(content_slot)==len(content_seq)):
    print(f"Error: Lengths are not equal between seq, intent, slot from file_path")

  # remove "\n" character; rstrip only touches the line break, so a final
  # line without trailing newline keeps its last character
  content_seq = [t.rstrip("\n") for t in content_seq]

  # remove punc
  content_seq = remove_punc(content_seq)

  bert_encodings = tokenizer.batch_encode_plus(
      content_seq,
      add_special_tokens = True,
      max_length = max_length,
      padding='max_length',
      return_attention_mask = True,
      return_tensors = 'pt',
      truncation = True
  )
  input_ids = bert_encodings['input_ids']
  attention_masks = bert_encodings['attention_mask']

  for idx in range(len(content_seq)):
    mask_list = attention_masks[idx].squeeze(0)

    # Encode Intent Label
    intent = content_intent[idx].strip().split(" ")[0]

    slot_labels = content_slot[idx].strip().split(" ")

    # Align Slot tag and sequence with subwords (##)
    slot_labels = align_labels(input_ids[idx], max_length, slot_labels, tokenizer)

    # Encode Slot Label, sub-token placeholders become -100
    slot_list = [slot2idx[l] if l != 'I-fake_label' else -100 for l in slot_labels]
    slot_list = [slot2idx["<start>"]] + slot_list + [slot2idx["<end>"]]

    # PAD Slot
    while len(slot_list) < max_length:
      slot_list.append(slot2idx["<PAD>"])

    # Generate Subtoken Mask
    subtoken_mask_list = generate_subword_mask(slot_list)

    lengths_ok = (
        input_ids[idx].shape[0]==max_length
        and attention_masks[idx].shape[0]==max_length
        and len(slot_list)==max_length
        and mask_list.shape[0]==max_length
        and len(subtoken_mask_list)==max_length
    )
    if not lengths_ok:
      print(f"Error: Unequal token_list, slot_list, mask_list length with max_length")

    intent_lists.append(intent2idx[intent])
    slot_lists.append(slot_list)
    mask_lists.append(np.array(mask_list))
    subtoken_mask_lists.append(subtoken_mask_list)
  return input_ids, attention_masks, slot_lists, intent_lists, mask_lists, subtoken_mask_lists

# ---Context Independent Embedding Processing Functions---
def load_glove_embedding(file_path, args):
  """
  Load the GloVe vectors restricted to the task vocabulary.

  `file_path` is used as a string prefix: "<file_path>vocab.txt" lists the
  vocabulary (one word per line) and "<file_path>emb_word.txt" caches the
  matching embedding lines. When the cache is missing it is built from
  "glove.6B.300d.txt" (looked up in the working directory) and written out.

  Parameters
  ----------
  file_path: prefix of the vocab / cache files
  args: object exposing `emb_dim`, the embedding size

  Return
  ------
  embedding_word: embedding matrix from process_ctx_indp_emb
  vocab: dict mapping word -> row index of the matrix
  """
  # Context managers make sure every handle is closed again
  with open(file_path + "vocab.txt", "r", encoding="utf-8") as vocab_file:
    vocab_list = [word.strip() for word in vocab_file]

  cache_path = file_path + "emb_word.txt"
  if not os.path.exists(cache_path):
    emb_file = "glove.6B.300d.txt"
    embeddings = get_ctx_indp_emb(emb_file, vocab_list)
    with open(cache_path, "w", encoding="utf-8") as emb_write:
      # the collected lines still carry their line breaks
      emb_write.writelines(embeddings)
  else:
    with open(cache_path, "r", encoding="utf-8") as embedding_file:
      embeddings = [emb.strip() for emb in embedding_file]
  embedding_word, vocab = process_ctx_indp_emb(embeddings, emb_dim=args.emb_dim)
  return embedding_word, vocab

def process_ctx_indp_emb(embedding, emb_dim):
  """
  Turn raw embedding lines ("word v1 v2 ...") into a matrix plus vocabulary.

  Four special rows come first: "<pad>" (all zeros) followed by "<unk>",
  "</s>" and "</e>" (each drawn from uniform(-0.01, 0.01)). The words of
  `embedding` follow in order; a repeated word keeps its first row index and
  takes the last vector seen.

  Returns (embedding_matrix, word2id).
  """
  table = {"<pad>": np.zeros(emb_dim)}
  for special in ("<unk>", "</s>", "</e>"):
    table[special] = np.random.uniform(-0.01, 0.01, size = emb_dim)

  for raw in embedding:
    fields = raw.strip().split()
    # first field is the word, the rest are the vector components
    table[fields[0]] = np.array([float(value) for value in fields[1:]])

  # dicts keep insertion order, so row i of the matrix belongs to word i
  word2id = {word: row for row, word in enumerate(table)}
  embedding_matrix = np.array(list(table.values()))
  return embedding_matrix, word2id

def get_ctx_indp_emb(file_path, vocab_list):
  """
  Collect the lines of an embedding file whose word is part of the vocabulary.

  Parameters
  ----------
  file_path: text file with one "word v1 v2 ..." entry per line
  vocab_list: iterable of words to keep

  Return
  ------
  The matching lines, unmodified (line breaks included), in file order
  """
  # Set membership is O(1); testing against the list for every line of a
  # large embedding file would cost O(len(vocab)) each time
  vocab = frozenset(vocab_list)
  embeddings = []
  with open(file_path, 'r', encoding='UTF-8') as f:
    for emb in tqdm(f, "get_ctx_indp_emb"):
      # only the first field (the word) is needed for the lookup
      if emb.strip().split(" ", 1)[0] in vocab:
        embeddings.append(emb)
  return embeddings

def load_data_ctx_indp(file_path, max_length, intent2idx, slot2idx, vocab):
  """
  Load one data split for the context independent (GloVe) pipeline.

  The file holds one utterance per paragraph: "token slot" lines, one line
  with the intent label, then a blank line that closes the utterance. An
  utterance at the very end of the file is emitted even when the closing
  blank line is missing.

  :param file_path: data file location
  :param max_length: maximum allowed input sequence length (incl. </s>, </e>)
  :param intent2idx: encoding for intent
  :param slot2idx: encoding for slot tag, must contain <start>, <end>, <PAD>
  :param vocab: dictionary word -> id, must contain <unk>, </s>, </e>
  :return: token_lists, char_lists, slot_lists, intent_lists, mask_lists;
    every per-utterance entry is padded to max_length, characters are padded /
    cut to 25 ids per token
  """
  max_word_len = 25 # number of character ids kept per token
  char2idx = {"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,"h":8,"i":9,"j":10,"k":11,"l":12,"m":13,"n":14,
              "o":15,"p":16,"q":17,"r":18,"s":19,"t":20,"u":21,"v":22,"w":23,"x":24,"y":25,"z":26,"'":27,"unk":28}
  with open(file_path, encoding='utf-8') as file:
    content = file.readlines()

  token_lists, slot_lists, intent_lists, mask_lists = [], [], [], []
  char_lists = []

  def _encode_chars(token):
    # character ids of one token, zero padded / cut to max_word_len
    chars = [char2idx[c] if c in char2idx else char2idx["unk"] for c in token]
    if len(chars) < max_word_len:
      return chars + (max_word_len - len(chars)) * [0]
    return chars[:max_word_len]

  def _emit(tokens, slots, words):
    '''
    Close one utterance:
    1. cut to max length - 2 and add BOS & EOS token (</s>, </e>)
    2. pad up to max length
    '''
    tokens = tokens[0 : (max_length - 2)]
    words = words[0 : (max_length - 2)]
    slots = slots[0 : (max_length - 2)]

    chars = [max_word_len*[0]] + [_encode_chars(word) for word in words] + [max_word_len*[0]]
    tokens = [vocab["</s>"]] + tokens + [vocab["</e>"]]
    slots = [slot2idx["<start>"]] + slots + [slot2idx["<end>"]]
    mask = [1] * len(tokens)
    while len(tokens) < max_length:
      chars.append(max_word_len*[0])
      tokens.append(0)
      slots.append(slot2idx["<PAD>"])
      mask.append(0)

    # Best-effort sanity check: report, but keep the sample
    lengths_ok = len(tokens)==max_length and len(slots) == max_length and len(mask)==max_length and len(chars)==max_length
    if not lengths_ok:
      print(f"Error: Unequal token_list, slot_list, mask_list, char_list length with max_length")

    token_lists.append(tokens)
    slot_lists.append(slots)
    mask_lists.append(mask)
    char_lists.append(chars)

  token_list, slot_list, query_list = [], [], []
  for line in content:
    line = line.strip()
    if line != "":
      fields = line.split(" ")
      if len(fields) == 1: # intent label line
        intent_lists.append(intent2idx[fields[0]])
      if len(fields) == 2: # data file - (token, tag) pairs
        token, slot = fields[0], fields[1]
        if token not in vocab:
          token_list.append(vocab["<unk>"])
        else:
          token_list.append(vocab[token])
        query_list.append(token)
        slot_list.append(slot2idx[slot])
    else:
      # blank line: the current utterance is complete
      _emit(token_list, slot_list, query_list)
      token_list, slot_list, query_list = [], [], []

  # File ended without a closing blank line: do not drop the last utterance,
  # its intent has already been recorded above
  if token_list:
    _emit(token_list, slot_list, query_list)
  return token_lists, char_lists, slot_lists, intent_lists, mask_lists

###### Datasets

In [11]:
class BertDataset(TensorDataset):
  """
  Dataset for the BERT / DistilBERT pipeline. One item is the tuple
  (input_ids, attention_mask, slot labels, intent label, mask, sub-token mask)
  of a single sample.
  """
  def __init__(self, input_ids, attention_masks, slot_lists, intent_lists, mask_lists, subtoken_mask_lists):
    def as_long(values):
      return torch.tensor(values, dtype=torch.int64)

    # input_ids / attention_masks are stored exactly as passed in
    self.input_ids = input_ids
    self.attention_masks = attention_masks
    self.slot_lists = as_long(slot_lists)
    self.intent_lists = as_long(intent_lists)
    self.mask_lists = torch.tensor(mask_lists) # dtype inferred from the data
    self.subtoken_mask_lists = as_long(subtoken_mask_lists)

  def _fields(self):
    # the six per-sample fields in the order they are returned
    return (self.input_ids, self.attention_masks, self.slot_lists,
            self.intent_lists, self.mask_lists, self.subtoken_mask_lists)

  def __len__(self):
    # mean length of the six fields, truncated to int
    total = sum(len(field) for field in self._fields())
    return int(total / 6)

  def __getitem__(self, index):
    return tuple(field[index] for field in self._fields())


class GloveDataset(TensorDataset):
  """
  Dataset for the GloVe pipeline. One item is the tuple
  (token ids, character ids, slot labels, intent label, mask) of a single
  sample; every field is stored as an int64 tensor.
  """
  # The constructor has to be spelled __init__: under any other name it is
  # never called and the lists end up in TensorDataset.__init__, which
  # expects tensors.
  def __init__(self, token_lists, char_lists, slot_lists, intent_lists, mask_lists):
    self.token_lists = torch.tensor(token_lists, dtype=torch.int64)
    self.char_lists = torch.tensor(char_lists, dtype=torch.int64)
    self.slot_lists = torch.tensor(slot_lists, dtype=torch.int64)
    self.intent_lists = torch.tensor(intent_lists, dtype=torch.int64)
    self.mask_lists = torch.tensor(mask_lists, dtype=torch.int64)

  def __len__(self):
    # mean length of the five fields, truncated to int
    return int((len(self.token_lists) + len(self.char_lists) + len(self.slot_lists) + len(self.intent_lists) + len(self.mask_lists)) / 5)

  def __getitem__(self, index):
    return self.token_lists[index], self.char_lists[index], self.slot_lists[index], self.intent_lists[index], self.mask_lists[index]

#### Train

###### Optimizer

In [12]:
'''
REFERENCES -
https://pytorch.org/docs/stable/_modules/torch/optim/radam.html#RAdam
https://pytorch.org/docs/stable/_modules/torch/optim/adamw.html#AdamW
https://github.com/kangbrilliant/DCA-Net/blob/master/model/Radam.py
'''
class RAdam(Optimizer):
    """
    Rectified Adam optimizer.

    While the approximated length of the simple moving average (N_sma) is
    below 5 the adaptive term is not trusted: the update falls back to SGD
    with momentum when `degenerated_to_sgd` is True, otherwise the parameter
    is left untouched. From N_sma >= 5 on the rectified Adam update is used.

    Parameters
    ----------
    params: iterable of parameters or dicts defining parameter groups
    lr: learning rate
    betas: coefficients of the running averages of gradient / squared gradient
    eps: term added to the denominator
    weight_decay: decoupled weight decay factor (applied as p -= wd * lr * p)
    degenerated_to_sgd: use the SGD fallback during the first steps

    Raises
    ------
    ValueError for a negative lr / eps or betas outside [0, 1)
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        self.degenerated_to_sgd = degenerated_to_sgd

        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)

        self.buffer = self._new_buffer()

        super(RAdam, self).__init__(params, defaults)

    @staticmethod
    def _new_buffer():
        # Cache of the rectification terms, one slot per (step % 10).
        # Slot layout: [step, N_sma, step_size, (beta1, beta2)]
        return [[None, None, None, None] for ind in range(10)]

    def __setstate__(self, state):
        super(RAdam, self).__setstate__(state)
        # The buffer is only a cache; rebuilding it keeps restored optimizers
        # (possibly carrying an older slot layout) consistent.
        self.buffer = self._new_buffer()

    def step(self, closure=None):
        """
        Perform a single optimization step.

        closure: optional callable that re-evaluates the model and returns the
        loss; its result is returned by step().
        """
        loss = None

        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:

                if p.grad is None:
                    continue

                grad = p.grad.data.float()

                if grad.is_sparse:
                    raise RuntimeError('RAdam does not support sparse gradients')

                # all maths are done on a float32 view / copy of the weights
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                beta1, beta2 = group['betas']

                # running average of the squared gradient
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # running average of the gradient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                state['step'] += 1

                buffered = self.buffer[int(state['step'] % 10)]

                # N_sma and step_size depend on the step AND on the betas, so a
                # cached entry may only be reused when both match; otherwise a
                # param group with different betas would pick up the values of
                # another group.
                if state['step'] == buffered[0] and buffered[3] == (beta1, beta2):
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    buffered[3] = (beta1, beta2)
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    if N_sma >= 5:
                        # variance rectification term times bias correction
                        step_size = math.sqrt(
                            (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (
                                        N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    elif self.degenerated_to_sgd:
                        step_size = 1.0 / (1 - beta1 ** state['step'])
                    else:
                        step_size = -1 # marks "skip the update"
                    buffered[2] = step_size

                if N_sma >= 5:
                    # rectified Adam update
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
                    p.data.copy_(p_data_fp32)
                elif step_size > 0:
                    # SGD with momentum fallback
                    if group['weight_decay'] != 0:
                        p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
                    p.data.copy_(p_data_fp32)

        return loss

class AdamW(Optimizer):
    """
    Adam with decoupled weight decay and an optional linear learning rate
    warm-up over the first `warmup` steps of every parameter.

    Raises ValueError for a negative lr / eps or betas outside [0, 1).
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup=0):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        super(AdamW, self).__init__(
            params,
            dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, warmup=warmup),
        )

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)

    def step(self, closure=None):
        """
        Perform a single optimization step; returns the closure's result
        (None when no closure is given).
        """
        loss = closure() if closure is not None else None

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                # the update is computed on a float32 view / copy of the weights
                weights_fp32 = p.data.float()
                state = self.state[p]

                if not state:
                    # first visit of this parameter
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(weights_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(weights_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(weights_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(weights_fp32)

                first_moment = state['exp_avg']
                second_moment = state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                step_count = state['step']

                # running averages of squared gradient and gradient (in place)
                second_moment.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                first_moment.mul_(beta1).add_(grad, alpha=1 - beta1)

                denom = second_moment.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** step_count
                bias_correction2 = 1 - beta2 ** step_count

                # linear warm-up until `warmup` steps have been taken
                warming_up = group['warmup'] > step_count
                scheduled_lr = (1e-8 + step_count * group['lr'] / group['warmup']) if warming_up else group['lr']

                step_size = scheduled_lr * math.sqrt(bias_correction2) / bias_correction1

                decay = group['weight_decay']
                if decay != 0:
                    # decoupled weight decay, applied before the Adam update
                    weights_fp32.add_(weights_fp32, alpha=-decay * scheduled_lr)

                weights_fp32.addcdiv_(first_moment, denom, value=-step_size)

                p.data.copy_(weights_fp32)

        return loss

###### Evaluation Metrics

In [13]:
class IntentMetrics(object):
    """
    Intent classification metrics computed once at construction time:
    accuracy, macro averaged precision / recall and the textual
    classification report, each stored as an attribute.
    """
    def __init__(self, intent_pred, intent_true):
        # the metric functions are called as (true labels, predictions)
        labels = (intent_true, intent_pred)
        macro = {"average": "macro"}
        self.accuracy = sk_metrics.accuracy_score(*labels)
        self.precision = sk_metrics.precision_score(*labels, **macro)
        self.recall = sk_metrics.recall_score(*labels, **macro)
        self.classification_report = sk_metrics.classification_report(*labels)

class SlotMetrics(object):
    """Chunk-level slot-filling scorer (F1 / precision / recall) over tag sequences.

    Each label is a string of the form ``PREFIX-TYPE`` (for example ``B-city``)
    or a bare prefix such as ``O``. A chunk is counted as correct only when the
    reference and the prediction open it on the same token with the same type
    and close it on the same token.
    """

    # (previous prefix, current prefix) pairs that always open a new chunk.
    _START_PAIRS = (
        ('B', 'B'), ('I', 'B'), ('O', 'B'), ('O', 'I'),
        ('E', 'E'), ('E', 'I'), ('O', 'E'),
    )
    # (previous prefix, current prefix) pairs that always close the running chunk.
    _END_PAIRS = (
        ('B', 'B'), ('B', 'O'), ('I', 'B'), ('I', 'O'),
        ('E', 'E'), ('E', 'I'), ('E', 'O'),
    )

    def __init__(self, correct_slots, pred_slots):
        """Store the reference and predicted tag sequences (one list per sentence)."""
        self.correct_slots = correct_slots
        self.pred_slots = pred_slots

    def get_slot_metrics(self):
        """Return ``(f1, precision, recall)`` over all stored sentences.

        Sentences and tokens are paired with ``zip``, so any surplus on either
        side is ignored. A metric whose denominator is zero is reported as 0.
        """
        matched_chunks = 0.0
        gold_chunks = 0.0
        predicted_chunks = 0.0

        for gold_seq, pred_seq in zip(self.correct_slots, self.pred_slots):
            # True while both sides are inside a chunk that has agreed so far.
            in_matching_chunk = False
            prev_gold_tag, prev_gold_type = 'O', ''
            prev_pred_tag, prev_pred_type = 'O', ''

            for gold_label, pred_label in zip(gold_seq, pred_seq):
                gold_tag, gold_type = SlotMetrics.splitTagType(gold_label)
                pred_tag, pred_type = SlotMetrics.splitTagType(pred_label)

                if in_matching_chunk:
                    gold_ended = SlotMetrics.endOfChunk(prev_gold_tag, gold_tag, prev_gold_type, gold_type)
                    pred_ended = SlotMetrics.endOfChunk(prev_pred_tag, pred_tag, prev_pred_type, pred_type)
                    if gold_ended and pred_ended and prev_gold_type == prev_pred_type:
                        # Both chunks closed together with the same type: a hit.
                        in_matching_chunk = False
                        matched_chunks += 1.0
                    elif gold_ended != pred_ended or gold_type != pred_type:
                        # Boundaries or types diverged: this chunk can no longer match.
                        in_matching_chunk = False

                gold_started = SlotMetrics.startOfChunk(prev_gold_tag, gold_tag, prev_gold_type, gold_type)
                pred_started = SlotMetrics.startOfChunk(prev_pred_tag, pred_tag, prev_pred_type, pred_type)

                if gold_started and pred_started and gold_type == pred_type:
                    in_matching_chunk = True
                if gold_started:
                    gold_chunks += 1
                if pred_started:
                    predicted_chunks += 1.0

                prev_gold_tag, prev_gold_type = gold_tag, gold_type
                prev_pred_tag, prev_pred_type = pred_tag, pred_type

            # A chunk still open at the end of the sentence is closed implicitly.
            if in_matching_chunk:
                matched_chunks += 1.0

        precision = matched_chunks / predicted_chunks if predicted_chunks > 0 else 0
        recall = matched_chunks / gold_chunks if gold_chunks > 0 else 0
        score_sum = precision + recall
        f1 = (2.0 * precision * recall) / score_sum if score_sum > 0 else 0

        return f1, precision, recall

    @staticmethod
    def startOfChunk(prevTag, tag, prevTagType, tagType, chunkStart=False):
        """Return True when ``tag`` opens a new chunk given the previous token.

        ``chunkStart`` is the value returned when no rule fires.
        """
        if (prevTag, tag) in SlotMetrics._START_PAIRS:
            chunkStart = True
        # A change of type on a non-outside token also opens a chunk.
        if tag not in ('O', '.') and prevTagType != tagType:
            chunkStart = True
        return chunkStart

    @staticmethod
    def endOfChunk(prevTag, tag, prevTagType, tagType, chunkEnd=False):
        """Return True when the chunk running through the previous token ends before ``tag``.

        ``chunkEnd`` is the value returned when no rule fires.
        """
        if (prevTag, tag) in SlotMetrics._END_PAIRS:
            chunkEnd = True
        # A change of type after a non-outside token also closes the chunk.
        if prevTag not in ('O', '.') and prevTagType != tagType:
            chunkEnd = True
        return chunkEnd

    @staticmethod
    def splitTagType(tag):
        """Split ``'B-xxx'`` into ``('B', 'xxx')``; a bare prefix yields an empty type.

        Raises:
            ValueError: if ``tag`` contains more than one ``'-'``.
        """
        pieces = tag.split('-')
        if not 1 <= len(pieces) <= 2:
            raise ValueError('tag format wrong. it must be B-xxx.xxx')
        prefix = pieces[0]
        suffix = pieces[1] if len(pieces) == 2 else ""
        return prefix, suffix


def semantic_acc(pred_slot, real_slot, pred_intent, real_intent):
    """
    Compute the sentence-level accuracy: a sentence counts as correct only
    when its whole predicted slot sequence AND its predicted intent both
    equal the reference.

    The four inputs are walked in parallel with ``zip``, so the shortest one
    determines how many sentences are scored.

    Returns:
        The fraction of fully correct sentences as a float, or 0.0 when there
        are no sentences to score (instead of raising ZeroDivisionError).
    """
    total_count, correct_count = 0.0, 0.0
    for p_slot, r_slot, p_intent, r_intent in zip(pred_slot, real_slot, pred_intent, real_intent):

        if p_slot == r_slot and p_intent == r_intent:
            correct_count += 1.0
        total_count += 1.0

    # An empty evaluation set has no correct sentences; report 0.0 like the
    # other metrics in this file do for empty denominators.
    if total_count == 0:
        return 0.0

    return correct_count / total_count


###### Agent

In [14]:
class Agent:
  """Runs training, per-epoch evaluation and final testing of the joint intent/slot model.

  Args:
    model: model exposing ``forward_bert`` / ``forward_glove``, ``loss1`` and
      ``pred_intent_slot``.
    train_loader: batches used for optimisation in ``fit``.
    eval_loader: batches scored after every training epoch.
    test_loader: batches scored by ``test``.
    optimizer: optimizer stepped once per training batch.
    idx2intent: mapping from intent index to intent label (stored, not used here).
    idx2slot: mapping from slot index to slot tag string.
    args: run configuration; this class reads ``device``, ``cuda``, ``use_bert``,
      ``bert_distil``, ``log_dir``, ``save_dir``, ``batch_size`` and
      ``lr_scheduler_gama``.
    scheduler_type: ``'multisteplr'`` enables a MultiStepLR schedule; any other
      value leaves the learning rate unscheduled.
  """

  def __init__(
      self,
      model: torch.nn.Module,
      train_loader: DataLoader,
      eval_loader: DataLoader,
      test_loader: DataLoader,
      optimizer: torch.optim.Optimizer,
      idx2intent,
      idx2slot,
      args,
      scheduler_type = 'multisteplr'
  ) -> None:
    self.model = model
    self.train_loader = train_loader
    self.eval_loader = eval_loader
    # Must be kept: test() iterates over it.
    self.test_loader = test_loader
    self.optimizer = optimizer
    self.idx2intent = idx2intent
    self.idx2slot = idx2slot
    self.args = args
    self.device = self.args.device
    # Created here and closed at the end of fit(); nothing in this class
    # writes scalars to it.
    self.tb_writer = SummaryWriter(self.args.log_dir)
    self.bert_distil = self.args.bert_distil

    # Route model(...) calls to the forward pass of the active embedding backend.
    if args.use_bert:
      self.model.forward = self.model.forward_bert
    else:
      self.model.forward = self.model.forward_glove

    if self.args.cuda:
      self.model = self.model.to(self.device)

    self._init_scheduler(scheduler_type)

  def _init_scheduler(self, type):
    """Create ``self.scheduler``; it stays ``None`` for unrecognised types."""
    self.scheduler = None
    if type == 'multisteplr':
      # LR is multiplied by gamma at epochs 40 and 70.
      self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, [40, 70], gamma=self.args.lr_scheduler_gama, last_epoch=-1)

  def _forward(self, batch):
    """Unpack ``batch`` for the active backend and run the model.

    Returns:
      ``(logits_intent, logits_slot, intent_labels, slot_labels, masks)``.
    """
    if self.args.use_bert:
      input_ids, attention_masks, slot_labels, intent_labels, masks, _subtoken_mask_lists = batch
      logits_intent, logits_slot = self.model(input_ids, attention_masks, masks)
    else:
      inputs, char_lists, slot_labels, intent_labels, masks = batch
      logits_intent, logits_slot = self.model((inputs, char_lists), masks)
    return logits_intent, logits_slot, intent_labels, slot_labels, masks

  def _decode_slots(self, pred_slot, slot_labels):
    """Turn one batch of slot indices into tag strings.

    Positions whose reference label is -100 are dropped, then the first and
    last remaining positions of every sentence are removed.
    NOTE(review): -100 presumably marks padding/sub-tokens and the trimmed
    ends presumably are [CLS]/[SEP] - confirm against the data loader.

    Returns:
      ``(pred_tags, actual_tags)``, two lists with one tag list per sentence.
    """
    slot_labels = slot_labels.cpu().numpy().tolist()
    pred_tags, actual_tags = [], []
    for i in range(len(pred_slot)):
      pred, actual = [], []
      for j in range(len(pred_slot[i])):
        if slot_labels[i][j] != -100:
          pred.append(self.idx2slot[pred_slot[i][j].item()])
          actual.append(self.idx2slot[slot_labels[i][j]])
      pred_tags.append(pred[1:-1])
      actual_tags.append(actual[1:-1])
    return pred_tags, actual_tags

  def _run_batch_train(self, batch, epoch):
    """Run one optimisation step on ``batch`` and return the combined loss tensor."""
    self.model.train()
    self.model.zero_grad()
    logits_intent, logits_slot, intent_labels, slot_labels, masks = self._forward(batch)
    loss_intent, loss_slot = self.model.loss1(logits_intent, logits_slot, intent_labels, slot_labels, masks)
    # Equal weighting for the first 40 epochs, then the intent loss dominates.
    if epoch < 40:
      loss = loss_slot + loss_intent
    else:
      loss = 0.8 * loss_intent + 0.2 * loss_slot
    loss.backward()
    self.optimizer.step()
    return loss

  def _run_batch_eval(self, batch):
    """Score ``batch`` without gradients; returns losses, predictions and labels."""
    self.model.eval()
    # No backward pass follows, so skip building the autograd graph.
    with torch.no_grad():
      logits_intent, logits_slot, intent_labels, slot_labels, masks = self._forward(batch)
      loss_intent, loss_slot = self.model.loss1(logits_intent, logits_slot, intent_labels, slot_labels, masks)
      pred_intent, pred_slot = self.model.pred_intent_slot(logits_intent, logits_slot, masks)

    return loss_intent, loss_slot, pred_intent, pred_slot, intent_labels, slot_labels

  def _run_batch_test(self, batch):
    """Predict on ``batch`` without gradients; returns predictions and labels."""
    self.model.eval()
    with torch.no_grad():
      logits_intent, logits_slot, intent_labels, slot_labels, masks = self._forward(batch)
      pred_intent, pred_slot = self.model.pred_intent_slot(logits_intent, logits_slot, masks)
    return pred_intent, pred_slot, intent_labels, slot_labels

  def _run_epoch(self, epoch):
    """Train for one epoch, then evaluate on ``eval_loader``.

    Returns:
      ``(intent_acc, slot_f1, overall_acc)`` measured on the evaluation set.
    """
    if self.scheduler is not None:
      print(self.scheduler.get_last_lr())
    # ---Training---
    b_sz_train = self.train_loader.batch_size
    steps_per_epoch = len(self.train_loader)
    print(f"[Device{self.device}] Epoch: {epoch} | Batchsize: {b_sz_train} | Steps: {steps_per_epoch} | Loader: Train Loader")
    # Log every 100 steps on long epochs, every 50 steps on short ones.
    log_every = 100 if steps_per_epoch >= 100 else 50
    for step, batch in enumerate(tqdm(self.train_loader, desc="Training"), start=1):
      batch = tuple(tensor.to(self.device) for tensor in batch)
      loss = self._run_batch_train(batch, epoch)
      if step % log_every == 0:
        logger.info('epoch: {}|    step: {} |    loss: {}'.format(epoch, step, loss.item()))

    # ---Eval---
    eval_loss_intent = 0
    eval_loss_slot = 0
    pred_intents = []
    actual_intents = []
    pred_slots = []
    actual_slots = []
    for batch in tqdm(self.eval_loader, desc="Evaluating"):
      batch = tuple(tensor.to(self.device) for tensor in batch)
      loss_intent, loss_slot, pred_intent, pred_slot, intent_labels, slot_labels = self._run_batch_eval(batch)

      pred_intents.extend(pred_intent.cpu().numpy().tolist())
      actual_intents.extend(intent_labels.cpu().numpy().tolist())

      eval_loss_intent += loss_intent.item()
      eval_loss_slot += loss_slot.item()

      batch_pred, batch_actual = self._decode_slots(pred_slot, slot_labels)
      pred_slots.extend(batch_pred)
      actual_slots.extend(batch_actual)

    # Full tag dumps are only useful when debugging; keep them out of normal output.
    logger.debug(actual_slots)
    logger.debug(pred_slots)

    # slot filling f1, precision, recall
    slot_metrics = SlotMetrics(actual_slots, pred_slots)
    slot_f1, slot_p, slot_r = slot_metrics.get_slot_metrics()

    # intent accuracy
    Metrics_intent = IntentMetrics(pred_intents, actual_intents)
    intent_acc = Metrics_intent.accuracy

    # NOTE(review): this rescaling assumes loss1 returns per-batch means and
    # that every batch is full; a short last batch is over-weighted - confirm.
    data_nums = len(self.eval_loader.dataset)
    avg_loss_intent = eval_loss_intent * self.args.batch_size / data_nums
    avg_loss_slot = eval_loss_slot * self.args.batch_size / data_nums

    # overall accuracy
    overall_acc = semantic_acc(pred_slots, actual_slots, pred_intents, actual_intents)

    logger.info('\nEvaluation - intent_loss: {:.6f} slot_loss: {:.6f} acc: {:.4f}% '
        'slot f1: {:.4f} overall acc: {:.4f} \n'.format(avg_loss_intent, avg_loss_slot, intent_acc, slot_f1, overall_acc))
    return intent_acc, slot_f1, overall_acc

  def fit(self, max_epochs: int):
    """Train for ``max_epochs`` epochs, log the best scores and save ``args``.

    Each ``best_*`` record is ``[overall_acc, slot_f1, intent_acc, epoch]`` of
    the epoch that maximised the respective metric. Only the scores are
    tracked; no model checkpoint is written.
    """
    best_slot_f1 = [0.0, 0.0, 0.0]
    best_intent_acc = [0.0, 0.0, 0.0]
    best_overall_acc = [0.0, 0.0, 0.0]
    for epoch in range(max_epochs):
      intent_acc, slot_f1, overall_acc = self._run_epoch(epoch)
      record = [overall_acc, slot_f1, intent_acc, epoch]
      if slot_f1 > best_slot_f1[1]:
        best_slot_f1 = list(record)
      if intent_acc > best_intent_acc[2]:
        best_intent_acc = list(record)
      if overall_acc > best_overall_acc[0]:
        best_overall_acc = list(record)
      # step scheduler
      if self.scheduler is not None:
        self.scheduler.step()

    # Print best model result
    logger.info("best_slot_f1: {}".format(best_slot_f1))
    logger.info("best_intent_acc: {}".format(best_intent_acc))
    logger.info("best_overall_acc: {}".format(best_overall_acc))

    # Save training arguments; create the directory first so a finished run
    # cannot fail here on a missing folder.
    os.makedirs(self.args.save_dir, exist_ok=True)
    args_path = os.path.join(self.args.save_dir, 'training_args.bin')
    torch.save(self.args, args_path)
    # Close tensorboard writer
    self.tb_writer.close()

  def test(self):
    """Score the model on ``test_loader`` and log the intent report and summary metrics."""
    pred_intents = []
    actual_intents = []
    pred_slots = []
    actual_slots = []
    for batch in tqdm(self.test_loader, desc="Testing"):
      batch = tuple(tensor.to(self.device) for tensor in batch)
      pred_intent, pred_slot, intent_labels, slot_labels = self._run_batch_test(batch)

      pred_intents.extend(pred_intent.cpu().numpy().tolist())
      actual_intents.extend(intent_labels.cpu().numpy().tolist())

      batch_pred, batch_actual = self._decode_slots(pred_slot, slot_labels)
      pred_slots.extend(batch_pred)
      actual_slots.extend(batch_actual)

    slot_metrics = SlotMetrics(actual_slots, pred_slots)
    slot_f1, _, _ = slot_metrics.get_slot_metrics()

    Metrics_intent = IntentMetrics(pred_intents, actual_intents)
    logger.info(Metrics_intent.classification_report)

    intent_acc = Metrics_intent.accuracy
    overall_acc = semantic_acc(pred_slots, actual_slots, pred_intents, actual_intents)

    logger.info('\nTest -  acc: {:.4f}% ' 'slot f1: {:.4f} overall_acc: {:.4f}  \n'.format(intent_acc, slot_f1, overall_acc))

#### Runtime

In [15]:
def set_seed(args):
  """Seed the Python, NumPy and PyTorch random generators with ``args.seed``.

  When ``args.cuda`` is truthy the CUDA generators are seeded as well.
  """
  seed = args.seed
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)

  if not args.cuda:
    return
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

def load_args():
  """Build the run configuration and return it as an ``argparse.Namespace``.

  The parser is always fed an empty argument list, so every option takes its
  declared default; the command line is not read. On top of the parsed
  options the function derives:
    * ``cuda`` and the final ``device`` (the GPU index when CUDA is used),
    * ``workdir`` (a Google Drive folder on Colab, empty string otherwise),
    * ``hidden_dim`` / ``emb_dim`` / ``emb_dropout_p`` for BERT or GloVe,
    * ``save_dir`` and ``log_dir`` resolved under ``workdir``.
  """
  default_device = "cuda" if torch.cuda.is_available() else "cpu"

  parser = argparse.ArgumentParser(description='CoinBert Transformer Model')
  add = parser.add_argument

  # device
  add('--gpu', type=int, default=0, help='choose specific gpu to use')
  add("--device", type=str, default=default_device, help="Device (cuda or cpu)")
  add("--save_dir", type=str, default="Friendlink", help="save model location")
  add("--load_dir", type=str, default=None, help="load path")
  add("--use_data", type=str, default="Friendlink", help="the persona dataset you want to use")
  add("--use_gpu", action='store_true', default=True, help="use gpu if available")
  add("--log_dir", type=str, default="logs", help="logging directory")

  # multigpu setting
  add("--use_distributed", action='store_true', default=False, help="distributed training if available")
  add("--distributed", type=int, default=False, help="distributed training")
  add("--snapshot_dir", type=str, default="snapshot", help="snapshot saving location")
  add("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")

  # hyperparam
  add("--lr", type=float, default=0.001, help="learning rate")
  add("--max_len", type=int, default=64, help="max sequence input (default: 64, 32 for atis benchmarking)")
  add("--num_attention_heads", type=int, default=8, help="number of attention heads")
  add("--bert_dropout_p", type=float, default=0.3, help="dropout for bert layer")
  add("--lstm_dropout_p", type=float, default=0.5, help="dropout for lstm")
  add("--attention_dropout_p", type=float, default=0.1, help="dropout for attention")
  add("--lr_scheduler_gama", type=float, default=0.3, help="gama for scheduler")
  add("--eps", type=float, default=1e-12, help="epiloson")

  # training setting
  add('--seed', type=int, default=9, metavar='S', help='random seed')
  add("--batch_size", type=int, default=32, help="batch size for dialogue training")
  add("--epochs", type=int, default=50, help="number of training epochs")
  add("--use_bert", default=True, help="whether to use bert or glove")
  add('--eval', action='store_true', default=False, help='eval model')
  add("--bert_distil", default=True, help="whether to use distil or base bert")
  add('--is_train', default=True, help='whether to do training or testing')

  # Parse an empty list: inside a notebook sys.argv belongs to the kernel.
  args = parser.parse_args(args=[])

  # CUDA is used only when it was selected as the device and not vetoed by
  # use_gpu; in that case `device` becomes the GPU index, otherwise "cpu".
  wants_cuda = args.device == "cuda"
  args.cuda = wants_cuda and args.use_gpu != False
  if wants_cuda:
    args.device = args.gpu if args.cuda else "cpu"

  # check if running on colab
  on_colab = os.getenv("COLAB_RELEASE_TAG")
  if not on_colab:
    args.workdir = "" # local env
  else:
    from google.colab import drive
    drive.mount('/content/drive')
    args.workdir = "/content/drive/MyDrive/CBT_For_IDSF/"

  # hyperparameter for glove and bert (emb_dim: 1024 for bertlarge & elmo)
  bert_dims, glove_dims = (256, 768, 0.5), (128, 300, 0.8)
  args.hidden_dim, args.emb_dim, args.emb_dropout_p = bert_dims if args.use_bert else glove_dims

  args.save_dir = os.path.join(args.workdir, f"domain/{args.save_dir}/")
  args.log_dir = os.path.join(args.workdir, f"{args.log_dir}/")

  return argparse.Namespace(**vars(args))

def main():
  """Entry point: build config, labels, model, datasets and loaders, then train or test.

  Steps, in order:
    1. Load the arguments, seed the RNGs and select the CUDA device if enabled.
    2. Resolve the data folder for the active embedding backend (BERT or GloVe).
    3. Load the label vocabularies.
    4. Either load a saved model (``args.load_dir``) or build a fresh ``CBTModel``.
    5. Build the train / dev / test datasets and their data loaders.
    6. Run ``Agent.fit`` when ``args.is_train`` is truthy, otherwise ``Agent.test``.
  """
  args = load_args()
  set_seed(args)
  if args.cuda:
    torch.cuda.set_device(args.device)

  backend = "bert" if args.use_bert else "glove"
  data_path = os.path.join(args.workdir, f"data/{args.use_data}/{args.use_data.lower()}_{backend}/")
  if args.use_bert:
    # Placeholders: GloVe embeddings and vocabulary are not needed with BERT.
    embedding_word, vocab = None, None
  else:
    embedding_word, vocab = load_glove_embedding(data_path, args)

  # prepare labels
  idx2intent, intent2idx, idx2slot, slot2idx, n_slot_tag, n_intent_class = prepare_labels(data_path)

  # load model
  if args.load_dir is not None:
    # SECURITY: torch.load unpickles the file and can execute arbitrary code;
    # only point load_dir at checkpoints you produced or trust.
    model_dir = os.path.join(args.workdir, f"domain/{args.load_dir}/model")
    if args.bert_distil:
      tokenizer = DistilBertTokenizer.from_pretrained(os.path.join(args.workdir, f"domain/{args.load_dir}/model/distilbert-base-uncased"))
      model = torch.load(os.path.join(args.workdir, f"domain/{args.load_dir}/model/coinbert-transformer-model.bin"))
    else:
      tokenizer = BertTokenizer.from_pretrained(os.path.join(args.workdir, f"domain/{args.load_dir}/model/bert-base-uncased"))
      model = torch.load(os.path.join(args.workdir, f"domain/{args.load_dir}/model/coinbert-base-transformer-model.bin"))
  else:
    if args.bert_distil:
      tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
      pre_trained_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
      pretrainedType = "distil"
    else:
      tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
      pre_trained_model = BertModel.from_pretrained('bert-base-uncased')
      pretrainedType = "bert"

    model = CBTModel(
        args.hidden_dim,
        args.batch_size,
        args.max_len,
        n_intent_class,
        n_slot_tag,
        args.emb_dropout_p,
        args.bert_dropout_p,
        args.lstm_dropout_p,
        args.attention_dropout_p,
        args.emb_dim,
        pretrainedModel=pre_trained_model,
        pretrainedType=pretrainedType,
        gloveEmbedding=embedding_word,
        use_gpu=args.cuda
        )

  def build_dataset(split_name, is_train):
    """Load one split ('train' / 'dev' / 'test') and wrap it in a dataset.

    The split location is a folder when using BERT and a text file when
    using GloVe. ``is_train`` is only forwarded on the GloVe path.
    """
    split_dir = os.path.join(data_path, split_name)
    logger.info(f"***Building {split_name} dataset***")
    if args.use_bert:
      input_ids, attention_mask, slot_lists, intent_lists, mask_lists, subtoken_mask_lists = load_data_ctx_dp(
          split_dir, max_length=args.max_len, intent2idx=intent2idx, slot2idx=slot2idx, tokenizer=tokenizer)
      return create_dataset(
          input_ids=input_ids,
          attention_masks=attention_mask,
          slot_lists=slot_lists,
          intent_lists=intent_lists,
          mask_lists=mask_lists,
          subtoken_mask_lists=subtoken_mask_lists
          )
    token_lists, char_lists, slot_lists, intent_lists, mask_lists = load_data_ctx_indp(
        split_dir, max_length=args.max_len, intent2idx=intent2idx, slot2idx=slot2idx, vocab=vocab, is_train=is_train)
    return create_dataset(
        token_lists=token_lists,
        char_lists=char_lists,
        slot_lists=slot_lists,
        intent_lists=intent_lists,
        mask_lists=mask_lists
        )

  # prepare data
  trainset = build_dataset('train', is_train=True)
  devset = build_dataset('dev', is_train=False)
  testset = build_dataset('test', is_train=False)

  # prepare dataloader (only the training split is shuffled, and only when
  # not running distributed)
  train_loader =  DataLoader(trainset, sampler=None, shuffle=(not args.distributed), batch_size=args.batch_size)
  dev_loader =  DataLoader(devset, sampler=None, shuffle=False, batch_size=args.batch_size)
  test_loader =  DataLoader(testset, sampler=None, shuffle=False, batch_size=args.batch_size)

  # define optimizer
  optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.000001)

  # initialize agent
  logger.info("***Initiate Training Agent***")
  agent = Agent(model, train_loader, dev_loader, test_loader, optimizer, idx2intent, idx2slot, args)

  # run
  if args.is_train:
    agent.fit(max_epochs=args.epochs)
  else:
    agent.test()

In [16]:
# Notebook entry point: configure logging, then run the whole pipeline.
if __name__ == "__main__":
  # Configure coloredlogs at INFO level before anything is logged.
  coloredlogs.install(level='INFO')
  # `logger` must be a module-level name: main() and Agent log through this
  # global, so it has to exist before main() is called.
  logger = logging.getLogger(__name__)
  main()

Output hidden; open in https://colab.research.google.com to view.

## Prod Env