In [1]:
import fasttext
import fasttext.util
import torch
import json
import fasttext
import torch.nn.functional as F

from torch import nn
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

In [3]:
id_fasttext = fasttext.load_model('./cc.id.300.bin')



In [4]:
embedding_size = 2
hidden_size = 2
expert_output_size = 1
ner_tags = [i for i in range(39)]
n_heads = 2

In [5]:
fasttext.util.reduce_model(id_fasttext,embedding_size)

<fasttext.FastText._FastText at 0x7fd610209fd0>

In [196]:
data = ['Gempa', 'bumi', 'berkekuatan', '5']
t_labels = [38,38,38,15]

data_test = ['Gempa', 'bumi', 'berkekuatan', '75']

data_vector = [id_fasttext[w] for w in data]
data_test = torch.stack([torch.stack([torch.Tensor(id_fasttext[w]) for w in data_test])])

data_vector

[array([0.04448834, 0.00509501], dtype=float32),
 array([0.3491381 , 0.03814312], dtype=float32),
 array([0.06721243, 0.03095353], dtype=float32),
 array([-0.6374125,  1.3449613], dtype=float32)]

In [197]:
data = [data_vector]

length = torch.Tensor([1])

length, sort_idx = torch.sort(length, descending=True)
data = [data[i] for i in sort_idx]

data = [torch.Tensor(seq).float().to(device).clone().detach() for seq in data]

def padding(data):
    padded_data = pad_sequence([torch.tensor(seq) for seq in data], batch_first=True, padding_value=torch.nan)
    return torch.tensor(padded_data)

data = padding(data)

  padded_data = pad_sequence([torch.tensor(seq) for seq in data], batch_first=True, padding_value=torch.nan)
  return torch.tensor(padded_data)


In [120]:
class BiLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size):
        super().__init__()

        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        # Forward forget gate
        self.W_ffx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_ffx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_ffh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ffh = nn.Parameter(torch.Tensor(hidden_size))
        
        # Forward cell gate
        self.W_cfx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_cfx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_cfh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_cfh = nn.Parameter(torch.Tensor(hidden_size))
        
        # Forward input gate
        self.W_ifx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_ifx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_ifh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ifh = nn.Parameter(torch.Tensor(hidden_size))
        
        # Forward output gate
        self.W_ofx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_ofx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_ofh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ofh = nn.Parameter(torch.Tensor(hidden_size))

        # Backward forget gate
        self.W_fbx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_fbx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_fbh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_fbh = nn.Parameter(torch.Tensor(hidden_size))
        
        # backward cell gate
        self.W_cbx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_cbx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_cbh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_cbh = nn.Parameter(torch.Tensor(hidden_size))
        
        # Backward input gate
        self.W_ibx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_ibx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_ibh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_ibh = nn.Parameter(torch.Tensor(hidden_size))
        
        # Backward output gate
        self.W_obx = nn.Parameter(torch.Tensor(hidden_size, embedding_size))
        self.b_obx = nn.Parameter(torch.Tensor(hidden_size))
        
        self.W_obh = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_obh = nn.Parameter(torch.Tensor(hidden_size))
        
        # is it better or reset every sentece is better?
        self.reset_states()
        
        # Set init weight to normal
        for name,p in self.named_parameters():
            nn.init.normal_(p)
            
    def reset_states(self):
        self.h_fx = torch.zeros(self.hidden_size, device=device)
        self.c_fx = torch.zeros(self.hidden_size, device=device)
        self.h_bx = torch.zeros(self.hidden_size, device=device)
        self.c_bx = torch.zeros(self.hidden_size, device=device)

    def forward(self, x, verbose):
        ''' BiLSTM forward
        Args:
            x (list of tensor): vectorize sentence that has been padded
        Returns:
        
        '''
        
        self.reset_states()
        
        # Lists to store outputs
        outputs_fx, outputs_bx = [], []

        # Forward pass
        for i, x_i in enumerate(x):
            o_seq = []
            p_f = []
            p2_f = []
            p_ft = []
            p_it = []
            p_ot = []
            p_gt = []
            
            for t in range(x_i.size(0)):
                x_t = x[i][t]
                
                if all(x_t.isnan()):
                    o_seq.append(torch.zeros(self.hidden_size).to(device))
                    continue

                # Forward LSTM
                f_t = torch.sigmoid(x_t @ self.W_ffx.t() + self.b_ffx + self.h_fx @ self.W_ffh.t() + self.b_ffh)
                i_t = torch.sigmoid(x_t @ self.W_ifx.t() + self.b_ifx + self.h_fx @ self.W_ifh.t() + self.b_ifh)
                o_t = torch.sigmoid(x_t @ self.W_ofx.t() + self.b_ofx + self.h_fx @ self.W_ofh.t() + self.b_ofh)
                g_t = torch.tanh(x_t @ self.W_cfx.t() + self.b_cfx + self.h_fx @ self.W_cfh.t() + self.b_cfh)

                self.c_fx = f_t * self.c_fx + i_t * g_t
                self.h_fx = o_t * torch.tanh(self.c_fx)
    
                p_f.append(self.c_fx)
                p2_f.append(self.h_fx)
                p_ft.append(f_t)
                p_it.append(i_t)
                p_ot.append(o_t)
                p_gt.append(g_t)
                o_seq.append(self.h_fx)
            
            outputs_fx.append(torch.stack(o_seq))
            
        # Backward pass
        for i, x_i in enumerate(x):
            o_seq = []
            p_b = []
            p2_b = []
            pb_ft = []
            pb_it = []
            pb_ot = []
            pb_gt = []
            
            for t in range(x_i.size(0) - 1, -1, -1):
                x_t = x[i][t]
                
                if all(x_t.isnan()):
                    o_seq.append(torch.zeros(self.hidden_size).to(device))
                    continue

                # Backward LSTM
                f_t = torch.sigmoid(x_t @ self.W_fbx.t() + self.b_fbx + self.h_bx @ self.W_fbh.t() + self.b_fbh)
                i_t = torch.sigmoid(x_t @ self.W_ibx.t() + self.b_ibx + self.h_bx @ self.W_ibh.t() + self.b_ibh)
                o_t = torch.sigmoid(x_t @ self.W_obx.t() + self.b_obx + self.h_bx @ self.W_obh.t() + self.b_obh)
                g_t = torch.tanh(x_t @ self.W_cbx.t() + self.b_cbx + self.h_bx @ self.W_cbh.t() + self.b_cbh)

                self.c_bx = f_t * self.c_bx + i_t * g_t
                self.h_bx = o_t * torch.tanh(self.c_bx)
                
                p2_b.append(self.h_bx)
                p_b.append(self.c_bx)
                pb_ft.append(f_t)
                pb_it.append(i_t)
                pb_ot.append(o_t)
                pb_gt.append(g_t)
                o_seq.append(self.h_bx)
                    
            p_b.reverse()
            p2_b.reverse()
            pb_ft.reverse()
            pb_it.reverse()
            pb_ot.reverse()
            pb_gt.reverse()
            
            o_seq.reverse()
            outputs_bx.append(torch.stack(o_seq))
        
        outputs_fx = torch.stack(outputs_fx)
        outputs_bx = torch.stack(outputs_bx)
        
        # Concatenate hidden states from both directions
        outputs = torch.cat([outputs_fx, outputs_bx],2)
        
        if(verbose):
            print("================ BiLSTM ================")
            print()
            
            for name, p in self.named_parameters():
                print(name)
                print(p)
                print()
            
            print("Forget Gate Forward:")
            print(torch.stack(p_ft))

            print("\nForget Gate Backward:")
            print(torch.stack(pb_ft))

            print("\nIpnut Gate Forward:")
            print(torch.stack(p_it))

            print("\nInput Gate Backward:")
            print(torch.stack(pb_it))

            print("\nOutput Gate Forward:")
            print(torch.stack(p_ot))

            print("\nOutput Gate Backward:")
            print(torch.stack(pb_ot))

            print("\nCell Gate Forward:")
            print(torch.stack(p_gt))

            print("\nCell Gate Backward:")
            print(torch.stack(pb_gt))

            print("\nCell Output Forward:")
            print(torch.stack(p_f))

            print("\nCell Output Backward:")
            print(torch.stack(p_b))

            print("\nHidden State Forward:")
            print(torch.stack(p2_f))

            print("\nHidden State Backward:")
            print(torch.stack(p2_b))
            
            print("\nConcatened hidden State:")
            print(outputs)
        
        return outputs

In [9]:
class AttentionGate(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.Q = nn.Linear(hidden_size*2, input_size)
        self.K = nn.Linear(input_size, input_size)
        self.V = nn.Linear(input_size, input_size)
        
        for name, p in self.named_parameters():
            nn.init.normal_(p)
    
    def forward(self, q, k, v, verbose):
        q = self.Q(q)
        k = self.K(k)
        v = self.V(v)
        
        # matmul q k
        q_k = torch.bmm(q,k.transpose(1,2))/torch.sqrt(torch.Tensor([self.input_size]).to(device)) # :39
        q_k_softmax = F.softmax(q_k, dim=-1)
        
        # matmul o v
        o = torch.bmm(q_k_softmax, v)
        
        if verbose:
            print('=================== Attention ===================\n')
            for name, p in self.named_parameters():
                print(name)
                print(p)
                print()
                
            print('Query')
            print(q)
            print()
            
            print('Key')
            print(k)
            print()
            
            print('Key')
            print(v)
            print()
            
            print('softmax Q K')
            print(q_k_softmax)
            print()
            
            print('O V')
            print(o)
            print()
            
        
        return o

class MultiheadAttentionGate(nn.Module):
    def __init__(self, input_size, hidden_size, n_head, ner_nums):
        """
            input size is output_expert_size
        """
        super().__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_head = n_head
        
        self.attentions = nn.ModuleList(AttentionGate(input_size*ner_nums, hidden_size) for _ in range(n_head))
        
        # output MoEE tidak harus sesuai dengan jumlah tag
        self.classifiers = nn.Linear(input_size*n_head*ner_nums, ner_nums)
        
        for name, p in self.named_parameters():
            nn.init.normal_(p)
        
    def forward(self, x, bi_lstm_o, verbose):
        o_a = []
        
        for i, att in enumerate(self.attentions):
            if i == 0: o = att(bi_lstm_o,x,x,verbose)
            else: o = att(bi_lstm_o,x,x,False)
            
            o_a.append(o)
            
        concatened_o = torch.cat(o_a,2)
        outputs = self.classifiers(concatened_o)
        
        if(verbose):
            print('================ Multihead Attention ================')
            print()
            
            for name, p in self.named_parameters():
                print(name)
                print(p)
                print()
            
            print("cat head")
            print(concatened_o)
            print()
            
            print("outputs")
            print(outputs)
            print()
        
        return outputs

In [10]:
class MoEE(nn.Module):
    def __init__(self, ner_tags, hidden_size, expert_output_size, n_heads):
        super().__init__()
        
        self.experts = nn.ModuleList([EntityExpert(hidden_size, expert_output_size) for _ in range(len(ner_tags))])
        
        # gate dilewatkan ke dropout terlebih dahulu
        
        self.gate = MultiheadAttentionGate(expert_output_size, hidden_size, n_heads, len(ner_tags))
    
    def forward(self, x, verbose):
        e_o = []
        
        for e in self.experts:
            e_output = e(x)
            e_o.append(e_output)
            
        outputs = torch.cat(e_o,2)   
        
        outputs = self.gate(outputs,x,verbose)
        
        return outputs

class EntityExpert(nn.Module):
    def __init__(self, hidden_state_size, output_size):
        super().__init__()
        
        self.entity_expert = nn.Linear(2*hidden_state_size, output_size)
        
        for name, p in self.named_parameters():
            nn.init.normal_(p)
    
    def forward(self, x):
        output = self.entity_expert(x)
        return output

In [200]:
class CRF(nn.Module):
    def __init__(self, num_labels):
        super(CRF, self).__init__()
        self.num_labels = num_labels

        # Transisi dari label i ke label j (transisi[i, j] adalah transisi dari i ke j)
        self.transitions = nn.Parameter(torch.zeros(num_labels, num_labels))

    def forward(self, emissions, tags):
        batch_size, sentence_length, _ = emissions.size()

        # Compute the unary score
        unary_score = emissions.gather(2, tags.unsqueeze(2)).squeeze(2).sum(dim=1)

        # Compute the transition score
        transition_score = torch.zeros(batch_size)
        for i in range(sentence_length - 1):
            transition_score += self.transitions[tags[:, i], tags[:, i + 1]]
            
        # Sum of unary and transition scores
        total_score = unary_score + transition_score
        
        # Compute the partition function (Z)
        alpha = self.compute_alpha(emissions)
        log_partition = alpha[:, -1, :].logsumexp(dim=1).sum()

        # Compute the log likelihood
        loss = log_partition - total_score.sum()

        return loss

    def compute_alpha(self, emissions):
        batch_size, sentence_length, _ = emissions.size()
        alpha = torch.zeros(batch_size, sentence_length, self.num_labels)

        # Initialization
        alpha[:, 0, :] = emissions[:, 0, :]

        # Recursion
        for t in range(1, sentence_length):
            alpha[:, t, :] = emissions[:, t, :] + torch.logsumexp(alpha[:, t - 1, :] + self.transitions, dim=1)

        return alpha

    def viterbi_decode(self, emissions):
        batch_size, sentence_length, _ = emissions.size()

        # Initialize Viterbi variables
        delta = torch.zeros(batch_size, sentence_length, self.num_labels)
        backpointer = torch.zeros(batch_size, sentence_length, self.num_labels, dtype=torch.long)

        # Initialization
        delta[:, 0, :] = emissions[:, 0, :]

        # Recursion
        for t in range(1, sentence_length):
            trans_score = delta[:, t - 1, :].unsqueeze(2) + self.transitions
            max_scores, backpointer[:, t, :] = trans_score.max(dim=1)
            print(trans_score.max(dim=1))
            delta[:, t, :] = emissions[:, t, :] + max_scores

        # Termination
        best_last_tag = delta[:, -1, :].argmax(dim=1)

        # Backtrack to find best path using backpointers
        best_path = torch.zeros(batch_size, sentence_length, dtype=torch.long)
        best_path[:, -1] = best_last_tag

        for t in range(sentence_length - 2, -1, -1):
            best_path[:, t] = backpointer[:, t + 1, best_path[:, t + 1]]

        return best_path

In [201]:
class NERMOEE(nn.Module):
    def __init__(self, embedding_size, hidden_size, ner_tags, expert_output_size, n_heads):
        super().__init__()
        
        torch.manual_seed(34)
        
        self.encoder = BiLSTM(embedding_size, hidden_size)
        self.MoEE = MoEE(ner_tags, hidden_size, expert_output_size, n_heads)
        self.CRF = CRF(len(ner_tags))
    
    def forward(self, x, verbose):
        o = self.encoder(x,verbose)
        o = self.MoEE(o,verbose)
        o = self.CRF(o,torch.Tensor([t_labels]).to(torch.int64))
        
        return o
    
    def predict(self, x):
        o = self.encoder(x,False)
        o = self.MoEE(o,False)
        o = self.CRF.viterbi_decode(o)
        
        return o
    
    def update_model_parameters(self, x:torch.Tensor):
        self.encoder.reset_states()
        x.backward()
    
    def verbose(self):
        self.encoder.get_params()

In [202]:
model = NERMOEE(embedding_size, hidden_size, ner_tags, expert_output_size, n_heads)

In [186]:
outputs = model(data,False)

In [160]:
t_labels

[38, 38, 38, 15]

In [203]:
model.predict(data)

torch.return_types.max(
values=tensor([[94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746,
         94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746,
         94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746,
         94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746,
         94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746, 94.0746]],
       grad_fn=<MaxBackward0>),
indices=tensor([[25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
         25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
         25, 25, 25]]))
torch.return_types.max(
values=tensor([[188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020,
         188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020,
         188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020,
         188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020, 188.3020

tensor([[25, 25, 25, 25]])

In [162]:
# optimizer = torch.optim.SGD(model.parameters(),lr=0.000001)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

In [163]:
model(data,False)

tensor(430.4365, grad_fn=<SubBackward0>)

In [164]:
for i in range(125):
    outputs = model(data,False)
    print(outputs)
    print()
    optimizer.zero_grad()
    model.update_model_parameters(outputs)
    optimizer.step()

tensor(430.4365, grad_fn=<SubBackward0>)

tensor(410.2005, grad_fn=<SubBackward0>)

tensor(389.9785, grad_fn=<SubBackward0>)

tensor(369.7679, grad_fn=<SubBackward0>)

tensor(349.5758, grad_fn=<SubBackward0>)

tensor(329.4908, grad_fn=<SubBackward0>)

tensor(310.5508, grad_fn=<SubBackward0>)

tensor(296.3021, grad_fn=<SubBackward0>)

tensor(284.0937, grad_fn=<SubBackward0>)

tensor(271.1312, grad_fn=<SubBackward0>)

tensor(257.3369, grad_fn=<SubBackward0>)

tensor(242.8905, grad_fn=<SubBackward0>)

tensor(227.9418, grad_fn=<SubBackward0>)

tensor(212.6085, grad_fn=<SubBackward0>)

tensor(197.0051, grad_fn=<SubBackward0>)

tensor(181.7703, grad_fn=<SubBackward0>)

tensor(175.0717, grad_fn=<SubBackward0>)

tensor(175.6732, grad_fn=<SubBackward0>)

tensor(176.1969, grad_fn=<SubBackward0>)

tensor(176.3059, grad_fn=<SubBackward0>)

tensor(176.0400, grad_fn=<SubBackward0>)

tensor(175.4413, grad_fn=<SubBackward0>)

tensor(174.5467, grad_fn=<SubBackward0>)

tensor(173.3881, grad_fn=<SubBackw