<a href="https://colab.research.google.com/github/coll-j/IndonesianDepParse/blob/master/gegem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pack_sequence, PackedSequence

In [23]:
input1 = torch.randn(2,5,128)
input2 = torch.randn(2,5,128)
weight = nn.Parameter(torch.Tensor(1, 128, 128))
s = torch.einsum('bxi,oij,byj->boxy', input1, weight, input2)
print(s.shape)

torch.Size([2, 1, 5, 5])


In [3]:
%%bash
wget https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu
wget https://raw.githubusercontent.com/coll-j/IndonesianDepParse/master/test.txt
wget https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-train.conllu
wget https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-test.conllu
wget https://raw.githubusercontent.com/coll-j/IndonesianDepParse/master/test1.txt

--2020-09-06 04:30:24--  https://raw.githubusercontent.com/UniversalDependencies/UD_Indonesian-GSD/master/id_gsd-ud-dev.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 979367 (956K) [text/plain]
Saving to: ‘id_gsd-ud-dev.conllu’

     0K .......... .......... .......... .......... ..........  5% 19.8M 0s
    50K .......... .......... .......... .......... .......... 10% 19.4M 0s
   100K .......... .......... .......... .......... .......... 15% 24.2M 0s
   150K .......... .......... .......... .......... .......... 20% 25.0M 0s
   200K .......... .......... .......... .......... .......... 26% 16.1M 0s
   250K .......... .......... .......... .......... .......... 31% 33.5M 0s
   300K .......... .......... .......... .......... .......... 36% 33.8M 0s
   3

In [2]:
args = {
    'batch_size': 512, # 512
    'word_emb_dim': 64, # 64
    'pos_emb_dim': 30, # 32
    'lemma_emb_dim': 64, # 64
    'hidden_dim': 32,
    'num_layers': 16,
    'dropout': 0.33,
    'ul_hidden_dim': 64, # 64
    'dep_hidden_dim': 32,
    'learning_rate': 0.001,
    'clip': 0.25,
    'lr_scheduler_step_size': 30, # 30
    'lr_gamma': 0.2
}
FIELD_TO_IDX = {'idx': 0, 'word': 1, 'lemma': 2, 'postag': 3, 'head': 6, 'deprel': 7}
# FIELD_TO_IDX = {'idx': 0, 'word': 1, 'postag': 2, 'head': 3, 'deprel': 4}

ROOT_ID = 1

In [3]:
from sklearn import preprocessing
def get_long_tensor(tokens_list, batch_size, vocabs=None):
    """ Convert (list of )+ tokens to a padded LongTensor. """
    sizes = []
    x = tokens_list
    while isinstance(x[0], list):
        sizes.append(max(len(y) for y in x))
        x = [z for y in x for z in y]
    tokens = torch.LongTensor(batch_size, *sizes).fill_(0)
    for i, s in enumerate(tokens_list):
        tokens[i, :len(s)] = torch.LongTensor(s)
    return tokens

In [4]:
import random
class Loader:
  
  def __init__(self, file_path, batch_size, eval=False):
    self._file_path = file_path
    self.batch_size = batch_size
    self.eval = eval

    self.data = self.load_file()
    self.vocabs = self.get_vocabs(self.data)
    self.data = self.preprocess(self.data, self.vocabs)
    if not self.eval:
      random.shuffle(self.data)
    self.data = self.chunk_batches(self.data)

  def load_file(self):
    sents, sent = [], []
    f = open(self._file_path)
    for line in f:
      line = line.strip()
      if len(line) == 0:
        if len(sent) > 0:
          sents.append(sent)
          sent = []
      else:
        if line.startswith('#'):
          continue
        cols = line.split('\t')
        sent += [cols]
    if len(sent) > 0:
      sents.append(sent)

    return sents

  def get_vocabs(self, data):
    print(len(data))
    words = [cols[FIELD_TO_IDX['word']].lower() for sent in data for cols in sent]
    lemmas = [cols[FIELD_TO_IDX['lemma']].lower() for sent in data for cols in sent]
    postags = [cols[FIELD_TO_IDX['postag']] for sent in data for cols in sent] 
    deprels = [cols[FIELD_TO_IDX['deprel']] for sent in data for cols in sent] 

    words = self.build_vocab(words, start_idx=1)
    lemmas = self.build_vocab(lemmas)
    postags = self.build_vocab(postags)
    deprels = self.build_vocab(deprels)

    return {'words': words, 'lemmas': lemmas, 'postags': postags, 'deprels': deprels}

  def build_vocab(self, cols, start_idx=0):
    w2i = {}
    for w in cols:
      if w not in w2i:
        w2i[w] = len(w2i) + start_idx
    
    return w2i

  def preprocess(self, data, vocabs):
    processed = []
    for sent in data:
      p_sent = [[ROOT_ID] + [vocabs['words'][w[FIELD_TO_IDX['word']].lower()] for w in sent]]
      p_sent += [[ROOT_ID] + [vocabs['lemmas'][w[FIELD_TO_IDX['lemma']].lower()] for w in sent]]
      p_sent += [[ROOT_ID] + [vocabs['postags'][w[FIELD_TO_IDX['postag']]] for w in sent]]
      p_sent += [[int(w[FIELD_TO_IDX['head']]) for w in sent]]
      p_sent += [[vocabs['deprels'][w[FIELD_TO_IDX['deprel']]] for w in sent]]
      processed.append(p_sent)

    return processed

  def reshuffle(self):
    data = [y for x in self.data for y in x]
    self.data = self.chunk_batches(data)
    random.shuffle(self.data)

  def chunk_batches(self, data):
    res, curr = [], []
    currlen = 0
    for sent in data:
      if len(sent[0]) + currlen > self.batch_size:
        if len(curr) > 0:
          res.append(curr)
          curr = []
          currlen = 0
      
      curr.append(sent)
      currlen += len(sent[0])
    if len(curr) > 0:
      res.append(curr)

    return res
    
  def __getitem__(self, key):
    batch = self.data[key]
    batch_size = len(batch)
    batch = list(zip(*batch))
    
    # convert to tensors
    words = batch[0]
    words = get_long_tensor(words, batch_size, vocabs=self.vocabs['words'])
    words_mask = torch.eq(words, 0)

    lemmas = batch[1]
    lemmas = get_long_tensor(lemmas, batch_size, vocabs=self.vocabs['lemmas'])
    
    postags = batch[2]
    postags = get_long_tensor(postags, batch_size, vocabs=self.vocabs['postags'])
    
    heads = batch[3]
    heads = get_long_tensor(heads, batch_size)
    
    deprels = batch[4]
    deprels = get_long_tensor(deprels, batch_size, vocabs=self.vocabs['deprels'])
    sentlens = [len(sent) for sent in batch[0]]

    return words, words_mask, lemmas, postags, heads, deprels, sentlens

  def __iter__(self):
    for i in range(len(self.data)):
      yield self.__getitem__(i)

  def __len__(self):
    return len(self.data)

In [5]:
# data = Loader('test.txt', args['batch_size'])
eval_data = Loader('id_gsd-ud-dev.conllu', args['batch_size'])
train_data = Loader('id_gsd-ud-train.conllu', args['batch_size'], eval=True)
# train_data = Loader('test1.txt', args['batch_size'])

559
4477


# Build Model

In [6]:
class DeepBiaffine(nn.Module):
  def __init__(self, input1_size, input2_size, hidden_size, output_size, dropout=0.0):
    super(DeepBiaffine, self).__init__()
    # Simple MLP
    self.MLP1 = nn.Linear(input1_size, hidden_size)
    self.MLP2 = nn.Linear(input2_size, hidden_size)
    self.relu = F.leaky_relu
    # Biaffine
    # self.biaff = nn.Bilinear(hidden_size, hidden_size, output_size)
    self.weight = nn.Parameter(torch.randn(output_size, hidden_size, hidden_size))
    self.drop = nn.Dropout(dropout)

  def forward(self, input1, input2):
    output1 = self.drop(self.relu(self.MLP1(input1)))
    output2 = self.drop(self.relu(self.MLP2(input2)))

    s = torch.einsum('bxi,oij,byj->boxy', output1, self.weight, output2)
    s = s.squeeze(1)
    return s
    # return self.biaff(output1, output2)

In [7]:
class Model(nn.Module):
  def __init__(self, args, vocab):
    super(Model, self).__init__()

    self.args = args
    self.vocab = vocab

    # input layer
    input_size = 0
    self.word_emb = nn.Embedding(len(vocab['words']) + 1, self.args['word_emb_dim'])
    input_size += self.args['word_emb_dim']
    self.pos_emb = nn.Embedding(len(vocab['postags']) + 1, self.args['pos_emb_dim'])
    input_size += self.args['pos_emb_dim']
    self.lemma_emb = nn.Embedding(len(vocab['lemmas']) + 1, self.args['lemma_emb_dim'])
    input_size += self.args['lemma_emb_dim']

    # recurrent layer
    self.GRU = nn.GRU(input_size, self.args['hidden_dim'], self.args['num_layers'],\
                            batch_first=True, dropout=self.args['dropout'], bidirectional=True)
    self.GRU_hidden = nn.Parameter(torch.randn(self.args['num_layers'] * 2, 1, self.args['hidden_dim']))
    
    # classifier
    self.unlabeled = DeepBiaffine(self.args['hidden_dim'] * 2, self.args['hidden_dim'] * 2, self.args['ul_hidden_dim'], 1, dropout=self.args['dropout'])
    self.deprel = DeepBiaffine(self.args['hidden_dim'] * 2, self.args['hidden_dim'] * 2, self.args['dep_hidden_dim'], len(vocab['deprels']), dropout=self.args['dropout'])

    # criterion
    self.crit = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum')
    
    self.dropout = nn.Dropout(self.args['dropout'])
  def forward(self, words, word_mask, lemmas, postags, heads, deprels, sentlens):
    words = words.cuda()
    word_mask = word_mask.cuda()
    lemmas = lemmas.cuda()
    postags = postags.cuda()
    heads = heads.cuda()
    deprels = deprels.cuda()

    batch_size = words.size(0)
    inputs = []

    # pack embedded inputs
    embedded_word = self.dropout(self.word_emb(words))
    embedded_pos = self.dropout(self.pos_emb(postags))
    embedded_lemma = self.dropout(self.lemma_emb(lemmas))

    # rnn inputs
    rnn_inputs = torch.cat((embedded_word, embedded_pos, embedded_lemma), -1)
    rnn_inputs = pack_padded_sequence(rnn_inputs, sentlens, batch_first=True, enforce_sorted=False)

    rnn_outputs, h = self.GRU(rnn_inputs, self.GRU_hidden.expand(self.args['num_layers'] * 2, batch_size, self.args['hidden_dim']).contiguous())

    rnn_outputs, _ = pad_packed_sequence(rnn_outputs, batch_first=True)

    unlabeled_scores = self.unlabeled(self.dropout(rnn_outputs), self.dropout(rnn_outputs))
    deprel_scores = self.deprel(self.dropout(rnn_outputs), self.dropout(rnn_outputs)).permute(0, 2, 3, 1)
    
    unlabeled_scores = unlabeled_scores[:, 1:, :] # exclude root
    unlabeled_scores = unlabeled_scores.masked_fill(word_mask.unsqueeze(1), -float('inf'))
    
    deprel_scores = deprel_scores[:, 1:] # exclude rooot
    deprel_scores = torch.gather(deprel_scores, 2, heads.unsqueeze(2).unsqueeze(3).expand(-1, -1, -1, len(self.vocab['deprels']))).view(-1, len(self.vocab['deprels']))

    preds = []
    if self.training:
      unlabeled_target = heads.masked_fill(word_mask[:, 1:], -1)
      deprel_target = deprels.masked_fill(word_mask[:, 1:], -1)

      loss = self.crit(unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2)), unlabeled_target.view(-1))
      loss += self.crit(deprel_scores.contiguous(), deprel_target.view(-1))

      loss /= sum(sentlens) #divided longest seq
      
      # calculate accuracy v1.0
      unlabeled_preds = F.log_softmax(unlabeled_scores, 2)
      unlabeled_preds = unlabeled_preds.argmax(dim=2)
      unlabeled_preds = unlabeled_preds.masked_fill(word_mask[:, 1:], -1)
      ul_corr = (unlabeled_preds.view(-1) == unlabeled_target.view(-1))
      acc1 = ul_corr.sum().float() / float( unlabeled_target.view(-1).size(0) )

      deprel_preds = F.log_softmax(deprel_scores, 1) # coba ga pake
      deprel_preds = deprel_scores.argmax(dim=1).view(batch_size, -1)
      deprel_preds = deprel_preds.masked_fill(word_mask[:, 1:], -1)
      dep_corr = (deprel_preds.view(-1) == deprel_target.view(-1))
      acc2 = dep_corr.sum().float() / float( deprel_target.view(-1).size(0) )


      acc = (acc1 + acc2)/2
    else:
      unlabeled_target = heads.masked_fill(word_mask[:, 1:], -1)
      deprel_target = deprels.masked_fill(word_mask[:, 1:], -1)

      loss = self.crit(unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2)), unlabeled_target.view(-1))
      loss += self.crit(deprel_scores.contiguous(), deprel_target.view(-1))
      
      loss /= sum(sentlens) #divided longest seq
      
      # calculate accuracy v1.0
      unlabeled_preds = F.log_softmax(unlabeled_scores, 2)
      unlabeled_preds = unlabeled_preds.argmax(dim=2)
      unlabeled_preds = unlabeled_preds.masked_fill(word_mask[:, 1:], -1)
      ul_corr = (unlabeled_preds.view(-1) == unlabeled_target.view(-1))
      acc1 = ul_corr.sum().float() / float( unlabeled_target.view(-1).size(0) )

      deprel_preds = F.log_softmax(deprel_scores, 1) # coba ga pake
      deprel_preds = deprel_scores.argmax(dim=1).view(batch_size, -1)
      deprel_preds = deprel_preds.masked_fill(word_mask[:, 1:], -1)
      dep_corr = (deprel_preds.view(-1) == deprel_target.view(-1))
      acc2 = dep_corr.sum().float() / float( deprel_target.view(-1).size(0) )


      acc = (acc1 + acc2)/2
      # predictions
      # unlabeled_preds = F.log_softmax(unlabeled_scores, 1)
      # unlabeled_preds = unlabeled_preds.argmax(dim=1)

      # deprel_preds = F.log_softmax(deprel_scores, 1)
      # deprel_preds = deprel_scores.argmax(dim=1)

      preds.append(unlabeled_preds.detach().cpu().numpy())
      preds.append(deprel_preds.detach().cpu().numpy())

    # del unlabeled_mask, unlabeled_scores, words, word_mask, postags, heads, deprels, deprel_scores, unlabeled_preds, deprel_preds, ul_corr, dep_corr, acc1, acc2
    # del rnn_inputs, rnn_outputs, h, inputs, sentlens, unlabeled_inputs
    torch.cuda.empty_cache()

    return loss, acc, preds

In [8]:
model = Model(args, train_data.vocabs)
model.cuda()

Model(
  (word_emb): Embedding(17263, 64)
  (pos_emb): Embedding(17, 30)
  (lemma_emb): Embedding(16430, 64)
  (GRU): GRU(158, 32, num_layers=16, batch_first=True, dropout=0.33, bidirectional=True)
  (unlabeled): DeepBiaffine(
    (MLP1): Linear(in_features=64, out_features=64, bias=True)
    (MLP2): Linear(in_features=64, out_features=64, bias=True)
    (drop): Dropout(p=0.33, inplace=False)
  )
  (deprel): DeepBiaffine(
    (MLP1): Linear(in_features=64, out_features=32, bias=True)
    (MLP2): Linear(in_features=64, out_features=32, bias=True)
    (drop): Dropout(p=0.33, inplace=False)
  )
  (crit): CrossEntropyLoss()
  (dropout): Dropout(p=0.33, inplace=False)
)



# Train Function

In [83]:
import matplotlib.pyplot as plt
%matplotlib inline
def plot_grad_flow(named_parameters):
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
          if p.grad is not None:
            # print(p.grad.abs().mean())
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
    # plt.figure(figsize=(10,8))
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(xmin=0, xmax=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    xmin, xmax, ymin, ymax = plt.axis()
    # print("ymin ", ymin)
    # plt.show()

In [9]:
import time
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
def train_model(args, model, train_data, eval_data=None, num_epochs=1, saved_name="saved_model.pt"):
  optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'])
  # scheduler = StepLR(optimizer, step_size=args['lr_scheduler_step_size'], gamma=args['lr_gamma'])
  train_loss_list = []
  train_acc_list = []
  val_loss_list = []
  val_acc_list = []
  print('start training...')
  start_time = time.time()
  best_valid_loss = float('inf')
  for epoch in range(num_epochs):
    start_time = time.time()
    # train
    train_loss = train_acc = 0
    with torch.enable_grad():
      model.train()
      for i, batch in enumerate(train_data):
        # model.zero_grad()
        optimizer.zero_grad()

        words, word_mask, lemmas, postags, heads, deprels, sentlens = batch
        loss, acc, _ = model(words, word_mask, lemmas, postags, heads, deprels, sentlens)
        train_loss += loss.item()
        train_acc += acc.item()
        # del acc
        # del words, word_mask, postags, heads, deprels, sentlens
        # del _
        loss.backward()
        # plot_grad_flow(model.named_parameters())

        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip'])
        optimizer.step()
        # del loss
        torch.cuda.empty_cache()

      elapsed_time = time.time() - start_time
      train_loss /= len(train_data)
      train_acc /= len(train_data)

    # eval
    eval_loss = eval_acc = 0
    if eval_data is not None:
      model.eval()
      with torch.no_grad():
        for i, batch in enumerate(eval_data):
          
          words, word_mask, lemmas, postags, heads, deprels, sentlens = batch
          loss, acc, _ = model(words, word_mask, lemmas, postags, heads, deprels, sentlens)
          eval_loss += loss.item()
          eval_acc += acc.item()

          torch.cuda.empty_cache()

        eval_loss /= len(eval_data)
        eval_acc /= len(eval_data)
    else:
      eval_loss = 0
      eval_acc = 0
    log = '|  {}/{} epoch  |  train_loss:{:.5f} | train_acc:{:2.2f} |  eval_loss:{:.5f} | eval_acc:{:2.2f} | time: {:.2f}  |'.format(
        epoch + 1, num_epochs, train_loss, train_acc * 100, eval_loss, eval_acc * 100, elapsed_time
    )

    # scheduler.step()
    train_data.reshuffle()
    print(log)
    if best_valid_loss > eval_loss and eval_loss != 0:
      torch.save(model.state_dict(), saved_name)
      best_valid_loss = eval_loss
      print("Save model")

  return

In [None]:
train_model(args, model, train_data, eval_data, num_epochs=30)
# train_model(args, model, train_data, num_epochs=30)

start training...
|  1/30 epoch  |  train_loss:5.63846 | train_acc:66.59 |  eval_loss:6.14070 | eval_acc:66.37 | time: 29.93  |
Save model
|  2/30 epoch  |  train_loss:5.15827 | train_acc:68.17 |  eval_loss:6.34891 | eval_acc:66.22 | time: 29.18  |
|  3/30 epoch  |  train_loss:4.91265 | train_acc:69.19 |  eval_loss:6.51920 | eval_acc:65.77 | time: 30.57  |
|  4/30 epoch  |  train_loss:4.52654 | train_acc:71.80 |  eval_loss:7.48101 | eval_acc:63.78 | time: 29.37  |
|  5/30 epoch  |  train_loss:4.09213 | train_acc:74.45 |  eval_loss:8.87441 | eval_acc:63.16 | time: 29.60  |
|  6/30 epoch  |  train_loss:3.75580 | train_acc:76.28 |  eval_loss:10.27306 | eval_acc:63.15 | time: 29.22  |
|  7/30 epoch  |  train_loss:3.48144 | train_acc:78.07 |  eval_loss:10.26734 | eval_acc:63.37 | time: 29.41  |
|  8/30 epoch  |  train_loss:3.21657 | train_acc:79.86 |  eval_loss:10.29980 | eval_acc:63.46 | time: 30.12  |
|  9/30 epoch  |  train_loss:3.03528 | train_acc:81.26 |  eval_loss:10.46266 | eval_acc: