<a href="https://colab.research.google.com/github/coll-j/IndonesianDepParse/blob/master/gegem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [1]:
%%bash
wget https://github.com/coll-j/IndonesianDepParse/blob/master/test.txt
pip install nlp-id

Collecting nlp-id
  Downloading https://files.pythonhosted.org/packages/08/10/afe2f49703d27600292130b5308139018231ae0b204ca12c33907f8294bb/nlp_id-0.1.9.8.tar.gz (7.5MB)
Collecting scikit-learn==0.22
  Downloading https://files.pythonhosted.org/packages/2e/d0/860c4f6a7027e00acff373d9f5327f4ae3ed5872234b3cbdd7bcb52e5eff/scikit_learn-0.22-cp36-cp36m-manylinux1_x86_64.whl (7.0MB)
Collecting nltk==3.4.5
  Downloading https://files.pythonhosted.org/packages/f6/1d/d925cfb4f324ede997f6d47bea4d9babba51b49e87a767c170b77005889d/nltk-3.4.5.zip (1.5MB)
Collecting wget==3.2
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: nlp-id, nltk, wget
  Building wheel for nlp-id (setup.py): started
  Building wheel for nlp-id (setup.py): finished with status 'done'
  Created wheel for nlp-id: filename=nlp_id-0.1.9.8-cp36-none-any.whl size=7723018 sha256=7f66d4f55cc59e6dce7bf18c144c142ebd

--2020-08-23 13:08:01--  https://github.com/coll-j/IndonesianDepParse/blob/master/test.txt
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘test.txt’

     0K .......... .......... .......... .......... ..........  920K
    50K .......... .......... .........                        1.37M=0.08s

2020-08-23 13:08:02 (1.03 MB/s) - ‘test.txt’ saved [81761]



In [None]:
import nltk
nltk.download('punkt')

from nlp_id.tokenizer import Tokenizer
from nlp_id.postag import PosTag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
FIELD_TO_IDX = {'idx': 0, 'word': 1, 'postag': 2, 'head': 3, 'deprel': 4}
def get_long_tensor(tokens_list, batch_size):
    """ Convert (list of )+ tokens to a padded LongTensor. """
    sizes = []
    x = tokens_list
    size = (max(len(y) for y in x))
    tokens = torch.LongTensor(batch_size, size).fill_(0)
    for i, s in enumerate(tokens_list):
      ten = torch.LongTensor(list(map(ord, s[:len(s)])))
      tokens[i, :len(s)] = ten
    return tokens

In [3]:
class Loader:
  
  def __init__(self, file_path, batch_size):
    self._file_path = file_path
    self.batch_size = batch_size

    self.data = self.load_file()
    self.vocabs = self.get_vocabs(self.data)
    self.data = self.chunk_batches(self.data)

  def load_file(self):
    sents, sent = [], []
    f = open(self._file_path)
    for line in f:
      line = line.strip()
      if len(line) == 0:
        if len(sent) > 0:
          sents.append(sent)
          sent = []

      else:
        cols = line.split('\t')
        sent += [cols]
    if len(sent) > 0:
      sents.append(sent)

    return sents

  def get_vocabs(self, data):
    words = [cols[FIELD_TO_IDX['word']] for sent in data for cols in sent]
    postags = [cols[FIELD_TO_IDX['postag']] for sent in data for cols in sent] 
    deprels = [cols[FIELD_TO_IDX['deprel']] for sent in data for cols in sent] 

    words = self.build_vocab(words)
    postags = self.build_vocab(postags)
    deprels = self.build_vocab(deprels)

    return {'words': words, 'postags': postags, 'deprels': deprels}

  def build_vocab(self, cols):
    w2i = {}
    for w in cols:
      if w not in w2i:
        w2i[w] = len(w2i)
    
    return w2i

  def chunk_batches(self, data):
    # TO DO: Chunk data into batches
    res, curr = [], []
    currlen = 0
    for sent in data:
      if len(sent[0]) + currlen > self.batch_size:
        if len(curr) > 0:
          res.append(curr)
          curr = []
          currlen = 0
      
      curr.append(sent)
      currlen += len(sent[0])
    if len(curr) > 0:
      res.append(curr)

    return res
    
  def get_batches(self, key):
    print('data[i]', self.data[key])
    batch = self.data[key]
    # convert to tensors
    words = [cols[FIELD_TO_IDX['word']] for sent in batch for cols in sent]
    words = get_long_tensor(words, len(words))
    words_mask = torch.eq(words, 0)

    postags = [cols[FIELD_TO_IDX['postag']] for sent in batch for cols in sent]
    postags = get_long_tensor(postags, len(postags))
    heads = [cols[FIELD_TO_IDX['head']] for sent in batch for cols in sent]
    heads = get_long_tensor(heads, len(heads))
    deprels = [cols[FIELD_TO_IDX['deprel']] for sent in batch for cols in sent]
    deprels = get_long_tensor(deprels, len(deprels))
    sentlens = [len(y) for y in batch]
    print('sentlens', sentlens)

    return words, words_mask, postags, heads, deprels, sentlens

  def __iter__(self):
    for i in range(len(self.data)):
      yield self.get_batches(i)

In [None]:
data = Loader('test.txt', 11)
for d in data:
  print(d)

data[i] [[['1', 'Banyak', 'NUM', '2', 'nummod'], ['2', 'buku', 'NN', '5', 'obj'], ['3', 'telah', 'ADV', '5', 'advmod'], ['4', 'saya', 'PR', '5', 'nsubj'], ['5', 'baca', 'VB', '0', 'root']], [['1', 'Buku', 'NN', '4', 'nsubj'], ['2', 'ini', 'PR', '1', 'nmod'], ['3', 'telah', 'ADV', '4', 'advmod'], ['4', 'dibaca', 'VB', '0', 'root']]]
sentlens [5, 4]
(tensor([[ 66,  97, 110, 121,  97, 107],
        [ 98, 117, 107, 117,   0,   0],
        [116, 101, 108,  97, 104,   0],
        [115,  97, 121,  97,   0,   0],
        [ 98,  97,  99,  97,   0,   0],
        [ 66, 117, 107, 117,   0,   0],
        [105, 110, 105,   0,   0,   0],
        [116, 101, 108,  97, 104,   0],
        [100, 105,  98,  97,  99,  97]]), tensor([[False, False, False, False, False, False],
        [False, False, False, False,  True,  True],
        [False, False, False, False, False,  True],
        [False, False, False, False,  True,  True],
        [False, False, False, False,  True,  True],
        [False, False, Fals

In [None]:
data.vocabs

{'deprels': {'advmod': 2,
  'appos': 6,
  'nmod': 5,
  'nsubj': 3,
  'nummod': 0,
  'obj': 1,
  'punct': 7,
  'root': 4},
 'postags': {'ADV': 2, 'IN': 5, 'NN': 1, 'NUM': 0, 'PR': 3, 'SYM': 6, 'VB': 4},
 'words': {'.': 14,
  'Banyak': 0,
  'Buku': 5,
  'Kita': 8,
  'baca': 4,
  'berada': 10,
  'buku': 1,
  'dalam': 12,
  'di': 11,
  'dibaca': 7,
  'ini': 6,
  'ruangan': 13,
  'saya': 3,
  'sedang': 9,
  'telah': 2}}

In [None]:
data.data

[[[['1', 'Banyak', 'NUM', '2', 'nummod'],
   ['2', 'buku', 'NN', '5', 'obj'],
   ['3', 'telah', 'ADV', '5', 'advmod'],
   ['4', 'saya', 'PR', '5', 'nsubj'],
   ['5', 'baca', 'VB', '0', 'root']],
  [['1', 'Buku', 'NN', '4', 'nsubj'],
   ['2', 'ini', 'PR', '1', 'nmod'],
   ['3', 'telah', 'ADV', '4', 'advmod'],
   ['4', 'dibaca', 'VB', '0', 'root']]],
 [[['1', 'Kita', 'PR', '3', 'nsubj'],
   ['2', 'sedang', 'ADV', '3', 'advmod'],
   ['3', 'berada', 'VB', '0', 'root'],
   ['4', 'di', 'IN', '6', 'appos'],
   ['5', 'dalam', 'NN', '6', 'advmod'],
   ['6', 'ruangan', 'NN', '3', 'obj'],
   ['7', '.', 'SYM', '3', 'punct']]]]

# Build Model

In [None]:
class Model(nn.Module):
  def __init__(self, args, vocab):
    super().__init__()

    self.args = args

    # input layer
    input_size = 0
    self.wordemb = nn.Embedding(len(vocab['words']), self.args['word_emb_dim'])
    input_size += self.args['word_emb_dim']
    self.posemb = nn.Embedding(len(vocab['postags']), self.args['pos_emb_dim'])
    input_size += self.args['pos_emb_dim']
    self.depemb = nn.Embedding(len(vocab['deprels']), self.args['dep_emb_dim'])

    # recurrent layer
    self.parserGRU = nn.GRU(input_size, self.args['hidden_dim'], self.args['num_layers'],\
                            batch_first=True, dropout=self.args['dropout'], bidirectional=True)
    self.GRU_h0 = nn.Parameter(torch.zeros(self.args['num_layers'] * 2),\
                               self.args['batch_size'], self.args['hidden_dim'])
    
    # classifier
    # TO DO: add MLP and Deep Biaffine for each head and deprel

    # criterion
    self.criterion = nn.CrossEntropyLoss()
    
    self.dropout = nn.Dropout(self.args['dropout'])
  def forward(self):
    #TODO: make forward func
    return