In [None]:
#@title Installation of Required Libraries
# Setup cell: installs the third-party packages via shell `pip`.
# NOTE(review): no versions are pinned, so a fresh run may resolve to newer
# releases than the ones this notebook was originally executed with.
!pip install datasets
!pip install conllu
!pip install evaluate
!pip install transformers
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
#@title Imports  Required
import torch
import torch.nn as nn
from functools import partial
from datasets import load_dataset, Dataset

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import gc

In [None]:
#@title Arc Eager Model

class ArcEager:
  """Arc-eager transition system working on token positions of one sentence.

  Attributes:
    sentence: the sequence handed to the constructor (only its length and, for
              printing, its items are used here).
    buffer:   positions not yet consumed, leftmost first.
    stack:    positions under construction; the top is the last element.
    arcs:     arcs[i] is the head position assigned to position i, -1 if none.

  None of the transitions validates its preconditions; callers must check that
  the stack/buffer are in a state where the transition makes sense.
  """

  def __init__(self, sentence):
    n_positions = len(sentence)
    self.sentence = sentence
    self.buffer = list(range(n_positions))
    self.stack = []
    self.arcs = [-1] * n_positions

    # Start with the first position already on the stack.
    self.shift()

  def shift(self):
    """Move the first buffer position onto the stack."""
    front = self.buffer[0]
    self.buffer = self.buffer[1:]
    self.stack.append(front)

  def left_arc(self):
    """Pop the stack top and attach it as a dependent of the first buffer position."""
    dependent = self.stack.pop()
    self.arcs[dependent] = self.buffer[0]

  def right_arc(self):
    """Attach the first buffer position to the stack top, then push it on the stack."""
    dependent = self.buffer[0]
    self.buffer = self.buffer[1:]
    # The head stays on the stack, directly below the new dependent.
    self.arcs[dependent] = self.stack[-1]
    self.stack.append(dependent)

  def reduce(self):
    """Discard the stack top."""
    self.stack.pop()

  def is_tree_final(self):
    """Return True when the buffer is empty and exactly one position is left on the stack."""
    return not self.buffer and len(self.stack) == 1

  def print_configuration(self):
    """Print the stack and buffer as sentence items, followed by the current arcs."""
    stack_items = [self.sentence[pos] for pos in self.stack]
    buffer_items = [self.sentence[pos] for pos in self.buffer]
    print(stack_items, buffer_items)
    print(self.arcs)

In [None]:
#@title Oracle

class Oracle:
  """Static oracle: tells which transition agrees with a gold tree.

  `parser` must expose `stack`, `buffer` and `arcs`; `gold_tree[i]` is the gold
  head position of position i.
  """

  def __init__(self, parser, gold_tree):
    self.parser = parser
    self.gold = gold_tree

  def is_left_arc_gold(self):
    """True when the gold head of the stack top is the first buffer position
    and that arc has not been built yet."""
    state = self.parser
    if not state.buffer:
      return False
    top = state.stack[-1]
    front = state.buffer[0]
    return self.gold[top] == front and state.arcs[top] != self.gold[top] and top != -1

  def is_right_arc_gold(self):
    """True when the gold head of the first buffer position is the stack top."""
    state = self.parser
    if not state.buffer:
      return False
    top = state.stack[-1]
    front = state.buffer[0]
    return self.gold[front] == top

  def is_shift_gold(self):
    """True when the buffer is non-empty and no other transition is gold."""
    if not self.parser.buffer:
      return False
    # Shift has the lowest precedence: it is gold only if nothing else is.
    return not (self.is_left_arc_gold() or self.is_right_arc_gold() or self.is_reduce_gold())

  def is_reduce_gold(self):
    """True when the stack top already has its head and all of its gold children."""
    if len(self.parser.stack) < 2:
      return False
    top = self.parser.stack[-1]
    return self.has_head(top) and self.has_all_children(top)

  def has_head(self, node):
    """True when the parser has already assigned a head to `node`."""
    return self.parser.arcs[node] != -1

  def has_all_children(self, node):
    """True when every gold dependent of `node` is already attached to it."""
    return all(
        self.parser.arcs[position] == node
        for position, gold_head in enumerate(self.gold)
        if gold_head == node
    )

In [None]:
#@title  functions  is_projective and dictionary

# the function returns whether a tree is projective or not. It is currently
# implemented inefficiently by brute checking every pair of arcs.
def is_projective(tree):
  """Return True when no two arcs of `tree` cross.

  `tree[i]` is the head position of position i; entries equal to -1 are
  treated as "no arc" and skipped. Every arc is compared against every other
  position, so the check is quadratic in the sentence length.
  """
  size = len(tree)
  for dependent, head in enumerate(tree):
    if head == -1:
      continue
    lo = min(dependent, head)
    hi = max(dependent, head)

    # A position left of the arc must not have its head strictly inside it.
    if any(lo < tree[k] < hi for k in range(0, lo)):
      return False
    # A position strictly inside the arc must not have its head outside it.
    if any(not (lo <= tree[k] <= hi) for k in range(lo + 1, hi)):
      return False
    # A position right of the arc must not have its head strictly inside it.
    if any(lo < tree[k] < hi for k in range(hi + 1, size)):
      return False

  return True

# the function creates a dictionary of word/index pairs: our embeddings vocabulary
# threshold is the minimum number of appearance for a token to be included in the embedding list
def create_dict(dataset, threshold=3):
  """Build the word -> embedding-index vocabulary.

  Counts every item of sample['new_tokens'] over `dataset` and keeps the words
  seen at least `threshold` times. Indices 0, 1 and 2 are reserved for
  "<pad>", "<ROOT>" and "<unk>"; kept words are numbered from 3 in order of
  first appearance.
  """
  counts = {}
  for sample in dataset:
    for token in sample['new_tokens']:
      counts[token] = counts.get(token, 0) + 1

  vocabulary = {"<pad>": 0, "<ROOT>": 1, "<unk>": 2}  # "<unk>" stands for words not in the list

  frequent_tokens = (token for token, seen in counts.items() if seen >= threshold)
  for index, token in enumerate(frequent_tokens, start=3):
    vocabulary[token] = index

  return vocabulary

In [None]:
#@title preapere batch and process sample

def prepare_batch(batch_data, get_gold_path=False, is_transformer=False):
  """Collate function: process every sample and regroup the results by field.

  Returns a tuple of parallel lists (one entry per sentence):
    (sentences, paths, moves, trees) when `is_transformer` is False, otherwise
    (sentences, paths, moves, trees, input_ids, connector, attention_mask).
  """
  processed = [
      process_sample(sample, get_gold_path=get_gold_path, is_transformer=is_transformer)
      for sample in batch_data
  ]
  # process_sample yields 4 fields for the BiLSTM path and 7 for the transformer path.
  n_fields = 4 if is_transformer is False else 7
  return tuple([row[field] for row in processed] for field in range(n_fields))

def process_sample(sample, get_gold_path = False, is_transformer = False):
  """Convert one dataset sample into the structures used by the models.

  Args:
    sample: mapping with 'new_tokens' and 'new_head' (heads may be strings);
      when `is_transformer` is true it must also provide 'word_ids',
      'input_ids' and 'attention_mask'.
    get_gold_path: when true, run the oracle and record the gold transition
      sequence (needed for training only).
    is_transformer: when true, also return the transformer inputs.

  Returns:
    (enc_sentence, gold_path, gold_moves, gold), extended with
    (input_ids, connector, attention_mask) when `is_transformer` is true.
    - enc_sentence: vocabulary ids of "<ROOT>" + tokens ("<unk>" id for unknown words).
    - gold_path: one [second-from-top, top, first-buffer] triple per parsing
      step; the buffer slot is -1 when the buffer is empty. With a single
      element on the stack the first slot repeats the stack top, because
      index len-2 wraps around to -1.
    - gold_moves: move id per step: 0 left-arc, 1 right-arc, 2 shift, 3 reduce.
    - gold: gold heads with -1 prepended for ROOT.
    - connector: for each sentence position, an index into the sub-token sequence.

  Raises:
    ValueError: if the oracle finds no gold transition for a configuration
      (e.g. the gold tree is not projective). Without this check the loop
      would never terminate.
  """

  # put sentence and gold tree in our format
  sentence = ["<ROOT>"] + sample["new_tokens"]
  gold = [-1] + [int(i) for i in sample["new_head"]]  # heads in the gold tree are strings, we convert them to int

  # embedding ids of sentence words
  unk_id = emb_dictionary["<unk>"]
  enc_sentence = [emb_dictionary.get(word, unk_id) for word in sentence]

  # gold_path and gold_moves are parallel arrays whose elements refer to parsing steps
  gold_path = []   # two topmost stack positions and first buffer position for each step
  gold_moves = []  # oracle move for each step: 0 is left-arc, 1 right-arc, 2 shift, 3 reduce

  if get_gold_path:  # only for training
    parser = ArcEager(sentence)
    oracle = Oracle(parser, gold)

    while not parser.is_tree_final():

      # save configuration
      configuration = [
          parser.stack[len(parser.stack)-2],
          parser.stack[len(parser.stack)-1],
          parser.buffer[0] if len(parser.buffer) > 0 else -1,
      ]
      gold_path.append(configuration)

      # save gold move and apply it
      if oracle.is_left_arc_gold():
        gold_moves.append(0)
        parser.left_arc()
      elif oracle.is_right_arc_gold():
        gold_moves.append(1)
        parser.right_arc()
      elif oracle.is_shift_gold():
        gold_moves.append(2)
        parser.shift()
      elif oracle.is_reduce_gold():
        gold_moves.append(3)
        parser.reduce()
      else:
        # No transition is gold: the parser state would never change again.
        raise ValueError(
            "oracle found no gold transition for configuration {} "
            "(is the gold tree projective?)".format(configuration))

  if is_transformer is False:
    return enc_sentence, gold_path, gold_moves, gold

  # connector[k] is the sub-token index used to represent sentence position k.
  # Position 0 (ROOT) is hard-coded to 1; word i maps to the first entry of
  # sample["word_ids"] equal to i.
  # NOTE(review): if word_ids starts with a special-token entry, ROOT and the
  # first word both map to index 1 -- confirm this is intended.
  connector = [1]
  for i in range(len(sample["new_tokens"])):
    connector.append(sample["word_ids"].index(i))
  return enc_sentence, gold_path, gold_moves, gold, sample["input_ids"], connector, sample["attention_mask"]

In [None]:
#@title Loading Data set and seperate
# NOTE(review): `dataset` loads the same train split as `train_dataset` below
# and is not used anywhere in the visible part of the notebook.
dataset = load_dataset('universal_dependencies', 'en_lines', split="train")


# One Dataset object per split of the en_lines configuration.
train_dataset = load_dataset('universal_dependencies', 'en_lines', split="train")
dev_dataset = load_dataset('universal_dependencies', 'en_lines', split="validation")
test_dataset = load_dataset('universal_dependencies', 'en_lines', split="test")

# info about dataset
print("Dataset lenght:", len(train_dataset)+len(dev_dataset)+len(test_dataset))
print("Keys: ", train_dataset[1].keys())

# Number of entries in the 'tokens' field of every training sentence.
sent_len = [len(sentence) for sentence in train_dataset['tokens']]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading metadata: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading and preparing dataset universal_dependencies/en_lines to /root/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/580k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/199k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1032 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1035 [00:00<?, ? examples/s]

Dataset universal_dependencies downloaded and prepared to /root/.cache/huggingface/datasets/universal_dependencies/en_lines/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7. Subsequent calls will reuse this data.




Dataset lenght: 5243
Keys:  dict_keys(['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'])


In [None]:
#@title Extraction
def fillNone(x):
  """Add 'new_head' and 'new_tokens' to `x`, dropping entries whose head is 'None'.

  Both new lists keep the original order and stay aligned with each other.
  The sample is modified in place and also returned.
  """
  kept = [index for index, head in enumerate(x['head']) if head != 'None']
  x['new_head'] = [x['head'][index] for index in kept]
  x['new_tokens'] = [x['tokens'][index] for index in kept]
  return x
# Add the filtered 'new_head' / 'new_tokens' fields to every split.
train_dataset = train_dataset.map(fillNone)
dev_dataset = dev_dataset.map(fillNone)
test_dataset = test_dataset.map(fillNone)
# Keep only training sentences whose gold tree is projective (the -1 prepended
# here is the head of the artificial ROOT position).
# NOTE: this rebinds train_dataset to a plain Python list, so running this
# cell a second time without reloading the data fails on `.map`.
train_dataset =[sample for sample in train_dataset if is_projective([-1] + [int(head) for head in sample["new_head"]])]

# create the embedding dictionary
emb_dictionary = create_dict(train_dataset)

print("***Number of samples***")
print("Train (filtered):\t", len(train_dataset)) #(train is the number of samples without the non-projective)
print("Dev:\t", len(dev_dataset))
print("Test:\t", len(test_dataset))

Map:   0%|          | 0/3176 [00:00<?, ? examples/s]

Map:   0%|          | 0/1032 [00:00<?, ? examples/s]

Map:   0%|          | 0/1035 [00:00<?, ? examples/s]

***Number of samples***
Train (filtered):	 2922
Dev:	 1032
Test:	 1035


In [None]:
#@title Parameters
# Hyper-parameters of the BiLSTM parser; read as module-level globals by Net
# and by the dataloader / training cells.
EMBEDDING_SIZE = 200  # width of the word embeddings
LSTM_SIZE = 200       # hidden size of each LSTM direction
LSTM_LAYERS = 1       # number of stacked LSTM layers
MLP_SIZE = 200        # hidden size of the feed-forward scorer
DROPOUT = 0.2         # dropout probability
EPOCHS = 15           # number of training epochs
LR = 0.001   # learning rate
BATCH_SIZE=16         # sentences per batch

In [None]:
#@title Dataloaders for the NN
# Only the training loader asks prepare_batch for the oracle transition
# sequence (get_gold_path=True); dev/test batches carry empty paths and moves.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=partial(prepare_batch, get_gold_path=True))
# partial(prepare_batch) with no bound arguments behaves like prepare_batch itself.
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=partial(prepare_batch))
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=partial(prepare_batch))

In [None]:
#@title Model for Bi-LSTM
class Net(nn.Module):
  """BiLSTM encoder followed by an MLP that scores arc-eager transitions.

  Construction reads the module-level names emb_dictionary, EMBEDDING_SIZE,
  LSTM_SIZE, LSTM_LAYERS, MLP_SIZE and DROPOUT.

  Move ids used throughout: 0 left-arc, 1 right-arc, 2 shift, 3 reduce.
  """

  # For every predicted move id: the transitions to try, in order, until one
  # is applicable in the current parser state.
  _FALLBACKS = {
      0: ("left_arc", "right_arc", "reduce", "shift"),
      1: ("right_arc", "left_arc", "reduce", "shift"),
      2: ("shift", "left_arc", "right_arc", "reduce"),
      3: ("reduce", "left_arc", "right_arc", "shift"),
  }

  def __init__(self, device):
    super(Net, self).__init__()
    self.device = device
    self.embeddings = nn.Embedding(len(emb_dictionary), EMBEDDING_SIZE, padding_idx=emb_dictionary["<pad>"])

    # initialize bi-LSTM
    # nn.LSTM applies dropout only between stacked layers; with a single layer
    # a non-zero value has no effect and only triggers a warning.
    lstm_dropout = DROPOUT if LSTM_LAYERS > 1 else 0.0
    self.lstm = nn.LSTM(EMBEDDING_SIZE, LSTM_SIZE, num_layers = LSTM_LAYERS, bidirectional=True, dropout=lstm_dropout)

    # initialize feedforward: three concatenated bi-LSTM states -> 4 move scores
    self.w1 = torch.nn.Linear(6*LSTM_SIZE, MLP_SIZE, bias=True)
    self.activation = torch.nn.Tanh()
    self.w2 = torch.nn.Linear(MLP_SIZE, 4, bias=True)
    # Not applied inside mlp(); kept so callers can turn scores into
    # probabilities with model.softmax(scores).
    self.softmax = torch.nn.Softmax(dim=-1)

    self.dropout = torch.nn.Dropout(DROPOUT)


  def forward(self, x, paths):
    """Score every configuration in `paths`.

    Args:
      x: list of sentences, each a list of vocabulary ids.
      paths: per sentence, a list of [s2, s1, b1] position triples (-1 = empty slot).

    Returns:
      Tensor with one row of 4 unnormalised move scores per configuration,
      sentences concatenated in batch order.
    """
    # get the embeddings
    x = [self.dropout(self.embeddings(torch.tensor(i).to(self.device))) for i in x]

    # run the bi-lstm
    h = self.lstm_pass(x)

    # for each parser configuration that we need to score we arrange from the
    # output of the bi-lstm the correct input for the feedforward
    mlp_input = self.get_mlp_input(paths, h)

    # run the feedforward and get the scores for each possible action
    return self.mlp(mlp_input)

  def lstm_pass(self, x):
    """Run the bi-LSTM over a list of variable-length embedded sentences."""
    x = torch.nn.utils.rnn.pack_sequence(x, enforce_sorted=False)
    h, (h_0, c_0) = self.lstm(x)
    h, h_sizes = torch.nn.utils.rnn.pad_packed_sequence(h) # size h: (length_sentences, batch, output_hidden_units)
    return h

  def get_mlp_input(self, configurations, h):
    """Concatenate the encoder states of the three positions of every configuration.

    A position equal to -1 contributes a zero vector.
    """
    mlp_input = []
    zero_tensor = torch.zeros(2*LSTM_SIZE, requires_grad=False).to(self.device)
    for i, sentence_confs in enumerate(configurations): # for every sentence in the batch
      for conf in sentence_confs: # for each configuration of a sentence
        mlp_input.append(torch.cat([
            zero_tensor if pos == -1 else h[pos][i]
            for pos in (conf[0], conf[1], conf[2])
        ]))
    mlp_input = torch.stack(mlp_input).to(self.device)
    return mlp_input

  def mlp(self, x):
    """Return unnormalised scores (logits) for the 4 moves.

    No softmax is applied here: nn.CrossEntropyLoss, which the training loop
    uses, expects raw logits and applies log-softmax itself. Feeding it
    probabilities squashes the gradients and bounds the loss away from zero.
    argmax-based decoding is unaffected.
    """
    return self.w2(self.dropout(self.activation(self.w1(self.dropout(x)))))

  # we use this function at inference time. We run the parser and at each step
  # we pick as next move the one with the highest score assigned by the model
  def infere(self, x):
    """Greedily parse every sentence of `x`; return one head list per sentence."""
    parsers = [ArcEager(i) for i in x]

    x = [self.embeddings(torch.tensor(i).to(self.device)) for i in x]

    h = self.lstm_pass(x)

    while not self.parsed_all(parsers):
      # get the current configuration and score next moves
      configurations = self.get_configurations(parsers)
      mlp_input = self.get_mlp_input(configurations, h)
      mlp_out = self.mlp(mlp_input)
      # take the next parsing step
      self.parse_step(parsers, mlp_out)

    # return the predicted dependency tree
    return [parser.arcs for parser in parsers]

  def get_configurations(self, parsers):
    """Return, per parser, a one-element list with its current [s2, s1, b1] triple.

    Finished parsers yield [-1, -1, -1]. With a single element on the stack
    the first slot repeats the stack top (index len-2 wraps to -1).
    """
    configurations = []

    for parser in parsers:
      if parser.is_tree_final():
        conf = [-1, -1, -1]
      else:
        conf = [
            parser.stack[len(parser.stack)-2],
            parser.stack[len(parser.stack)-1],
            parser.buffer[0] if len(parser.buffer) > 0 else -1,
        ]
      configurations.append([conf])

    return configurations

  def parsed_all(self, parsers):
    """True when every parser has reached its final configuration."""
    return all(parser.is_tree_final() for parser in parsers)

  @staticmethod
  def _can_apply(parser, transition):
    """Tell whether `transition` may be applied in the parser's current state."""
    has_input = len(parser.buffer) > 0
    if transition == "left_arc":
      # never pop the ROOT (position 0) with a left arc
      return parser.stack[-1] != 0 and has_input
    if transition == "right_arc":
      return len(parser.stack) >= 2 and has_input
    if transition == "reduce":
      return len(parser.stack) >= 2
    return has_input  # shift

  # In this function we select and perform the next move according to the scores obtained.
  # The parser itself does not validate transitions, so the checks are done here:
  # e.g. no shift with an empty buffer, no left arc that would pop the ROOT.
  def parse_step(self, parsers, moves):
    """Apply to each unfinished parser its best-scoring applicable transition.

    `moves` holds one row of 4 scores per parser. When the top-scoring move is
    not applicable, the alternatives listed in _FALLBACKS are tried in order.
    """
    predicted = moves.argmax(-1).tolist()
    for parser, move in zip(parsers, predicted):
      if parser.is_tree_final():
        continue
      for transition in self._FALLBACKS[move]:
        if self._can_apply(parser, transition):
          getattr(parser, transition)()
          break

In [None]:
#@title Functions for training and evaluation for BILSTM

# Evaluation
def evaluate(gold, preds):
  """Unlabeled attachment score: fraction of positions whose predicted head is correct.

  `gold` and `preds` are parallel lists of head lists. Position 0 of every
  sentence (the artificial ROOT) is not scored. Returns 0.0 when there is
  nothing to score instead of dividing by zero.
  """
  total = 0
  correct = 0

  for g, p in zip(gold, preds):
    for i in range(1, len(g)):
      total += 1
      if g[i] == p[i]:
        correct += 1

  if total == 0:
    return 0.0
  return correct/total

# Training
def train(model, dataloader, criterion, optimizer):
  """Run one training epoch and return the mean loss per batch.

  Each batch is (sentences, paths, moves, trees); `moves` holds the gold move
  ids per sentence and is flattened to line up with the rows the model returns.
  Returns 0.0 when the dataloader yields no batch.
  """
  model.train()
  total_loss = 0
  count = 0

  for batch in dataloader:
    optimizer.zero_grad()
    sentences, paths, moves, trees = batch

    out = model(sentences, paths)
    # Flatten the per-sentence move lists; put the labels on the same device
    # as the model output rather than relying on a global `device`.
    flat_moves = [move for sentence_moves in moves for move in sentence_moves]
    labels = torch.tensor(flat_moves).to(out.device)
    loss = criterion(out, labels)

    count += 1
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

  if count == 0:
    return 0.0
  return total_loss/count

# Testing
def test(model, dataloader):
  """Parse every batch with `model.infere` and return the score from `evaluate`.

  The model is switched to eval mode and inference runs without gradients.
  """
  model.eval()

  gold, preds = [], []

  for sentences, paths, moves, trees in dataloader:
    with torch.no_grad():
      predicted_trees = model.infere(sentences)

      gold += trees
      preds += predicted_trees

  return evaluate(gold, preds)

## Information Part
1. Dataset analysis: When analyzing a treebank dataset, we consider:

  1.Size: The treebank contains X sentences, Y words, and Z unique words, providing an understanding of the dataset's scale.
  2.Sentence Length Distribution: An analysis of sentence length distribution aids in model decisions, particularly concerning the maximum sequence length.
  3.Part-of-speech (POS) Distribution: Examining the distribution of POS tags uncovers the linguistic composition of the dataset.
  4.Tree Depth: The distribution of tree depths in the treebank indicates the complexity level of the sentences.
  5.Syntactic Relations: Investigating the frequency of various syntactic relations helps illuminate the grammar structure within the treebank.
  6.Outliers: Identifying outliers such as unusually long or short sentences or rare POS sequences can highlight unique aspects of the data.
  7.Source and Collection Methodology: The origin and collection process of the treebank significantly influence its linguistic characteristics and should be taken into account.

2.Description of baseline model and BERT-based model.

  Baseline Model: A simple model used as a reference point for comparing the performance of more advanced models.
  BERT-based Model: A sophisticated model for natural language processing tasks that uses context from surrounding words for understanding and is often fine-tuned for specific tasks.

3.Data set-up and training

  We chose English as the language because more resources are available for it. This made it easier to analyze and compare results, and helped us plan our roadmap during the research.



In [None]:
#@title Train BİLSTM
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes
# CUDA calls synchronous and slows training -- confirm it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# `device` is a module-level global; train() reads it when building the labels.
device = torch.device( "cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
model = Net(device)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


# One pass over the training data per epoch, then the score on the dev split.
for epoch in range(EPOCHS):
  avg_train_loss = train(model, train_dataloader, criterion, optimizer)
  val_uas = test(model, dev_dataloader)

  print("Epoch: {:3d} | avg_train_loss: {:5.3f} | dev_uas: {:5.3f} |".format( epoch, avg_train_loss, val_uas))

Device: cuda




Epoch:   0 | avg_train_loss: 1.021 | dev_uas: 0.562 |
Epoch:   1 | avg_train_loss: 0.933 | dev_uas: 0.613 |
Epoch:   2 | avg_train_loss: 0.910 | dev_uas: 0.639 |
Epoch:   3 | avg_train_loss: 0.894 | dev_uas: 0.647 |
Epoch:   4 | avg_train_loss: 0.882 | dev_uas: 0.650 |
Epoch:   5 | avg_train_loss: 0.874 | dev_uas: 0.665 |
Epoch:   6 | avg_train_loss: 0.866 | dev_uas: 0.670 |
Epoch:   7 | avg_train_loss: 0.861 | dev_uas: 0.670 |
Epoch:   8 | avg_train_loss: 0.855 | dev_uas: 0.682 |
Epoch:   9 | avg_train_loss: 0.849 | dev_uas: 0.686 |
Epoch:  10 | avg_train_loss: 0.841 | dev_uas: 0.681 |
Epoch:  11 | avg_train_loss: 0.813 | dev_uas: 0.691 |
Epoch:  12 | avg_train_loss: 0.809 | dev_uas: 0.695 |
Epoch:  13 | avg_train_loss: 0.805 | dev_uas: 0.695 |
Epoch:  14 | avg_train_loss: 0.802 | dev_uas: 0.694 |


In [None]:
#@title BiLSTM evaluation
# Score of the trained BiLSTM parser on the held-out test split.
test_uas = test(model, test_dataloader)
print("test_uas: {:5.3f}".format( test_uas))

test_uas: 0.687


BERT MODEL

In [None]:
#@title Parameters for BERT
# NOTE: MLP_SIZE, DROPOUT, EPOCHS, LR and BATCH_SIZE rebind the globals set in
# the BiLSTM "Parameters" cell; re-running BiLSTM cells after this one picks
# up these values.
MLP_SIZE = 200      # hidden size of the feed-forward scorer
DROPOUT = 0.2       # dropout probability
EPOCHS = 4          # number of training epochs
# NOTE(review): 1e-3 is high for fine-tuning a pretrained transformer -- confirm.
LR = 0.001
BATCH_SIZE = 8      # sentences per batch
OUT_FEATURES = 768  # width of one BERT token representation as used by BERTNet

In [None]:
#@title BERT MODEL
#confguration of net model bert instead of bilstm
from transformers import BertModel, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding
#@title Class
from transformers import BertModel, TrainingArguments, Trainer, AutoTokenizer, DataCollatorWithPadding
class BERTNet(nn.Module):
  """BERT encoder followed by an MLP that scores arc-eager transitions.

  Construction reads the module-level names OUT_FEATURES, MLP_SIZE and DROPOUT.

  Move ids used throughout: 0 left-arc, 1 right-arc, 2 shift, 3 reduce.
  """

  # Transition name -> move id (column index in the confusion matrix).
  _MOVE_ID = {"left_arc": 0, "right_arc": 1, "shift": 2, "reduce": 3}

  # For every predicted move id: the transitions to try, in order, until one
  # is applicable in the current parser state.
  _FALLBACKS = {
      0: ("left_arc", "right_arc", "reduce", "shift"),
      1: ("right_arc", "left_arc", "reduce", "shift"),
      2: ("shift", "left_arc", "right_arc", "reduce"),
      3: ("reduce", "left_arc", "right_arc", "shift"),
  }

  def __init__(self, device):
    super(BERTNet, self).__init__()
    self.device = device

    # initialize BERT
    self.bert = BertModel.from_pretrained("bert-base-multilingual-uncased", output_hidden_states=True)
    # Freeze the first 3 encoder layers
    for param in self.bert.encoder.layer[:3].parameters():
      param.requires_grad = False
    # initialize feedforward: three concatenated token states -> 4 move scores
    self.w1 = torch.nn.Linear(3*OUT_FEATURES, MLP_SIZE, bias=True)
    self.activation = torch.nn.Tanh()
    self.w2 = torch.nn.Linear(MLP_SIZE, 4, bias=True)
    # Not applied inside mlp(); kept so callers can turn scores into
    # probabilities with model.softmax(scores).
    self.softmax = torch.nn.Softmax(dim=-1)

    self.dropout = torch.nn.Dropout(DROPOUT)


  def forward(self, x, paths, connector, attention_mask):
    """Score every configuration in `paths`.

    Args:
      x: batch of sub-token id sequences (equal length).
      paths: per sentence, a list of [s2, s1, b1] position triples (-1 = empty slot).
      connector: per sentence, maps a sentence position to a sub-token index.
      attention_mask: mask aligned with `x`.

    Returns:
      Tensor with one row of 4 unnormalised move scores per configuration.
    """
    # run BERT
    h = self.bert_pass(x, attention_mask)

    # for each parser configuration that we need to score we arrange from the
    # output of BERT the correct input for the feedforward
    mlp_input = self.get_mlp_input(paths, h, connector)

    # run the feedforward and get the scores for each possible action
    return self.mlp(mlp_input)

  def bert_pass(self, x, attention):
    """Encode the batch with BERT and return the sum of the last 4 hidden layers.

    The result is laid out as (sequence position, batch, hidden).
    """
    input_ids = torch.as_tensor(x).to(self.device)
    attention_mask = torch.as_tensor(attention).to(self.device)

    # The mask must be passed explicitly; without it BERT attends to the
    # padding positions as if they were real tokens.
    h = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    summed_last_4_layers = torch.stack(h.hidden_states[-4:]).sum(0)

    # (batch, len_sent, hid) -> (len_sent, batch, hid)
    return summed_last_4_layers.permute(1,0,2)

  def get_mlp_input(self, configurations, h, connector):
    """Concatenate the encoder states of the three positions of every configuration.

    Sentence positions are translated to sub-token indices through
    `connector`; a position equal to -1 contributes a zero vector.
    """
    mlp_input = []
    zero_tensor = torch.zeros(OUT_FEATURES, requires_grad=False).to(self.device)
    for i, sentence_confs in enumerate(configurations): # for every sentence in the batch
      for conf in sentence_confs: # for each configuration of a sentence
        mlp_input.append(torch.cat([
            zero_tensor if pos == -1 else h[ connector[i][pos] ][i]
            for pos in (conf[0], conf[1], conf[2])
        ]))
    mlp_input = torch.stack(mlp_input).to(self.device)
    return mlp_input

  def mlp(self, x):
    """Return unnormalised scores (logits) for the 4 moves.

    No softmax is applied here: nn.CrossEntropyLoss, which the training loop
    uses, expects raw logits and applies log-softmax itself. argmax-based
    decoding is unaffected.
    """
    return self.w2(self.dropout(self.activation(self.w1(self.dropout(x)))))

  # we use this function at inference time. We run the parser and at each step
  # we pick as next move the one with the highest score assigned by the model
  def infere(self, x, sentences, attention, connector, return_confusion = False):
    """Greedily parse the batch.

    Returns one head list per sentence, or, when `return_confusion` is not
    False, a 4x4 matrix counting (predicted move, move actually applied).
    """
    want_confusion = return_confusion is not False
    parsers = [ArcEager(i) for i in sentences]

    h = self.bert_pass(x, attention)
    confusion = np.zeros((4,4))
    while not self.parsed_all(parsers):
      # get the current configuration and score next moves
      configurations = self.get_configurations(parsers)
      mlp_input = self.get_mlp_input(configurations, h, connector)
      mlp_out = self.mlp(mlp_input)
      # take the next parsing step
      step_confusion = self.parse_step(parsers, mlp_out, return_confusion = want_confusion)
      if want_confusion:
        confusion += step_confusion

    if want_confusion:
      return confusion
    # return the predicted dependency tree
    return [parser.arcs for parser in parsers]

  def get_configurations(self, parsers):
    """Return, per parser, a one-element list with its current [s2, s1, b1] triple.

    Finished parsers yield [-1, -1, -1]. With a single element on the stack
    the first slot repeats the stack top (index len-2 wraps to -1).
    """
    configurations = []

    for parser in parsers:
      if parser.is_tree_final():
        conf = [-1, -1, -1]
      else:
        conf = [
            parser.stack[len(parser.stack)-2],
            parser.stack[len(parser.stack)-1],
            parser.buffer[0] if len(parser.buffer) > 0 else -1,
        ]
      configurations.append([conf])

    return configurations

  def parsed_all(self, parsers):
    """True when every parser has reached its final configuration."""
    return all(parser.is_tree_final() for parser in parsers)

  @staticmethod
  def _can_apply(parser, transition):
    """Tell whether `transition` may be applied in the parser's current state."""
    has_input = len(parser.buffer) > 0
    if transition == "left_arc":
      # never pop the ROOT (position 0) with a left arc
      return parser.stack[-1] != 0 and has_input
    if transition == "right_arc":
      return len(parser.stack) >= 2 and has_input
    if transition == "reduce":
      return len(parser.stack) >= 2
    return has_input  # shift

  # In this function we select and perform the next move according to the scores obtained.
  # We need to be careful and select correct moves, e.g. don't do a shift if the buffer
  # is empty or a reduce if we only have the ROOT.
  def parse_step(self, parsers, moves, return_confusion = False):
    """Apply to each unfinished parser its best-scoring applicable transition.

    `moves` holds one row of 4 scores per parser. When the top-scoring move is
    not applicable, the alternatives listed in _FALLBACKS are tried in order.

    Returns None, or, when `return_confusion` is truthy, a 4x4 matrix where
    entry [predicted, applied] counts the transitions taken in this step.
    """
    predicted = moves.argmax(-1).tolist()
    confusion = np.zeros((4,4))
    for parser, move in zip(parsers, predicted):
      if parser.is_tree_final():
        continue
      for transition in self._FALLBACKS[move]:
        if self._can_apply(parser, transition):
          getattr(parser, transition)()
          confusion[move, self._MOVE_ID[transition]] += 1
          break
    if return_confusion:
      return confusion
    return None

In [None]:
#@title Functions for training and evaluation for BERT

# Evaluation
def evaluate_bert (gold, preds):
  """Return the fraction of scored positions whose predicted head equals the gold head.

  Position 0 of every sentence is never scored (the comparison starts at
  index 1); presumably that slot holds the artificial ROOT entry -- confirm
  against the code that builds the trees. Sentences are paired with ``zip``,
  so surplus sentences in the longer of the two inputs are ignored.

  Args:
    gold: iterable of per-sentence sequences of gold heads.
    preds: iterable of per-sentence sequences of predicted heads; each one
      must be at least as long as the matching gold sequence.

  Returns:
    ``correct / total`` over all scored positions, or ``0.0`` when there is
    nothing to score (empty input or sentences that only contain index 0).
    This case used to raise ``ZeroDivisionError``.
  """
  total = 0
  correct = 0
  for g, p in zip(gold, preds):
    # Index 0 is excluded from scoring, hence the "- 1" and the range start.
    total += max(len(g) - 1, 0)
    correct += sum(1 for i in range(1, len(g)) if g[i] == p[i])

  # Nothing to score: report 0.0 instead of dividing by zero.
  if total == 0:
    return 0.0
  return correct/total

# Training
def train_bert (model, dataloader, criterion, optimizer):
  """Run one training epoch and return the mean per-batch loss.

  Label tensors are moved to the module-level ``device``, which must be
  defined before this function processes a batch.

  Args:
    model: network called as
      ``model(indices_ids, paths, connector, attention_mask)``.
    dataloader: iterable yielding 7-tuples
      ``(sentences, paths, moves, trees, indices_ids, connector, attention_mask)``.
      ``moves`` is flattened into a single label sequence, so it is assumed
      to be a list of per-sentence lists -- confirm against the collate
      function.
    criterion: loss callable applied as ``criterion(out, labels)``.
    optimizer: optimizer exposing ``zero_grad()`` and ``step()``.

  Returns:
    The average of ``loss.item()`` over all batches, or ``0.0`` when the
    dataloader yields no batch (this used to raise ``ZeroDivisionError``).
  """
  model.train()
  total_loss = 0
  count = 0

  for batch in dataloader:
    optimizer.zero_grad()
    sentences, paths, moves, trees, indices_ids , connector, attention_mask = batch

    out = model(indices_ids, paths, connector, attention_mask)
    # Flatten in linear time; sum(moves, []) re-copies the growing list on every addition.
    flat_moves = [move for sentence_moves in moves for move in sentence_moves]
    labels = torch.tensor(flat_moves).to(device)
    loss = criterion(out, labels)

    count +=1
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

  # An empty dataloader has no loss to average.
  if count == 0:
    return 0.0
  return total_loss/count

# Testing
def test_bert (model, dataloader,return_confusion=False):
  model.eval()

  gold = []
  preds = []
  confusion = np.zeros((4,4))
  for batch in dataloader:
    sentences, paths, moves, trees, indices_ids , connector, attention_mask = batch
    with torch.no_grad():
      if return_confusion is False:
        pred = model.infere(indices_ids, sentences, attention_mask, connector)

        gold += trees
        preds += pred
      else:
        confusion+= model.infere(indices_ids, sentences, attention_mask, connector, return_confusion=return_confusion)
  if return_confusion is False:
    return evaluate_bert(gold, preds)
  else:
    return confusion

In [None]:
#@title Util function for tokenization
def segment_and_match_labels(example):
    """Filter one example and tokenize the remaining words.

    Entries whose head is the string 'None' are dropped together with the
    token at the same position (presumably rows without a syntactic head,
    such as multiword-token ranges -- confirm against the dataset). The kept
    tokens and heads are stored on ``example`` under 'new_tokens' and
    'new_head', then the tokens are passed to the module-level ``tokenizer``
    as pre-split words, truncated and padded to the maximum length.

    Returns a dict with 'input_ids', 'attention_mask', 'word_ids',
    'new_tokens' and 'new_head'.
    """
    kept_heads = example['new_head'] = []
    kept_tokens = example['new_tokens'] = []
    for position, head in enumerate(example['head']):
        if head == 'None':
            continue
        kept_heads.append(head)
        kept_tokens.append(example['tokens'][position])

    # Sub-word tokenization of the already split words.
    encoding = tokenizer(kept_tokens, truncation=True, is_split_into_words=True, padding='max_length')

    # word_ids maps every sub-token back to its word index (None for special
    # tokens), e.g. [None, 0, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6, None]
    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'word_ids': encoding.word_ids(),
        'new_tokens': kept_tokens,
        'new_head': kept_heads,
    }

# Load the train / validation / test splits of the 'en_lines' configuration
# of the 'universal_dependencies' dataset, in that order.
train_dataset, dev_dataset, test_dataset = (
    load_dataset('universal_dependencies', 'en_lines', split=split_name)
    for split_name in ("train", "validation", "test")
)



In [None]:
#@title Setup
import evaluate

# Tokenizer of the "bert-base-multilingual-uncased" checkpoint.
# segment_and_match_labels reads this module-level name when it is called,
# so this cell has to run before the datasets are mapped.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
#@title Data Preparation
from torch.utils.data import DataLoader
# Add the tokenizer outputs and the filtered tokens/heads to every split.
train_dataset, dev_dataset, test_dataset = (
    split.map(segment_and_match_labels)
    for split in (train_dataset, dev_dataset, test_dataset)
)

# Remove non projective trees (training split only); a -1 is prepended to the
# integer heads before the projectivity check.
# NOTE: train_dataset becomes a plain Python list here, so this cell cannot be
# re-run without reloading the datasets first (a list has no .map).
train_dataset = [
    example
    for example in train_dataset
    if is_projective([-1] + [int(head) for head in example["new_head"]])
]

# Create the dataloaders; only the training loader asks for the gold paths.
train_dataloader_bert = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=partial(prepare_batch, get_gold_path=True, is_transformer=True),
)
dev_dataloader_bert = DataLoader(
    dev_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=partial(prepare_batch, is_transformer=True),
)
test_dataloader_bert = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=partial(prepare_batch, is_transformer=True),
)

Map:   0%|          | 0/3176 [00:00<?, ? examples/s]

Map:   0%|          | 0/1032 [00:00<?, ? examples/s]

Map:   0%|          | 0/1035 [00:00<?, ? examples/s]

In [None]:
#@title Training

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# train_bert reads this module-level `device` when it builds the label tensors.
device = torch.device( "cuda" if torch.cuda.is_available() else "cpu")
transformer = BERTNet(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=LR)
transformer.to(device)

# One pass over the training data per epoch, followed by scoring on the dev split.
for epoch in range(EPOCHS):
  avg_train_loss = train_bert (transformer, train_dataloader_bert, criterion, optimizer)
  val_uas = test_bert (transformer, dev_dataloader_bert)

  # Release cached GPU memory and run the garbage collector between epochs;
  # `_` only discards the collector's return value.
  torch.cuda.empty_cache()
  _ = gc.collect()

  print("Epoch: {:3d} | avg_train_loss: {:5.3f} | dev_uas: {:5.3f} |".format( epoch, avg_train_loss, val_uas))

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0 | avg_train_loss: 0.845 | dev_uas: 0.846 |
Epoch:   1 | avg_train_loss: 0.804 | dev_uas: 0.852 |
Epoch:   2 | avg_train_loss: 0.797 | dev_uas: 0.864 |
Epoch:   3 | avg_train_loss: 0.793 | dev_uas: 0.871 |


In [None]:
#@title BERT evaluation
# Score the model left by the training loop on the test dataloader
# (default mode of test_bert, i.e. the value returned by evaluate_bert).
test_uas_transformer = test_bert (transformer, test_dataloader_bert)
print("test_uas_transformer: {:5.3f}".format(test_uas_transformer))

test_uas_transformer: 0.874


## Error analysis
The first part of this analysis compares the move predicted by the model with the move that is actually performed once the constraints of the ArcEager model are enforced. For instance, it examines cases such as a predicted reduce on the ROOT, or a right/left arc predicted when the buffer is empty, where the parser has to fall back to a different move.

The second part of the analysis looks for patterns among the sentences that are parsed incorrectly. We aim to pinpoint any commonalities that could contribute to these errors.