In [27]:
import copy

import torch
import torch.nn as nn

import numpy as np
from datasets import load_dataset, get_dataset_config_names

In [28]:
# Universal Dependencies on the Hugging Face hub; only the English EWT
# treebank configuration is loaded below.
name = "universal_dependencies"
ud_config = get_dataset_config_names(name)  # every available treebank config

# Load the three standard splits in train / validation / test order.
ud_ewt_train, ud_ewt_dev, ud_ewt_test = [
    load_dataset(name, 'en_ewt', split=split_name)
    for split_name in ("train", "validation", "test")
]

In [29]:
# ud_ewt_train[34] is a good default example sentence.
# Only the last expression of a cell is rendered, so the field lookups below
# act as a commented cheat-sheet of the fields of interest; the final line
# displays the whole record.
ud_ewt_train[34]['tokens']
ud_ewt_train[34]['upos'] #Part of speech tag as an integer index of names=["NOUN","PUNCT","ADP","NUM","SYM","SCONJ","ADJ","PART","DET","CCONJ","PROPN","PRON","X","_","ADV","INTJ","VERB","AUX"]
ud_ewt_train[34]['xpos'] #Other POS tag (string tags such as 'DT', 'NN'), might be more accurate?
ud_ewt_train[34]['deprel'] #arc labels by themselves. https://universaldependencies.org/docs/u/dep/index.html
ud_ewt_train[34]['deps'] #arc labels and the index they interact with. NOTE: index starts at one, ROOT is assumed
ud_ewt_train[34]

{'idx': 'weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0035',
 'text': 'The situation in Iraq is only going to get better this way.',
 'tokens': ['The',
  'situation',
  'in',
  'Iraq',
  'is',
  'only',
  'going',
  'to',
  'get',
  'better',
  'this',
  'way',
  '.'],
 'lemmas': ['the',
  'situation',
  'in',
  'Iraq',
  'be',
  'only',
  'go',
  'to',
  'get',
  'better',
  'this',
  'way',
  '.'],
 'upos': [8, 0, 2, 10, 17, 14, 16, 7, 16, 6, 8, 0, 1],
 'xpos': ['DT',
  'NN',
  'IN',
  'NNP',
  'VBZ',
  'RB',
  'VBG',
  'TO',
  'VB',
  'JJR',
  'DT',
  'NN',
  '.'],
 'feats': ["{'Definite': 'Def', 'PronType': 'Art'}",
  "{'Number': 'Sing'}",
  'None',
  "{'Number': 'Sing'}",
  "{'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}",
  'None',
  "{'Tense': 'Pres', 'VerbForm': 'Part'}",
  'None',
  "{'VerbForm': 'Inf'}",
  "{'Degree': 'Cmp'}",
  "{'Number': 'Sing', 'PronType': 'Dem'}",
  "{'Number': 'Sing'}",
  'None'],
 'head': ['2', '

In [30]:
# Test sentence from the example in Chen and Manning.
# 'text' must agree with 'tokens': the example sentence has five tokens and no
# "really", so the raw text is kept in sync with the annotations below.
test_sent = {'text': 'He has good control.',
             'tokens': ['He', 'has', 'good', 'control', '.'],
             'upos': [11, 16, 6, 0, 1], #probably need to use xpos
             # One entry per token: (arc label, 1-based head index); head 0 is ROOT.
             'deps': [[('nsubj', 2)], [('root', 0)], [('amod', 4)], [('dobj', 2)], [('punct', 2)]]
             }

# Initial parser configuration: ROOT alone on the stack, no arcs yet.
stack = ["[ROOT]"]
# NOTE(review): buffer is a deep copy of the whole sentence dict, not a list of
# tokens; shift() calls buffer.pop(0), which needs a list -- confirm the
# intended buffer representation before wiring the transitions together.
buffer = copy.deepcopy(test_sent)
arcs = []

In [31]:
def left_arc(stack, buffer, arcs):
    """LEFT-ARC(l) transition (Chen and Manning) -- currently a stub.

    Intended behaviour: add an arc s1 -> s2 with label l and remove s2 from
    the stack. Precondition: |s| >= 2.

    The arc logic itself is not implemented yet: when the precondition holds
    the three arguments are handed back untouched as a tuple
    ``(stack, buffer, arcs)``. When it does not hold, a diagnostic is printed
    and ``None`` is returned.
    """
    if len(stack) < 2:
        # Precondition violated: report it and fall through with None.
        print("[@] LEFT-ARC called incorrectly, check stack size")
        print("Stack: ", stack)
        return None

    # TODO: Left arc logic
    return stack, buffer, arcs

In [32]:
def right_arc(stack, buffer, arcs):
    """RIGHT-ARC(l) transition (Chen and Manning) -- currently a stub.

    Intended behaviour: add an arc s2 -> s1 with label l and remove s1 from
    the stack. Precondition: |s| >= 2.

    The arc logic itself is not implemented yet: when the precondition holds
    the three arguments are handed back untouched as a tuple
    ``(stack, buffer, arcs)``. When it does not hold, a diagnostic is printed
    and ``None`` is returned.
    """
    if len(stack) < 2:
        # Precondition violated: report it and fall through with None.
        print("[@] RIGHT-ARC called incorrectly, check stack size")
        print("Stack: ", stack)
        return None

    # TODO: right arc logic
    return stack, buffer, arcs

In [33]:
# SHIFT: moves b1 from the buffer to the stack. Precondition: |b| ≥ 1. from Chen and Manning
def shift(stack, buffer, arcs):
    """SHIFT transition: move the first buffer item onto the top of the stack.

    The only precondition is a non-empty buffer. The stack-size requirement
    (|s| >= 2) belongs to the arc transitions; applying it here would make
    the very first shift from the initial ``["[ROOT]"]`` stack impossible.

    Args:
        stack: list used as the parser stack (top is the last element);
            mutated in place.
        buffer: list of items still to be processed (front is index 0);
            mutated in place.
        arcs: arcs collected so far; passed through unchanged.

    Returns:
        The tuple ``(stack, buffer, arcs)`` after the move, or ``None`` (after
        printing a diagnostic) when the buffer is empty.
    """
    if len(buffer) >= 1:
        # Lists have no .add(); append pushes onto the top of the stack.
        stack.append(buffer.pop(0))
        return (stack, buffer, arcs)
    print("[@] SHIFT called incorrectly, check stack and buffer sizes")
    print("Stack: ", stack)
    print("Buffer: ", buffer)