In [140]:
import copy
import ast

import torch
import torch.nn as nn

import numpy as np
from datasets import load_dataset, get_dataset_config_names

In [40]:
# Universal Dependencies on the Hugging Face hub; 'en_ewt' is the config loaded below.
name = "universal_dependencies"
ud_config = get_dataset_config_names(name)  # every available treebank config name
# Load the three standard splits of the same config in one pass.
ud_ewt_train, ud_ewt_dev, ud_ewt_test = (
    load_dataset(name, 'en_ewt', split=split_name)
    for split_name in ("train", "validation", "test")
)

In [41]:
#ud_ewt_train[34] is a good default example sentence
#Only the last expression of the cell is displayed; the lines above it just document the available fields.
ud_ewt_train[34]['tokens']
ud_ewt_train[34]['upos'] #Part of speech tag as an integer index of ["NOUN","PUNCT","ADP","NUM","SYM","SCONJ","ADJ","PART","DET","CCONJ","PROPN","PRON","X","_","ADV","INTJ","VERB","AUX"]
ud_ewt_train[34]['xpos'] #Other POS tag (string tags such as 'DT', 'NN'), might be more accurate?
ud_ewt_train[34]['deprel'] #arc labels by themselves. https://universaldependencies.org/docs/u/dep/index.html
ud_ewt_train[34]['deps'] #arc labels and the index they interact with. NOTE: index starts at one, ROOT is assumed
ud_ewt_train[34]

{'idx': 'weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0035',
 'text': 'The situation in Iraq is only going to get better this way.',
 'tokens': ['The',
  'situation',
  'in',
  'Iraq',
  'is',
  'only',
  'going',
  'to',
  'get',
  'better',
  'this',
  'way',
  '.'],
 'lemmas': ['the',
  'situation',
  'in',
  'Iraq',
  'be',
  'only',
  'go',
  'to',
  'get',
  'better',
  'this',
  'way',
  '.'],
 'upos': [8, 0, 2, 10, 17, 14, 16, 7, 16, 6, 8, 0, 1],
 'xpos': ['DT',
  'NN',
  'IN',
  'NNP',
  'VBZ',
  'RB',
  'VBG',
  'TO',
  'VB',
  'JJR',
  'DT',
  'NN',
  '.'],
 'feats': ["{'Definite': 'Def', 'PronType': 'Art'}",
  "{'Number': 'Sing'}",
  'None',
  "{'Number': 'Sing'}",
  "{'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}",
  'None',
  "{'Tense': 'Pres', 'VerbForm': 'Part'}",
  'None',
  "{'VerbForm': 'Inf'}",
  "{'Degree': 'Cmp'}",
  "{'Number': 'Sing', 'PronType': 'Dem'}",
  "{'Number': 'Sing'}",
  'None'],
 'head': ['2', '

In [156]:
#test sentence from the example in Chen and Manning
#'text' must agree with 'tokens': every per-token list below has exactly one entry per token.
test_sent = {'text': 'He has good control.',
             'tokens': ['He', 'has', 'good', 'control', '.'],
             'upos': [11, 16, 6, 0, 1], #probably need to use xpos,
             'xpos': ['PRP', 'VBZ', 'JJ', 'NN', '.'],
             'deprel': ['nsubj', 'root', 'amod', 'dobj', 'punct'],
             'deps': ["[('nsubj', 2)]", "[('root', 0)]", "[('amod', 4)]", "[('dobj', 2)]", "[('punct', 2)]"] #stored as strings to mirror the dataset rows; head index is 1-based, 0 is ROOT
             }

In [105]:
#LEFT-ARC(l): adds an arc s1 → s2 with label l and removes s2 from the stack. Precondition: |s| ≥ 2. from Chen and Manning
def left_arc(stack, buffer, arcs, dep):
    """Apply LEFT-ARC in place: make the second stack item a dependent of the top item.

    Args:
        stack: list of items, top of stack last. Items are either plain words
            or (word, POS) tuples; ROOT is "[ROOT]" or a tuple whose first
            element is "[ROOT]".
        buffer: remaining input; only printed, never modified here.
        arcs: list that receives ``(dep, (head, dependent))``.
        dep: label of the new arc.

    Returns:
        None. On a violated precondition (fewer than two stack items, or ROOT
        would become a dependent) a message is printed and nothing is changed.
        The resulting configuration is always printed.
    """
    if len(stack) < 2:
        print("[@] LEFT-ARC called incorrectly, check stack size")
    else:
        below = stack[-2]
        # ROOT may be the bare string or the (word, POS) form used by training_oracle.
        below_is_root = below == "[ROOT]" or (
            isinstance(below, tuple) and len(below) > 0 and below[0] == "[ROOT]"
        )
        if below_is_root:
            print("[@] LEFT-ARC called incorrectly, tried to add depepndency to ROOT")
        else:
            dependent = stack.pop(-2)
            # After the pop, stack[-1] is still the original top item, i.e. the head.
            arcs.append((dep, (stack[-1], dependent)))
    print("Stack: ", stack, end=" | ")
    print("Buffer: ", buffer, end=" | ")
    print("Arcs:", arcs)

In [106]:
#RIGHT-ARC(l): adds an arc s2 → s1 with label l and removes s1 from the stack. Precondition: |s| ≥ 2. from Chen and Manning
def right_arc(stack, buffer, arcs, dep):
    """Apply RIGHT-ARC in place: attach the top stack item to the item below it.

    The top item is popped and ``(dep, (head, dependent))`` is appended to
    ``arcs``. With fewer than two stack items a message is printed and nothing
    is changed. The resulting configuration is printed in both cases.
    """
    if len(stack) >= 2:
        dependent = stack.pop()
        head = stack[-1]  # the former second item is now on top
        arcs.append((dep, (head, dependent)))
    else:
        print("[@] RIGHT-ARC called incorrectly, check stack size")
    print("Stack: ", stack, end=" | ")
    print("Buffer: ", buffer, end=" | ")
    print("Arcs:", arcs)
    

In [107]:
# SHIFT: moves b1 from the buffer to the stack. Precondition: |b| ≥ 1. from Chen and Manning
def shift(stack, buffer, arcs):
    """Apply SHIFT in place: move the first buffer item onto the top of the stack.

    With an empty buffer a message is printed and nothing is changed. ``arcs``
    is only printed. The resulting configuration is printed in both cases.
    """
    if len(buffer) >= 1:
        front = buffer.pop(0)
        stack.append(front)
    else:
        print("[@] SHIFT called incorrectly, check buffer size")
    print("Stack: ", stack, end=" | ")
    print("Buffer: ", buffer, end=" | ")
    print("Arcs:", arcs)

In [108]:
#Goal is to predict the correct transition at each step, i.e. predict every dependency and the moment it is attached
#Hand-written sequence of "correct" transitions for "He has good control."
test_stack = ["[ROOT]"]
test_buffer = copy.deepcopy(test_sent['tokens'])
#TODO: carry the xpos tag next to each token in the buffer in the future
test_arcs = []
#Each step is (transition function, extra positional arguments); only the arc transitions take a label.
gold_steps = [
    (shift, ()),
    (shift, ()),
    (left_arc, ('nsubj',)),
    (shift, ()),
    (shift, ()),
    (left_arc, ('amod',)),
    (right_arc, ('dobj',)),
    (shift, ()),
    (right_arc, ('punct',)),
    (right_arc, ('root',)),
]
for transition, label_args in gold_steps:
    transition(test_stack, test_buffer, test_arcs, *label_args)

Stack:  ['[ROOT]', 'He'] | Buffer:  ['has', 'good', 'control', '.'] | Arcs: []
Stack:  ['[ROOT]', 'He', 'has'] | Buffer:  ['good', 'control', '.'] | Arcs: []
Stack:  ['[ROOT]', 'has'] | Buffer:  ['good', 'control', '.'] | Arcs: [('nsubj', ('has', 'He'))]
Stack:  ['[ROOT]', 'has', 'good'] | Buffer:  ['control', '.'] | Arcs: [('nsubj', ('has', 'He'))]
Stack:  ['[ROOT]', 'has', 'good', 'control'] | Buffer:  ['.'] | Arcs: [('nsubj', ('has', 'He'))]
Stack:  ['[ROOT]', 'has', 'control'] | Buffer:  ['.'] | Arcs: [('nsubj', ('has', 'He')), ('amod', ('control', 'good'))]
Stack:  ['[ROOT]', 'has'] | Buffer:  ['.'] | Arcs: [('nsubj', ('has', 'He')), ('amod', ('control', 'good')), ('dobj', ('has', 'control'))]
Stack:  ['[ROOT]', 'has', '.'] | Buffer:  [] | Arcs: [('nsubj', ('has', 'He')), ('amod', ('control', 'good')), ('dobj', ('has', 'control'))]
Stack:  ['[ROOT]', 'has'] | Buffer:  [] | Arcs: [('nsubj', ('has', 'He')), ('amod', ('control', 'good')), ('dobj', ('has', 'control')), ('punct', ('has

In [109]:
# Display the arcs collected by the hand-written transition sequence, as (label, (head, dependent)) tuples.
test_arcs

[('nsubj', ('has', 'He')),
 ('amod', ('control', 'good')),
 ('dobj', ('has', 'control')),
 ('punct', ('has', '.')),
 ('root', ('[ROOT]', 'has'))]

In [149]:
#Rebuild the gold arcs from the UD-style 'deps' strings so they can be compared with test_arcs
# ["[('nsubj', 2)]", "[('root', 0)]", "[('amod', 4)]", "[('dobj', 2)]", "[('punct', 2)]"]
root_and_tokens = ["[ROOT]"] + test_sent['tokens']  # head indices are 1-based, 0 is ROOT
valid_arcs = []
for child_word, dep_str in zip(test_sent['tokens'], test_sent['deps']):
    arc_label, head_idx = ast.literal_eval(dep_str)[0]  # each entry is a stringified [(label, head)]
    valid_arcs.append((arc_label, (root_and_tokens[head_idx], child_word)))
valid_arcs

[('nsubj', ('has', 'He')),
 ('root', ('[ROOT]', 'has')),
 ('amod', ('control', 'good')),
 ('dobj', ('has', 'control')),
 ('punct', ('has', '.'))]

In [179]:
test_sent2 = ud_ewt_train[34]
#Prepend the ROOT placeholders so that the 1-based head index can be used directly (0 is ROOT)
words_with_root = ["[ROOT]"] + test_sent2['tokens']
tags_with_null = ["NULL"] + test_sent2['xpos']
valid_arcs2 = []
for position, dep_str in enumerate(test_sent2['deps']):
    head_idx = ast.literal_eval(dep_str)[0][1]
    parent = (words_with_root[head_idx], tags_with_null[head_idx])
    child = (test_sent2['tokens'][position], test_sent2['xpos'][position])
    #use deprel for the arc name. ignoring enhanced dependencies for now
    valid_arcs2.append((test_sent2['deprel'][position], (parent, child)))
valid_arcs2

[('det', (('situation', 'NN'), ('The', 'DT'))),
 ('nsubj', (('going', 'VBG'), ('situation', 'NN'))),
 ('case', (('Iraq', 'NNP'), ('in', 'IN'))),
 ('nmod', (('situation', 'NN'), ('Iraq', 'NNP'))),
 ('aux', (('going', 'VBG'), ('is', 'VBZ'))),
 ('advmod', (('going', 'VBG'), ('only', 'RB'))),
 ('root', (('[ROOT]', '[NULL]'), ('going', 'VBG'))),
 ('mark', (('get', 'VB'), ('to', 'TO'))),
 ('xcomp', (('going', 'VBG'), ('get', 'VB'))),
 ('xcomp', (('get', 'VB'), ('better', 'JJR'))),
 ('det', (('way', 'NN'), ('this', 'DT'))),
 ('obj', (('better', 'JJR'), ('way', 'NN'))),
 ('punct', (('going', 'VBG'), ('.', '.')))]

In [111]:
# Every arc produced by the hand-written transitions must also be a gold arc.
all(arc in valid_arcs for arc in test_arcs)

True

In [None]:
#Stack/Buffer: lists of (word, POS) tuples
#Arcs: list of tuples, each an arc label plus a tuple of (word, POS) tuples in parent, child order
#[ ( arc-label, ( (parent-word, parent-POS), (child-word, child-POS) ) ), ... ]

In [192]:
#To generate training data, the gold arcs have to be turned back into the transitions which produce them
#The "training oracle" as described in slp3 decides which transition to do at every step and returns the list of transitions
#sentence is a UD format sentence
#Stack/Buffer format: list of (word, POS) tuples
#Arcs format: [ ( arc-label, ( (parent-word, parent-POS), (child-word, child-POS) ) ), ... ]
def training_oracle(sentence):
    """Return the arc-standard transition sequence that rebuilds the gold tree of ``sentence``.

    Args:
        sentence: mapping with the keys 'tokens', 'xpos', 'deprel' and 'deps'.
            Each 'deps' entry is a string such as "[('nsubj', 2)]"; the head
            index of its first pair is used (1-based, 0 is ROOT). The arc
            label is taken from 'deprel'.

    Returns:
        List of strings: "shift", "left-arc <label>" or "right-arc <label>".
        For a tree this oracle can fully derive, the list has
        2 * len(tokens) entries. If no valid transition is left before the
        tree is complete (the tree may be non-projective), a message is
        printed and the transitions found so far are returned, so a shorter
        list signals an unparsed sentence.

    Side effects:
        Prints the gold (parent, child) pairs, then a final consistency check;
        ``shift``/``left_arc``/``right_arc`` print the configuration after
        every transition.
    """
    tokens = sentence['tokens']
    tags = sentence['xpos']
    root = ("[ROOT]", "NULL")
    # nodes[k] is the (word, POS) pair of token k; position 0 is the artificial ROOT.
    nodes = [root] + [(tokens[k], tags[k]) for k in range(len(tokens))]

    # Tokens are identified by position, not by (word, POS) value, so repeated
    # words in one sentence cannot be mistaken for each other.
    head_of = {}                 # token position -> head position
    label_of = {}                # token position -> arc label
    pending = [0] * len(nodes)   # head position -> dependents not attached yet
    gold_arcs = []
    for child, dep_str in enumerate(sentence['deps'], start=1):
        head = ast.literal_eval(dep_str)[0][1]
        head_of[child] = head
        label_of[child] = sentence['deprel'][child - 1]
        pending[head] += 1
        gold_arcs.append((label_of[child], (nodes[head], nodes[child])))
    print([pair for _, pair in gold_arcs])

    stack = [root]
    buffer = nodes[1:]
    arcs = []
    transitions = [] #what we return
    stack_ids = [0]  # positions of the items on `stack`, kept in lockstep with it
    next_id = 1      # position of the token currently at the front of `buffer`

    while buffer or len(stack_ids) > 1:
        if len(stack_ids) >= 2:
            top, below = stack_ids[-1], stack_ids[-2]
            if head_of.get(below) == top:  # ROOT (position 0) has no head, so it never matches
                transitions.append("left-arc " + label_of[below])
                left_arc(stack, buffer, arcs, label_of[below])
                stack_ids.pop(-2)
                pending[top] -= 1
                continue
            # All dependents of the top item must be attached before it is
            # removed by a right arc, otherwise they could never be attached.
            if head_of.get(top) == below and pending[top] == 0:
                transitions.append("right-arc " + label_of[top])
                right_arc(stack, buffer, arcs, label_of[top])
                stack_ids.pop()
                pending[below] -= 1
                continue
        if not buffer:
            # Neither arc applies and nothing is left to shift: stop instead of
            # recording shifts that cannot be executed.
            print("[@] ORACLE stopped early, no valid transition (tree may be non-projective)")
            break
        transitions.append("shift")
        shift(stack, buffer, arcs)
        stack_ids.append(next_id)
        next_id += 1

    print("All generated arcs are in original deps list:", all(arc in gold_arcs for arc in arcs))
    return transitions

In [193]:
# Run the oracle on the hand-built example; the returned transition list is displayed as the cell output.
training_oracle(test_sent)

[(('has', 'VBZ'), ('He', 'PRP')), (('[ROOT]', 'NULL'), ('has', 'VBZ')), (('control', 'NN'), ('good', 'JJ')), (('has', 'VBZ'), ('control', 'NN')), (('has', 'VBZ'), ('.', '.'))]
Stack:  [('[ROOT]', 'NULL'), ('He', 'PRP')] | Buffer:  [('has', 'VBZ'), ('good', 'JJ'), ('control', 'NN'), ('.', '.')] | Arcs: []
Stack:  [('[ROOT]', 'NULL'), ('He', 'PRP'), ('has', 'VBZ')] | Buffer:  [('good', 'JJ'), ('control', 'NN'), ('.', '.')] | Arcs: []
Stack:  [('[ROOT]', 'NULL'), ('has', 'VBZ')] | Buffer:  [('good', 'JJ'), ('control', 'NN'), ('.', '.')] | Arcs: [('nsubj', (('has', 'VBZ'), ('He', 'PRP')))]
Stack:  [('[ROOT]', 'NULL'), ('has', 'VBZ'), ('good', 'JJ')] | Buffer:  [('control', 'NN'), ('.', '.')] | Arcs: [('nsubj', (('has', 'VBZ'), ('He', 'PRP')))]
Stack:  [('[ROOT]', 'NULL'), ('has', 'VBZ'), ('good', 'JJ'), ('control', 'NN')] | Buffer:  [('.', '.')] | Arcs: [('nsubj', (('has', 'VBZ'), ('He', 'PRP')))]
Stack:  [('[ROOT]', 'NULL'), ('has', 'VBZ'), ('control', 'NN')] | Buffer:  [('.', '.')] | Arc

['shift',
 'shift',
 'left-arc nsubj',
 'shift',
 'shift',
 'left-arc amod',
 'right-arc dobj',
 'shift',
 'right-arc punct',
 'right-arc root']

In [194]:
# Run the oracle on sentence 34 of the en_ewt training split.
training_oracle(ud_ewt_train[34])

[(('situation', 'NN'), ('The', 'DT')), (('going', 'VBG'), ('situation', 'NN')), (('Iraq', 'NNP'), ('in', 'IN')), (('situation', 'NN'), ('Iraq', 'NNP')), (('going', 'VBG'), ('is', 'VBZ')), (('going', 'VBG'), ('only', 'RB')), (('[ROOT]', 'NULL'), ('going', 'VBG')), (('get', 'VB'), ('to', 'TO')), (('going', 'VBG'), ('get', 'VB')), (('get', 'VB'), ('better', 'JJR')), (('way', 'NN'), ('this', 'DT')), (('better', 'JJR'), ('way', 'NN')), (('going', 'VBG'), ('.', '.'))]
Stack:  [('[ROOT]', 'NULL'), ('The', 'DT')] | Buffer:  [('situation', 'NN'), ('in', 'IN'), ('Iraq', 'NNP'), ('is', 'VBZ'), ('only', 'RB'), ('going', 'VBG'), ('to', 'TO'), ('get', 'VB'), ('better', 'JJR'), ('this', 'DT'), ('way', 'NN'), ('.', '.')] | Arcs: []
Stack:  [('[ROOT]', 'NULL'), ('The', 'DT'), ('situation', 'NN')] | Buffer:  [('in', 'IN'), ('Iraq', 'NNP'), ('is', 'VBZ'), ('only', 'RB'), ('going', 'VBG'), ('to', 'TO'), ('get', 'VB'), ('better', 'JJR'), ('this', 'DT'), ('way', 'NN'), ('.', '.')] | Arcs: []
Stack:  [('[ROO

['shift',
 'shift',
 'left-arc det',
 'shift',
 'shift',
 'left-arc case',
 'right-arc nmod',
 'shift',
 'shift',
 'shift',
 'left-arc advmod',
 'left-arc aux',
 'left-arc nsubj',
 'shift',
 'shift',
 'left-arc mark',
 'shift',
 'shift',
 'shift',
 'left-arc det',
 'right-arc obj',
 'right-arc xcomp',
 'right-arc xcomp',
 'shift',
 'right-arc punct',
 'right-arc root']

In [195]:
#Featurization. We use the element sets S_w, S_t and S_l described in Section 3.1 of Chen and Manning, which can be combined to create features
def featurize_configuration(stack, buffer, arcs):
    S_w = {'s1' : None, 's2': None, 's3': None, 'b1': None, 'b2': None, 'b3': None,
           'lc1s1' : None, 'lc2s1' : None, 'lc1s2' : None, 'lc2s2' : None, 'rc1s1' : None, 'rc2s1' : None, 'rc1s2' : None, 'rc2s2' : None, #lc1s1 is leftmost child of s1, lc2s1 is second leftmost
           'lc1lc1s1': None, 'lc1lc1s2': None, 'rc1rc1s1': None, 'rc1rc1s2': None} #lc1lc1 is left most of leftmost children
    S_t = {'s1' : None, 's2': None, 's3': None, 'b1': None, 'b2': None, 'b3': None,
           'lc1s1' : None, 'lc2s1' : None, 'lc1s2' : None, 'lc2s2' : None, 'rc1s1' : None, 'rc2s1' : None, 'rc1s2' : None, 'rc2s2' : None,
           'lc1lc1s1': None, 'lc1lc1s2': None, 'rc1rc1s1': None, 'rc1rc1s2': None}
    S_l = {'lc1s1' : None, 'lc2s1' : None, 'lc1s2' : None, 'lc2s2' : None, 'rc1s1' : None, 'rc2s1' : None, 'rc1s2' : None, 'rc2s2' : None,
           'lc1lc1s1': None, 'lc1lc1s2': None, 'rc1rc1s1': None, 'rc1rc1s2': None}
    if len(stack) >= 1:
        S_w['s1'] = stack[-1]
        S_t['s1'] = stack[-1] #TODO: Need to determine how I'm inputting POS. If XPOS is in tuple in the stack/buffer, need to modify training oracle
        if len(stack) >= 2:
            if len(stack) >= 3:
    


SyntaxError: incomplete input (621502281.py, line 16)