In [1]:
import numpy as np


In [6]:
def process_data(dataname):
    #reads in files, produces data structure with all actions
        #does so by applying produce_rule_list to every sentence.
        #for loop that sets actions to empty, calls p_r_l giving it
        #the stack and buffer, actions and correct_parse, adds finished action list
        #to new data file, for each sentence in the input data
    #input: name of the data file with all parses. Run with data file in same directory.
    #output: data file with all actions
    file = open(dataname)
    data = file.read()
    correct_parses = correct_parse_list(data)
    #gets rid of final whitespace
    del correct_parses[len(correct_parses)-1]
    
    #iterates over all parses, producing action list for each
    complete_rule_list = []
    for sentence_parse in correct_parses:
        stack = []
#         print(len(sentence_parse))
        buff = list(range(1,len(sentence_parse)+1))
        actions = []
        rule_list = produce_rule_list(stack, buff, actions, sentence_parse)
        complete_rule_list.append(rule_list)

    
    return complete_rule_list

def correct_parse_list(data):
    #Turns data into a list of lists of lists with relevant information
    correct_parse = data.split("\n\n")
    for index, paragraph in enumerate(correct_parse):
        correct_parse[index] = paragraph.split("\n")
    for paragraph in correct_parse:
        for index, line in enumerate(paragraph):
            paragraph[index] = line.split("\t")
    return correct_parse



In [7]:
def produce_rule_list(stack, buff, actions, sentence_parse):
    #recursive function that works through words in the sentence (stack/buffer)
        #until only one word is left, creating the list of actions 
        #that was taken to parse it.
    #input: stack, buffer, actions, correct parse
    #output: actions with the actions taken for each buff/stack configuration
    
    #base case
    if len(stack) == 1 and len(buff) == 0:
        #actions.append([stack[:], "empty", "R_arc"])
        actions.append([stack[0],-1, -1, "R_root"])
        return actions

    #If enough of the sentence is still left:
    #If there is not enough material in the stack, shift:
    if len(stack) == 0 :
        #print('chose S - small stack')
        actions.append([-1,-1,buff[0], "S"])
        stack.append(buff[0])
        del buff[0]        
        return produce_rule_list(stack,buff,actions,sentence_parse)
    if len(stack) == 1:
        actions.append([-1,stack[-1],buff[0], "S"])
        stack.append(buff[0])
        del buff[0]
        return produce_rule_list(stack,buff,actions,sentence_parse)
    #If there are 2 or more words in the stack, decide which action to perform and perform it
    if len(stack) > 1:
        action = rule_decision(stack,buff,sentence_parse)
        stack, buff, actions = action(stack,buff,actions, sentence_parse)
        return produce_rule_list(stack,buff,actions,sentence_parse)
    

def rule_decision(stack, buff, sentence_parse):
    #determines which action to apply
    #input: words on stack, words on buff, correct parse
    #output: one of three methods, Shift(), L_arc(), R_arc()

    #TODO: find ids/heads (index [6]) from stack and sentence_parse
    s1 = stack[-2]
    head_of_s1 = int(sentence_parse[s1-1][6])
    s2 = stack[-1]
    head_of_s2 = int(sentence_parse[s2-1][6])
    
    #L arcs can always be applied if possible
    if head_of_s1 == s2:
        action = L_arc
        #print('chose L')
    else:
        #R arcs can only be applied if there is no word in the buffer which has the last word in the stack as a head
        if head_of_s2 == s1:
            buff_heads = [int(sentence_parse[x-1][6]) for x in buff]
            if s2 in buff_heads:
                action = Shift
                #print('chose S - s2 in buffheads')
            else:
                action = R_arc
                #print('chose R')
        #if there is no match between s1 and s2, simply shift another word from the buffer
        else:
            action = Shift
            #print('chose S - no matching s1s2')

    return action

#The following methods perform an arc or shift. These can be changed if more data is needed in the network.

def L_arc(stack, buff, actions, sentence_parse):
    #removes second to last item from stack, writes action to actions
    #input: stack and actions
    #output: new stack and actions with one L_arc line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    relation = sentence_parse[s1-1][7]
    #actions.append([stack[:], buff[:], "L_arc"])
    actions.append([s1,s2,b1, "L"+"_"+relation])
    del stack[-2]
    return stack, buff, actions



def R_arc(stack, buff, actions, sentence_parse):
    #removes last item from the stack, writes action to actions
    #input: stack and actions
    #output: new stack and actions with one R_arc line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    relation = sentence_parse[s2-1][7]
    #actions.append([stack[:], buff[:], "R_arc"])
    actions.append([s1,s2,b1, "R"+"_"+relation])
    del stack[-1]
    return stack, buff, actions



def Shift(stack, buff, actions, sentence_parse):
    #moves an item from the buff to the stack, writes action to actions
    #input: stack, buff and actions
    #output: new stack and actions with one extra shift line
    #s1, s2, b1, action
    s1 = int(stack[-2])
    s2 = int(stack[-1])
    b1 = int(stack[0])
    #actions.append([stack[:], buff[:], "Shift"])
    actions.append([s1,s2,b1, "S"])
    stack.append(buff[0])
    del buff[0]
    return stack, buff, actions

In [8]:

action_data = process_data('data/train-stanford-raw.conll')

In [9]:
for action in action_data[-1]:
    print(action)

[-1, -1, 1, 'S']
[-1, 1, 2, 'S']
[1, 2, 1, 'S']
[2, 3, 1, 'L_aux']
[1, 3, 1, 'L_nsubj']
[-1, 3, 4, 'S']
[3, 4, 3, 'R_iobj']
[-1, 3, 5, 'S']
[3, 5, 3, 'S']
[5, 6, 3, 'L_det']
[3, 6, 3, 'S']
[6, 7, 3, 'S']
[7, 8, 3, 'L_aux']
[6, 8, 3, 'S']
[8, 9, 3, 'S']
[9, 10, 3, 'L_det']
[8, 10, 3, 'R_dobj']
[6, 8, 3, 'S']
[8, 11, 3, 'R_cc']
[6, 8, 3, 'S']
[8, 12, 3, 'S']
[12, 13, 3, 'L_advmod']
[8, 13, 3, 'S']
[13, 14, 3, 'S']
[14, 15, 3, 'S']
[15, 16, 3, 'L_amod']
[14, 16, 3, 'L_det']
[13, 16, 3, 'R_dobj']
[8, 13, 3, 'R_conj']
[6, 8, 3, 'R_infmod']
[3, 6, 3, 'R_dobj']
[-1, 3, 17, 'S']
[3, 17, 3, 'R_punct']
[3, -1, -1, 'R_root']


In [10]:
print(len(action_data))

39832
