In [1]:
import spacy
import pandas as pd

text_path = "../data/article_with_eof_characters.txt"
outfile = "../data/results_constituency_paths.tsv"

with open(text_path, encoding="utf-8") as f:
    content = f.read()

nlp = spacy.load("en_core_web_sm")
doc = nlp(content)

# Build one list per column, each with one entry per spaCy token.
# Attribute reference: https://spacy.io/api/token
token = [tok.text for tok in doc]
dependency = [tok.dep_ for tok in doc]
# Store the head as plain text, like every other column. Keeping the Token
# object itself would tie the DataFrame to the parsed Doc and make string
# comparisons on this column fail.
head = [tok.head.text for tok in doc]
# direct syntactic children of each token
dependent = [[t.text for t in tok.children] for tok in doc]
# every token in the subtree of each token (the token itself included)
constituent = [[t.text for t in tok.subtree] for tok in doc]

parse_info = {"token": token, "dependency": dependency,
              "head": head, "dependent": dependent,
              "constituent": constituent}

# dict of equal-length lists -> one column per key, one row per token
df = pd.DataFrame(parse_info)



In [2]:
def get_paths(path_list, node, overarching_list):
    """
    Collect the constituency-tree label path for every leaf below ``node``.

    The tree is walked depth-first. Each time a leaf (a node without
    children) is reached, the labels gathered on the way down are appended
    to ``overarching_list`` as one list; the leaf's own label (the word
    itself) is left out of that path.

    Args:
        path_list: labels collected so far on the way down to ``node``.
            Pass ``[]`` for the top-level call. ``node.label`` is appended
            to this list in place. If it is ``None`` the call does nothing.
        node: tree node exposing ``label`` and ``children``.
        overarching_list: output list; one path (a list of labels) is
            appended per leaf, in the order the children are visited.

    Returns:
        None. Results are delivered through ``overarching_list``.
    """
    # Nothing to extend when no path accumulator was supplied. Use an
    # identity check: ``== None`` can be overridden by the operand's __eq__.
    if path_list is None:
        return
    # record the label of the node we are currently at
    path_list.append(node.label)
    # a node without children is a leaf, i.e. the word itself
    if len(node.children) == 0:
        # drop the last label (the word) and store the path leading to it
        overarching_list.append(path_list[:-1])
        return
    for n in node.children:
        # every child continues from its own copy of the shared prefix so
        # that sibling branches do not write into each other's path
        get_paths(path_list.copy(), n, overarching_list)
    

In [3]:
import stanza

# one entry per leaf of every parsed sentence, filled in place by get_paths
path_labels = []

stanza_pipeline = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')

with open(text_path, encoding="utf-8") as f:
    # iterate the file object directly: lines are read one at a time instead
    # of materialising the whole file with readlines()
    for line in f:
        doc_stanza = stanza_pipeline(line)

        doc_sentences = list(doc_stanza.sentences)

        # for each sentence in the line, get the tree paths for its tokens
        for sentence in doc_sentences:
            # start from the first child of the sentence's top-level
            # constituency node, so that top node is not part of any path
            get_paths([], sentence.constituency.children[0], path_labels)

print(path_labels)




2022-02-17 21:26:08 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2022-02-17 21:26:08 INFO: Use device: cpu
2022-02-17 21:26:08 INFO: Loading: tokenize
2022-02-17 21:26:08 INFO: Loading: pos
2022-02-17 21:26:09 INFO: Loading: constituency
2022-02-17 21:26:09 INFO: Done loading processors!


[['S', 'S', 'NP', 'DT'], ['S', 'S', 'NP', 'NNP'], ['S', 'S', 'VP', 'VBZ'], ['S', 'S', 'VP', 'S', 'VP', 'TO'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'VB'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'JJ'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'TO'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'VB'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'NP', 'NP', 'JJS'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'NP', 'PP', 'IN'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'NP', 'PP', 'NP', 'DT'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'NP', 'PP', 'NP', 'VBG'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'NP', 'PP', 'NP', 'NN'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'NP', 'PP', 'NP', 'NNS'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'PP', 'IN'], ['S', 'S', 'VP', 'S', 'VP', 'VP', 'ADJP', 'S', 'VP', 'VP', 'PP', 'NP', 'NP', 'DT'], ['S', 'S', 'VP', 'S', '

In [4]:
# exclude end of line characters; .copy() makes the filtered frame independent
# so the column assignment below does not write into a slice of the original
# (which triggers pandas' SettingWithCopyWarning)
df = df[df["token"] != "\n"].copy()

# the paths are matched to the rows purely by position, so the two
# tokenisations must have produced the same number of items
if len(path_labels) != len(df):
    raise ValueError(
        "cannot align constituency paths with tokens: "
        f"{len(path_labels)} paths vs {len(df)} tokens"
    )

# add new column to dataframe
df["paths"] = path_labels

df

Unnamed: 0,token,dependency,head,dependent,constituent,paths
0,The,det,Cabinet,[],[The],"[S, S, NP, DT]"
1,Cabinet,nsubj,hopes,[The],"[The, Cabinet]","[S, S, NP, NNP]"
2,hopes,ccomp,said,"[Cabinet, be]","[The, Cabinet, hopes, to, be, able, to, abolis...","[S, S, VP, VBZ]"
3,to,aux,be,[],[to],"[S, S, VP, S, VP, TO]"
4,be,xcomp,hopes,"[to, able, including]","[to, be, able, to, abolish, most, of, the, rem...","[S, S, VP, S, VP, VP, VB]"
...,...,...,...,...,...,...
647,in,prep,withdrawn,[Belgium],"[in, Belgium]","[S, VP, SBAR, S, VP, VP, VP, PP, IN]"
648,Belgium,pobj,in,[],[Belgium],"[S, VP, SBAR, S, VP, VP, VP, PP, NP, NNP]"
649,next,amod,Friday,[],[next],"[S, VP, SBAR, S, VP, VP, VP, NP, JJ]"
650,Friday,npadvmod,withdrawn,[next],"[next, Friday]","[S, VP, SBAR, S, VP, VP, VP, NP, NNP]"
