In [8]:
#import all libraries needed for annotation
import string as st
import stanza
import pandas as pd
from nltk.tree import *
from stanza.models.constituency.tree_reader import read_trees

In [10]:
# Build the English Stanza pipeline once; it is reused for every sentence below.
# Processors run in the listed order (tokenizer, multi-word tokens, POS tags,
# constituency parser) and the GPU is explicitly disabled.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize, mwt, pos, constituency',
    use_gpu=False,
)

2024-12-06 16:16:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-12-06 16:16:17 INFO: Downloaded file to /Users/test/stanza_resources/resources.json
2024-12-06 16:16:17 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2024-12-06 16:16:17 INFO: Using device: cpu
2024-12-06 16:16:17 INFO: Loading: tokenize
2024-12-06 16:16:17 INFO: Loading: mwt
2024-12-06 16:16:17 INFO: Loading: pos
2024-12-06 16:16:17 INFO: Loading: constituency
2024-12-06 16:16:17 INFO: Done loading processors!


In [22]:
def syntactic_annotation(tree, words):
    '''
    Annotate every word of a parsed sentence with the chain of constituents above it.

    Parameters
    ----------
    tree : constituency tree object or str
        Constituency parse as produced by stanza (``doc.sentences[j].constituency``).
        Only the ``.label`` and ``.children`` attributes of its nodes are used.
        A bracketed string is accepted too: it is converted with ``read_trees``
        and its first tree is used.
    words : sequence of str, or of objects exposing ``.text``
        The words of the sentence, e.g. ``doc.sentences[j].words`` or a plain
        list of strings. Must have one entry per leaf of ``tree``; otherwise
        pandas raises ``ValueError`` when the column is inserted.

    Returns
    -------
    pandas.DataFrame
        One row per leaf, in left-to-right order. Column ``'words'`` holds the
        word text; the integer columns ``0, 1, 2, ...`` hold, for each depth
        below the root, an identifier ``LABEL/PATH`` where PATH is the
        concatenation of the child indices leading from the root to that node
        (e.g. ``S/0``, ``NP/00``, ``NNP/000``, ``Alice/0000``). The last filled
        cell of a row is the leaf itself; shallower rows are padded with
        missing values. The root label is not included.

    Example
    -------
    doc = nlp(sentence)                    # sentence is a string
    tree = doc.sentences[j].constituency   # j is the index of the sentence to analyse
    words = doc.sentences[j].words
    df = syntactic_annotation(tree, words)
    '''

    #accept a bracketed string as well as an already-built tree object
    if isinstance(tree, str):
        tree = read_trees(tree)[0]

    #list of nested lists: one list of labels per leaf
    all_labels = []

    #Walk the tree depth-first with an explicit stack. Each entry is
    #(node, child indices from the root, labels of the ancestors).
    #Children are pushed in reverse so they are popped left-to-right.
    pending = [(child, (i,), [])
               for i, child in reversed(list(enumerate(tree.children)))]

    while pending:
        node, path, ancestors = pending.pop()

        #identifier = node label + '/' + concatenated child indices.
        #NOTE: the indices are joined without a separator, so a node with
        #10 or more children would yield ambiguous identifiers; the format
        #is kept as is for compatibility with existing annotations.
        labels = ancestors + [node.label + '/' + ''.join(map(str, path))]

        #a node without children is a leaf: its label chain is complete
        if not node.children:
            all_labels.append(labels)
            continue

        for i, child in reversed(list(enumerate(node.children))):
            pending.append((child, path + (i,), labels))

    #store in a df and add words
    df = pd.DataFrame(all_labels)

    #words may be plain strings or word objects; normalise per element so an
    #empty sequence does not fail on words[0]
    word_texts = [w if isinstance(w, str) else w.text for w in words]
    df.insert(loc=0, column='words', value=word_texts)

    return df


In [25]:
# Load the word-level EEG annotations released with Brennan (2023).
# Dataset: https://deepblue.lib.umich.edu/data/concern/data_sets/bg257f92t
path = 'Data/AliceChapterOne-EEG.csv'
dfBrennan = pd.read_csv(path)

# Collect the distinct sentence ids present in the dataframe
n_sents = set(dfBrennan['Sentence'])

In [26]:
# Preview the loaded dataset: one row per word, with the sentence id in `Sentence`
dfBrennan 

Unnamed: 0,Word,Segment,onset,offset,Order,LogFreq,LogFreq_Prev,LogFreq_Next,SndPower,Length,Position,Sentence,IsLexical,NGRAM,RNN,CFG
0,Alice,1,0.046000,0.608721,1,8.65,0.00,14.56,3.621500e-07,0.562721,1,1,1.0,3.226499,3.126175,2.312348
1,was,1,0.562721,0.830543,2,14.56,8.65,10.69,3.843500e-09,0.267822,2,1,0.0,0.905229,1.691128,1.357460
2,beginning,1,0.784543,1.302929,3,10.69,14.56,16.35,3.686500e-09,0.518386,3,1,1.0,4.446766,4.100771,5.626722
3,to,1,1.256929,1.398925,4,16.35,10.69,13.79,3.969700e-09,0.141996,4,1,0.0,2.537495,3.833313,5.939201
4,get,1,1.352925,1.662327,5,13.79,16.35,13.28,3.774700e-09,0.309402,5,1,0.0,1.023137,1.013076,2.697304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124,happens,12,45.226353,45.672448,2146,10.77,10.82,13.76,7.081200e-04,0.446095,6,84,1.0,5.574428,6.356812,2.969568
2125,when,12,45.677924,45.891353,2147,13.76,10.77,14.17,3.221100e-03,0.213429,7,84,0.0,4.059164,6.720639,4.930669
2126,one,12,45.896829,46.058972,2148,14.17,13.76,8.15,1.984200e-03,0.162143,8,84,1.0,1.380381,2.187682,0.725398
2127,eats,12,46.064448,46.322373,2149,8.15,14.17,8.74,2.473000e-05,0.257925,9,84,1.0,3.171368,3.941021,2.767965


In [27]:
#initialize empty lists where to store the annotations
f_dfs = []          # one annotation frame per sentence of the dataset
dfSents = []        # the matching slices of dfBrennan, in the same order as f_dfs
skipped_sents = []  # sentence ids for which the parser returned no sentence

#loop for every sentence, instead of doing 1 analysis for all the dataframe.
#sorted() makes the order explicit: a bare set has no guaranteed iteration order
for idx_sent in sorted(n_sents):

    #slice the dataframe for the current sentence
    dfSent = dfBrennan[dfBrennan.Sentence == idx_sent].reset_index(drop=True)

    #rebuild the sentence text from its words and parse it
    current_words = dfSent.Word
    doc = nlp(' '.join(current_words))

    #Nothing parsed: skip the sentence on BOTH sides so rows stay aligned.
    #Previously the annotation of the preceding sentence was appended again
    #(or a NameError was raised on the first iteration).
    if not doc.sentences:
        skipped_sents.append(idx_sent)
        continue

    #store dfSent for comparison with the annotation
    dfSents.append(dfSent)

    #If the parser kept the text as one sentence, rows are tagged 'correct'
    #(they match the original sentence index). If it split the text, each
    #piece gets a sub-index instead, e.g. '12_1', '12_2'.
    is_single = len(doc.sentences) == 1

    #store the annotation of every parsed sentence in the current doc
    storedfs = []
    for j, sent in enumerate(doc.sentences):

        tree = sent.constituency  # this could be also a string coming from another model
        words = sent.words        # this could be a list of words as strings

        #run syntactic analysis. Note that the number of words could be
        #different from dfSent in the case of multi-word tokenization
        df_parsed = syntactic_annotation(tree, words)

        df_parsed.insert(loc=1, column='idx_sent',
                         value='correct' if is_single else str(idx_sent) + f'_{j+1}')

        #keep the word position within its parsed sentence as an 'index' column
        #in every case, so the column has no NaN for split sentences
        storedfs.append(df_parsed.reset_index())

    df_current_sent = pd.concat(storedfs, ignore_index=True)
    f_dfs.append(df_current_sent)


alldf = pd.concat(f_dfs, ignore_index=True)

#the parser emits the possessive clitic "'s" as its own token; dropping it is
#enough here to bring the token rows back in line with the original words
alldf = alldf[alldf.words != "'s"].reset_index(drop=True)
allsents = pd.concat(dfSents, ignore_index=True)

#final df. NOTE: the alignment is purely positional (row i of the annotation
#next to row i of the original data); if the two frames differ in length the
#extra rows are padded with NaN, so compare `words` with `Word` to verify it
df_aligned = pd.concat([alldf, allsents], axis=1)

In [29]:
# Preview the result: parser annotation columns followed by the original dataset columns
df_aligned

Unnamed: 0,index,words,idx_sent,0,1,2,3,4,5,6,...,LogFreq_Prev,LogFreq_Next,SndPower,Length,Position,Sentence,IsLexical,NGRAM,RNN,CFG
0,0.0,Alice,correct,S/0,NP/00,NNP/000,Alice/0000,,,,...,0.00,14.56,3.621500e-07,0.562721,1,1,1.0,3.226499,3.126175,2.312348
1,1.0,was,correct,S/0,VP/01,VBD/010,was/0100,,,,...,8.65,10.69,3.843500e-09,0.267822,2,1,0.0,0.905229,1.691128,1.357460
2,2.0,beginning,correct,S/0,VP/01,VP/011,VP/0110,VBG/01100,beginning/011000,,...,14.56,16.35,3.686500e-09,0.518386,3,1,1.0,4.446766,4.100771,5.626722
3,3.0,to,correct,S/0,VP/01,VP/011,VP/0110,S/01101,VP/011010,TO/0110100,...,10.69,13.79,3.969700e-09,0.141996,4,1,0.0,2.537495,3.833313,5.939201
4,4.0,get,correct,S/0,VP/01,VP/011,VP/0110,S/01101,VP/011010,VP/0110101,...,16.35,13.28,3.774700e-09,0.309402,5,1,0.0,1.023137,1.013076,2.697304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124,5.0,happens,correct,S/0,VP/00,VP/001,ADJP/0011,SBAR/00111,S/001110,VP/0011102,...,10.82,13.76,7.081200e-04,0.446095,6,84,1.0,5.574428,6.356812,2.969568
2125,6.0,when,correct,S/0,VP/00,VP/001,ADJP/0011,SBAR/00111,S/001110,VP/0011102,...,10.77,14.17,3.221100e-03,0.213429,7,84,0.0,4.059164,6.720639,4.930669
2126,7.0,one,correct,S/0,VP/00,VP/001,ADJP/0011,SBAR/00111,S/001110,VP/0011102,...,13.76,8.15,1.984200e-03,0.162143,8,84,1.0,1.380381,2.187682,0.725398
2127,8.0,eats,correct,S/0,VP/00,VP/001,ADJP/0011,SBAR/00111,S/001110,VP/0011102,...,14.17,8.74,2.473000e-05,0.257925,9,84,1.0,3.171368,3.941021,2.767965
