In [1]:
import stanza
import pandas as pd
from nltk.tree import *
from stanza.models.constituency.tree_reader import read_trees

In [2]:
# Load the word-level timestamps (extracted from Praat) and keep only
# the rows whose `text` field is filled in.

path_to_data = 'Data/example_contracted_english.csv'

df = (
    pd.read_csv(path_to_data)
    .loc[lambda frame: frame.text.notna()]
    .reset_index(drop=True)
)

In [4]:
# English Stanza pipeline: tokenisation, POS tagging and constituency parsing
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,constituency',
)

2024-12-06 16:25:36 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-12-06 16:25:36 INFO: Downloaded file to /Users/test/stanza_resources/resources.json
2024-12-06 16:25:36 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2024-12-06 16:25:36 INFO: Using device: cpu
2024-12-06 16:25:36 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-12-06 16:25:36 INFO: Loading: mwt
2024-12-06 16:25:36 INFO: Loading: pos
2024-12-06 16:25:37 INFO: Loading: constituency
2024-12-06 16:25:37 INFO: Done loading processors!


In [5]:
# Rebuild the sentence by joining the timestamped words with single spaces
text = ' '.join(df.text)

# Run the Stanza pipeline on the rebuilt sentence
doc = nlp(text)

In [6]:
# display the sentence rebuilt from the timestamped words
text

"I hadn't thought that there was too much you didn't like eating yesterday at my parents'"

In [7]:
def _leaf_label_paths(children, path=(), inherited=()):
    '''
    Yield, for every leaf found below `children` (left to right), the list of
    'LABEL/position' strings going from the outermost constituent down to the leaf.
    '''
    for position, node in enumerate(children):
        node_path = path + (position,)

        # The identifier is the chain of child indexes leading to this node,
        # concatenated without a separator (e.g. 'VP/01').
        # NOTE: a node with more than ten children would give multi-digit
        # indexes, which makes the concatenated identifier ambiguous.
        label = node.label + '/' + ''.join(str(step) for step in node_path)
        labels = [*inherited, label]

        if node.children:
            yield from _leaf_label_paths(node.children, node_path, labels)
        else:
            # a node without children is a leaf, i.e. the word itself
            yield labels


def syntactic_annotation(tree, words):
    '''
    Build a table with one row per word and one column per level of embedding.

    Each cell holds the label of the constituent that dominates the word at that
    depth, followed by '/' and the chain of child indexes leading to it
    (e.g. 'S/0', 'VP/01', 'VBD/010', 'had/0100'). The top node of the tree
    itself is not reported. Rows of shallower words are padded with missing values.

    tree is a constituency object coming from stanza. You can also pass a
    bracketed string instead of the object; it is read with read_trees and the
    first tree is used.

    words is the list of word objects coming from stanza (their .text is used).
    Alternatively you can pass a list of strings. An empty list is accepted.

    Returns a pandas DataFrame whose first column is 'words', followed by the
    integer columns 0, 1, 2, ... (one per depth level).

    Raises ValueError if the number of words differs from the number of leaves
    in the tree.

    Example

    import stanza
    import pandas as pd
    *sentence is a string
    doc = nlp(sentence)

    e.g.,
    tree = doc.sentences[j].constituency # j is the index of the sentence to analyse
    words = doc.sentences[j].words
    '''

    #check if the input is a string or not
    if isinstance(tree, str):
        tree = read_trees(tree)[0]

    #walk the tree object directly: one list of labels per leaf, in sentence order
    #(this avoids re-parsing the printed tree, whose leaves may not match the object)
    all_labels = list(_leaf_label_paths(tree.children))

    #store in a df (a list of nested lists becomes one row per leaf)
    df = pd.DataFrame(all_labels)

    #accept both plain strings and stanza word objects
    words = [w if isinstance(w, str) else w.text for w in words]

    #fail early with a readable message when words and leaves do not line up
    if len(words) != len(df):
        raise ValueError(
            'Number of words (%d) does not match the number of leaves in the tree (%d)'
            % (len(words), len(df))
        )

    #add words as first column
    df.insert(loc=0, column='words', value=words)

    return df


In [8]:
# pull the constituency tree and the word objects of the first parsed sentence

first_sentence = doc.sentences[0]
tree, words = first_sentence.constituency, first_sentence.words


In [9]:
# build the word-by-depth table of constituent labels for the sentence
dfsyntax = syntactic_annotation(tree=tree, words=words)

In [10]:
# English contracted forms, written the way they appear as separate tokens

contracted_form = [
    "'s", "'m", "'re", "'ve", "'d", "'ll",  # forms starting with an apostrophe
    "n't",                                   # negation
    "'",                                     # bare apostrophe, as in "parents'"
]


In [11]:
# Convert the word objects into plain text.
# Note that these words have already been split by the tokenizer, so a
# contraction shows up as two separate items (e.g. 'had' followed by "n't").
# Items that are already strings are kept as they are: `words` is overwritten
# here, so without this guard re-running the cell would fail on `.text`.
words = [w if isinstance(w, str) else w.text for w in words]
words

['I',
 'had',
 "n't",
 'thought',
 'that',
 'there',
 'was',
 'too',
 'much',
 'you',
 'did',
 "n't",
 'like',
 'eating',
 'yesterday',
 'at',
 'my',
 'parents',
 "'"]

In [12]:
# Get the indexes of the words to skip.
# A word to skip is a contracted form: its row has to be dropped from the
# syntax dataframe when we perform the alignment with the timestamps.

idx = []
words_align = [] # this list is only for checking


# One entry of words_align per token, at the same position as in `words`:
# a token followed by a contracted form is stored merged with it
# (e.g. 'had' + "n't" -> "hadn't"); every other token is stored unchanged.
# The entries sitting at the positions in `idx` are meant to be filtered out afterwards.
for i, word in enumerate(words):

    # a contracted form attaches to the previous word, so it cannot be the first token
    if i > 0 and word in contracted_form:

        #get the index to exclude
        idx.append(i)

    # this is for checking
    # (the last token has no follower, so it is always stored as is;
    #  previously it was never stored and could go missing from the check list)
    following = words[i + 1] if i + 1 < len(words) else None

    if following in contracted_form:
        words_align.append(word + following)

    else:
        words_align.append(word)

In [13]:
# Sanity check: remove the entries sitting at the contracted-form positions,
# so that only the merged words remain.
# NOTE: this overwrites words_align, so running the cell a second time applies
# the same positions to the already-filtered list.
skipped_positions = set(idx)
words_align = [
    word
    for position, word in enumerate(words_align)
    if position not in skipped_positions
]

words_align


['I',
 "hadn't",
 'thought',
 'that',
 'there',
 'was',
 'too',
 'much',
 'you',
 "didn't",
 'like',
 'eating',
 'yesterday',
 'at',
 'my',
 "parents'"]

In [14]:
# exclude the syntax rows whose index was flagged as a contracted form
is_contracted_row = dfsyntax.index.isin(idx)
df_clean = dfsyntax.loc[~is_contracted_row].reset_index(drop=True)

# align: timestamps and syntactic labels side by side, matched on the row index
dff = pd.concat([df, df_clean], axis=1)

In [15]:
# display the timestamps side by side with the syntactic labels
dff

Unnamed: 0,tmin,tier,text,tmax,words,0,1,2,3,4,...,7,8,9,10,11,12,13,14,15,16
0,0.040956,words,I,0.175635,I,S/0,NP/00,PRP/000,I/0000,,...,,,,,,,,,,
1,0.175635,words,hadn't,0.593454,had,S/0,VP/01,VBD/010,had/0100,,...,,,,,,,,,,
2,0.593454,words,thought,0.899455,thought,S/0,VP/01,VP/012,VBN/0120,thought/01200,...,,,,,,,,,,
3,0.899455,words,that,1.062265,that,S/0,VP/01,VP/012,SBAR/0121,IN/01210,...,,,,,,,,,,
4,1.062265,words,there,1.209133,there,S/0,VP/01,VP/012,SBAR/0121,S/01211,...,there/01211000,,,,,,,,,
5,1.209133,words,was,1.356363,was,S/0,VP/01,VP/012,SBAR/0121,S/01211,...,was/01211100,,,,,,,,,
6,1.356363,words,too,1.655499,too,S/0,VP/01,VP/012,SBAR/0121,S/01211,...,ADJP/01211110,RB/012111100,too/0121111000,,,,,,,
7,1.655499,words,much,2.046591,much,S/0,VP/01,VP/012,SBAR/0121,S/01211,...,ADJP/01211110,JJ/012111101,much/0121111010,,,,,,,
8,2.046591,words,you,2.190692,you,S/0,VP/01,VP/012,SBAR/0121,S/01211,...,SBAR/01211111,S/012111110,NP/0121111100,PRP/01211111000,you/012111110000,,,,,
9,2.190692,words,didn't,2.397127,did,S/0,VP/01,VP/012,SBAR/0121,S/01211,...,SBAR/01211111,S/012111110,VP/0121111101,VBD/01211111010,did/012111110100,,,,,
