In [2]:
import nltk
import os
PATH = '/vol/bitbucket/lst20/lex-eval'
nltk.data.path.append(os.path.join(PATH, 'nltk_data'))  # Add to path if needed
nltk.download('punkt_tab', download_dir=os.path.join(PATH, 'nltk_data'))
nltk.download('averaged_perceptron_tagger_eng', download_dir=os.path.join(PATH, 'nltk_data'))
from nltk.tag import pos_tag 
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser

[nltk_data] Downloading package punkt_tab to /vol/bitbucket/lst20/lex-
[nltk_data]     eval/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /vol/bitbucket/lst20/lex-eval/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [19]:
sample_text = "the yellow cat is sleeping under the christmas tree made of roasted ducks. What a lovely christmas morning."

# sample_text = 'Although she had planned to leave early, knowing that the traffic had been unusually heavy due to ongoing roadworks that the city council had failed to address despite numerous complaints would likely cause delays, she found herself stuck at her desk much to her frustration, , where a last-minute email from her manager who had just returned from a business trip during which he had met with several important clients whose contracts were still pending approval required her immediate attention before she could even think about packing up and heading home.'

In [14]:
# https://www.nltk.org/book_1ed/ch07.html#ref-chunkex-grammar
tokens = word_tokenize(sample_text)
# Find all parts of speech in above sentence
tagged = pos_tag(tokens)

#Extract all parts of speech from any text
chunker = RegexpParser("""
                    NP: {<DT|PP$>?<JJ>*<NN|NNS>*}    # chunk determiner/possessive, adjectives and nouns
                    PP: {<IN><NP>}                   # Chunk prepositions followed by NP
                    VP: {<VB.*><NP|PP|CLAUSE>+$}    # Chunk verbs and their arguments
                    CLAUSE: {<NP><VP>}              # Chunk NP, VP
                    """)

# Print all parts of speech in above sentence
output = chunker.parse(tagged)
print(output, output.height())

(S
  Although/IN
  she/PRP
  had/VBD
  planned/VBN
  to/TO
  leave/VB
  (NP early/JJ)
  ,/,
  knowing/VBG
  (PP that/IN (NP the/DT traffic/NN))
  had/VBD
  been/VBN
  unusually/RB
  (NP heavy/JJ due/JJ)
  to/TO
  ongoing/VBG
  (NP roadworks/NNS)
  (PP that/IN (NP the/DT city/NN council/NN))
  had/VBD
  failed/VBN
  to/TO
  address/VB
  (PP despite/IN (NP numerous/JJ complaints/NNS))
  would/MD
  likely/RB
  cause/VB
  (NP delays/NNS)
  ,/,
  she/PRP
  found/VBD
  herself/PRP
  (NP stuck/NN)
  at/IN
  her/PRP$
  (NP desk/NN)
  (NP much/JJ)
  to/TO
  her/PRP$
  (NP frustration/NN)
  ,/,
  ,/,
  where/WRB
  (NP a/DT last-minute/JJ email/NN)
  from/IN
  her/PRP$
  (NP manager/NN)
  who/WP
  had/VBD
  just/RB
  returned/VBN
  (PP from/IN (NP a/DT business/NN trip/NN))
  during/IN
  which/WDT
  he/PRP
  had/VBD
  met/VBN
  (PP with/IN (NP several/JJ important/JJ clients/NNS))
  whose/WP$
  (NP contracts/NNS)
  were/VBD
  still/RB
  pending/VBG
  (NP approval/NN)
  required/VBD
  her/PRP$
  (

In [16]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_trf")

In [20]:
# Process the text with spaCy
doc = nlp(sample_text)

# Function to compute the height of the parse tree
def compute_tree_height(token):
    if not list(token.children):  # If the token has no children, it's a leaf
        return 1
    else:
        return 1 + max(compute_tree_height(child) for child in token.children)

# Find the ROOT of the parse tree
root = [token for token in doc if token.dep_ == "ROOT"][0]

# Compute height of the parse tree
parse_tree_height = compute_tree_height(root)

# Print parse tree height
print(f"Height of the parse tree: {parse_tree_height}")

Height of the parse tree: 7
