In [1]:
import stanza

In [2]:
# English pipeline: tokenization and POS tagging feed the constituency parser
# whose trees are consumed by find_head below.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,constituency',
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-04-27 21:36:46 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

  return torch._C._cuda_getDeviceCount() > 0
2022-04-27 21:36:46 INFO: Use device: cpu
2022-04-27 21:36:46 INFO: Loading: tokenize
2022-04-27 21:36:46 INFO: Loading: pos
2022-04-27 21:36:47 INFO: Loading: constituency
2022-04-27 21:36:48 INFO: Done loading processors!


In [9]:
def find_head(tree):
    '''
    Get the head of a Noun Phrase as a string.

    Based on a simplified version of Michael Collins' 1999 rules.
    Returns the maximal sequence of noun-tagged words, instead of a 1-word head.
    Cf https://stackoverflow.com/questions/32654704/finding-head-of-a-noun-phrase-in-nltk-and-stanford-parse-according-to-the-rules

    Parameters
    ----------
    tree : constituency tree node
        Any object exposing ``label``, ``children`` and ``leaf_labels()``;
        these are the only attributes this function uses.

    Returns
    -------
    str
        In order of priority:
        1. the final word, if the last child of the NP is tagged POS;
        2. the right-most contiguous run of noun-like children
           (NN, NNS, NNP, NNPS, POS, JJR, NML), joined with spaces;
        3. the first word of the first ADJP/PRN child;
        4. the first word of the first CD child;
        5. the first word of the first JJ/JJS/RB/QP child;
        6. the first word of the last child.

    Raises
    ------
    IndexError
        If no node labelled 'NP' is met while following first children
        down from ``tree``.
    '''
    noun_like_tags = ('NN', 'NNS', 'NNP', 'NNPS', 'POS', 'JJR', 'NML')
    # Fallback tag groups, tried in this order when no noun-like child exists.
    fallback_tag_groups = (
        ('ADJP', 'PRN'),
        ('CD',),
        ('JJ', 'JJS', 'RB', 'QP'),
    )

    # Get the first NP by walking down the left-most branch.
    while tree.label != 'NP':
        if not tree.children:
            # Same exception type as indexing an empty child list, but explicit.
            raise IndexError("find_head: no 'NP' node on the left-most branch of the tree")
        tree = tree.children[0]

    # Break compound phrases: (NP (NP ...) (PP ...)) -> keep the inner NP.
    while tree.children[0].label == 'NP':
        tree = tree.children[0]

    last_child = tree.children[-1]
    if last_child.label == 'POS':
        # Return the word itself (not the tree node) so every path yields a str.
        return last_child.leaf_labels()[0]

    # Scan right-to-left and stop at the first non-noun child once a noun was
    # seen; prepending keeps the collected words in sentence order.
    head_words = []
    for child in reversed(tree.children):
        if child.label in noun_like_tags:
            head_words[:0] = child.leaf_labels()
        elif head_words:
            break

    if head_words:
        return ' '.join(head_words)

    for tags in fallback_tag_groups:
        for child in tree.children:
            if child.label in tags:
                return child.leaf_labels()[0]

    return last_child.leaf_labels()[0]
    

In [12]:
# Sample category-style titles used to spot-check find_head.
tests = [
    'Films shot in New Jersey',
    'American comedy films',
    '1978 debut albums',
    'Allegories of humility',
    '1983 in water transport in Japan',
    'Civil parishes in North Yorkshire',
    'The New Orleans Bee October 1908',
    'Socata TB-10 at Flugplatz Uetersen',
    'Portrait paintings by Alexei Harlamov',
    'Family portraits of Russia',
    'Maria Pavlovna Abamelik-Lazareva (Demidova)',
    'Historical photographs of Kano',
    'Maps of Vigneux-sur-Seine',
    'California Digital Library',
    'Old books from American Libraries',
    'Schlosspark Vösendorf',
]

for test in tests:
    doc = nlp(test)
    # Only the first parsed sentence of each title is inspected.
    tree = doc.sentences[0].constituency
    # The head is passed as its own print argument rather than concatenated:
    # the printed text is the same for strings, and a non-str result cannot
    # raise TypeError.
    print(test, '\t- head:', find_head(tree))

Films shot in New Jersey 	- head: Films
American comedy films 	- head: comedy films
1978 debut albums 	- head: debut albums
Allegories of humility 	- head: Allegories
1983 in water transport in Japan 	- head: 1983
Civil parishes in North Yorkshire 	- head: parishes
The New Orleans Bee October 1908 	- head: New Orleans Bee October
Socata TB-10 at Flugplatz Uetersen 	- head: Socata TB
Portrait paintings by Alexei Harlamov 	- head: Portrait paintings
Family portraits of Russia 	- head: Family portraits
Maria Pavlovna Abamelik-Lazareva (Demidova) 	- head: Lazareva
Historical photographs of Kano 	- head: photographs
Maps of Vigneux-sur-Seine 	- head: Maps
California Digital Library 	- head: Library
Old books from American Libraries 	- head: books
Schlosspark Vösendorf 	- head: Schlosspark Vösendorf


In [11]:
# Show the raw constituency parse for one title; the displayed tree has
# "TB", "-" and "10" as separate tokens.
(
    nlp('Socata TB-10 at Flugplatz Uetersen')
    .sentences[0]
    .constituency
)

(ROOT (NP (NP (NNP Socata) (NNP TB) (, -) (CD 10)) (PP (IN at) (NP (NNP Flugplatz) (NNP Uetersen)))))