In [1]:
import stanza
nlp = stanza.Pipeline(lang='en', processors='tokenize, mwt, pos, lemma, depparse')

2021-09-14 21:35:41 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-09-14 21:35:41 INFO: Use device: gpu
2021-09-14 21:35:41 INFO: Loading: tokenize
2021-09-14 21:35:44 INFO: Loading: pos
2021-09-14 21:35:44 INFO: Loading: lemma
2021-09-14 21:35:44 INFO: Loading: depparse
2021-09-14 21:35:44 INFO: Done loading processors!


In [2]:
text = '''Some U.S. allies are complaining that President Bush is pushing conventional-arms talks too quickly, creating a risk that negotiators will make errors that could affect the security of Western Europe for years.'''
def basic_mentions(text):
    doc = nlp(text)
    mentions  = []
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.pos in ['PRON', 'NOUN', 'PROPN', 'NUM']:
                mentions.append({'word':word.text,\
                                 'parent':sentence.words[word.head-1].text if word.head > 0 else "root",
                                 'parent_index':word.head,
                                 'deprel':word.deprel,
                                 'pos':word.pos,\
                                 'word_index':word.id,\
                                 'start_char':word.start_char,\
                                 'end_char':word.end_char,\
                                 'head':word.id})
    return mentions

In [None]:
basic_mentions = basic_mentions(text)

In [12]:
from collections import defaultdict
class DepGraph:
    def __init__(self, text):
        self.graph = defaultdict(dict)
        self.paths = []
        self.text = text
        self.doc = nlp(text)
        self.word_dict = {}
        self.pos_dict = {}
        self.char_span_dict = {}
        self.mentions = []
        self.buildgraph()
    
    def addEdge(self, u, v, rel):
        '''adds edge u->v with relation rel'''
        if(u in self.graph.keys()):
            self.graph[u][v] = (rel)
        else:
            self.graph[u] = {} 
            self.graph[u][v] = (rel)
            
    def addinfo(self, u, word, pos, start, end):
        self.word_dict[u] = word
        self.pos_dict[u] = pos
        self.char_span_dict[u] = [start, end]
        
    def buildgraph(self):
        for sentence in self.doc.sentences:
            for word in sentence.words:
                parent_index = word.head
                word_index = word.id
                relation = word.deprel
                self.addEdge(parent_index, word_index, relation)
                self.addinfo(word_index, word.text, word.pos, word.start_char, word.end_char)
        self.find_paths()
        
    def find_paths(self):
        self.paths = []
        self.depthFirst(0, -1, [])
        list_of_paths = self.paths
        self.paths = {}
        for p in list_of_paths:
            self.paths[p[-1]]  = p
        
    def depthFirst(self, currentVertex, previousVertex, visited):
        visited.append(currentVertex)
        for neighbour in self.graph[currentVertex]:
            if neighbour not in visited:
                self.depthFirst(neighbour, currentVertex, visited.copy())
        self.paths.append(visited)
        
    def noun_phrase(self, currentVertex, visited):
        visited.append(currentVertex)
        
        if len(self.graph[currentVertex])==0:
            start_span = self.char_span_dict[currentVertex][0]
            end_span = self.char_span_dict[currentVertex][1]
            return [start_span, end_span]
        
        start = self.char_span_dict[currentVertex][0]
        end = self.char_span_dict[currentVertex][1]
        for child in self.graph[currentVertex].keys():
            if child not in visited:
                relation = self.graph[currentVertex][child]
                if relation in ['compound', 'flat', 'fixed', 'det', 'amod', 'conj']:
                    [start_new, end_new] = self.noun_phrase(child, visited.copy())
                    start = min(start_new, start)
                    end = max(end_new, end)  
        return [start, end]
            
    def get_head_word(self, mention):
        '''mention: text span of mention
           g: dependency graph of entire sentence (of which mention is a sub-span) with paths of each node from root
           returns: head word token and its token id in the input_text'''

        m_doc = nlp(mention)
        min_path_len = 1e10
        lca = len(m_doc.sentences[-1].words) #last word of mention
        lca_text = m_doc.sentences[-1].words[-1].text #last word of mention

        mention_words = [w.text for w in m_doc.sentences[-1].words]
        print(mention_words)
        for sent in self.doc.sentences:
            for word in sent.words:
                if word.text not in mention_words:
                    continue
                path = self.paths[word.id]
                print(word.text, path)
                if len(path)<min_path_len:
                    min_path_len = len(path)
                    lca = word.id
                    lca_text = word.text
        return lca_text, lca
            
    def deduplicate(self, mentions):
        mention_heads = {}
        for m in mentions:
            head = self.get_head_word(m)
            if head in mention_heads.keys():
                mention_heads[head].append(m)
            else:
                mention_heads[head] = [m]
        mentions = [max(v, key = len) for v in mention_heads.values()]
        return mentions
    
    def find_mentions(self):
        mentions = []
        for sentence in self.doc.sentences:
            for word in sentence.words:
                if word.pos not in ['PRON', 'NOUN', 'PROPN', 'NUM']:
                    continue
                mention_char_span = self.noun_phrase(word.id, [])
                if(len(mention_char_span)!=0):
                    mentions.append(text[mention_char_span[0]:mention_char_span[1]])
        mentions = self.deduplicate(mentions)
        return list(set(mentions))

In [13]:
text = '''Even Mao Tse-tung's China began in 1949 with a partnership between the communists and a number of smaller, non-communist parties.'''
g = DepGraph(text)
g.find_mentions()

['Mao', 'Tse-tung']
Mao [0, 6, 5, 2]
Tse-tung [0, 6, 5, 2, 3]
['Tse-tung']
Tse-tung [0, 6, 5, 2, 3]
['China']
China [0, 6, 5]
['1949']
1949 [0, 6, 8]
['a', 'partnership']
a [0, 6, 11, 10]
partnership [0, 6, 11]
a [0, 6, 11, 14, 17, 16]
['the', 'communists', 'and', 'a', 'number']
a [0, 6, 11, 10]
the [0, 6, 11, 14, 13]
communists [0, 6, 11, 14]
and [0, 6, 11, 14, 17, 15]
a [0, 6, 11, 14, 17, 16]
number [0, 6, 11, 14, 17]
['a', 'number']
a [0, 6, 11, 10]
a [0, 6, 11, 14, 17, 16]
number [0, 6, 11, 14, 17]
['smaller', ',', 'non-communist', 'parties']
smaller [0, 6, 11, 14, 17, 22, 19]
, [0, 6, 11, 14, 17, 22, 20]
non-communist [0, 6, 11, 14, 17, 22, 21]
parties [0, 6, 11, 14, 17, 22]


['1949',
 'smaller, non-communist parties',
 'China',
 'Tse-tung',
 'a partnership',
 'the communists and a number',
 'Mao Tse-tung']

In [14]:
g.get_head_word('Mao Tse-tung')

['Mao', 'Tse-tung']
Mao [0, 6, 5, 2]
Tse-tung [0, 6, 5, 2, 3]


('Mao', 2)

In [18]:
for idx in [6, 5, 2, 3]:
    print(g.word_dict[idx])

began
China
Mao
Tse-tung


In [15]:
g.paths

{1: [0, 6, 5, 2, 1],
 3: [0, 6, 5, 2, 3],
 4: [0, 6, 5, 2, 4],
 2: [0, 6, 5, 2],
 5: [0, 6, 5],
 7: [0, 6, 8, 7],
 8: [0, 6, 8],
 9: [0, 6, 11, 9],
 10: [0, 6, 11, 10],
 12: [0, 6, 11, 14, 12],
 13: [0, 6, 11, 14, 13],
 15: [0, 6, 11, 14, 17, 15],
 16: [0, 6, 11, 14, 17, 16],
 18: [0, 6, 11, 14, 17, 22, 18],
 19: [0, 6, 11, 14, 17, 22, 19],
 20: [0, 6, 11, 14, 17, 22, 20],
 21: [0, 6, 11, 14, 17, 22, 21],
 22: [0, 6, 11, 14, 17, 22],
 17: [0, 6, 11, 14, 17],
 14: [0, 6, 11, 14],
 11: [0, 6, 11],
 23: [0, 6, 23],
 6: [0, 6],
 0: [0]}

In [None]:
text = '''Bob, John, and Mary saw him.'''
g = DepGraph(text)
g.find_mentions()

In [None]:
text = '''Some U.S. allies are complaining that President Bush is pushing conventional-arms talks too quickly, creating a risk that negotiators will make errors that could affect the security of Western Europe for years.'''
g = DepGraph(text)
g.find_mentions()

In [None]:
text = '''During the third quarter, Compaq purchased a former Wang Laboratories manufacturing facility in Sterling, Scotland, which will be used for international service and repair operations.'''
g = DepGraph(text)
g.find_mentions()

In [None]:
text = '''The government has other agencies and instruments for pursuing these other objectives.'''
g = DepGraph(text)
g.find_mentions()

In [None]:
text = '''happy campus president of Mass'''
g = DepGraph(text)
g.find_mentions()

In [6]:
g.get_head_word('Tse-tung')

('Tse-tung', 3)

In [None]:
g.noun_phrase(1, [])

In [None]:
text = '''Bob, John, Mary saw him.'''
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')
doc = nlp(text)
print(*[f'id: {word.id}\tword: {word.text}\t\tpos: {word.pos} \t\thead id: {word.head}\t\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}' for sent in doc.sentences for word in sent.words], sep='\n')