In [1]:
from mention_extraction import MentionDetector


# Build the mention detector (no constructor arguments are passed).
extractor = MentionDetector()

# Sample inputs: an English passage and a Slovenian sentence.
text_en = 'Victoria Chen, CFO of Megabucks Banking, saw her pay jump to $2.3 million, as the 38-year-old also became the company’s president. It is widely known that she came to Megabucks from rival Lotsabucks.'
text_slo = 'Metka Zahovič je šla v Mercator po njene kosmiče ter ona je mrtva.'

# Only the Slovenian sample is passed to the detector; text_en is defined but
# not used in this cell.
# NOTE(review): the saved output below lists English tokens, so it appears to
# come from an earlier run on text_en -- re-run the cell to refresh it.
mentions = extractor.get_mentions(text_slo, 'slovenian')
# Bare last expression so the notebook displays the result.
mentions

[('Victoria', 'NNP'), ('Chen', 'NNP'), (',', ','), ('CFO', 'NNP'), ('of', 'IN'), ('Megabucks', 'NNP'), ('Banking', 'NNP'), (',', ','), ('saw', 'VBD'), ('her', 'PRP'), ('pay', 'VB'), ('jump', 'NN'), ('to', 'TO'), ('$', '$'), ('2.3', 'CD'), ('million', 'CD'), (',', ','), ('as', 'IN'), ('the', 'DT'), ('38-year-old', 'JJ'), ('also', 'RB'), ('became', 'VBD'), ('the', 'DT'), ('company', 'NN'), ('’', 'NNP'), ('s', 'NN'), ('president', 'NN'), ('.', '.')]
['her']


[('It', 'PRP'), ('is', 'VBZ'), ('widely', 'RB'), ('known', 'VBN'), ('that', 'IN'), ('she', 'PRP'), ('came', 'VBD'), ('to', 'TO'), ('Megabucks', 'NNS'), ('from', 'IN'), ('rival', 'JJ'), ('Lotsabucks', 'NNP'), ('.', '.')]
['It', 'she']




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\matic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[['Victoria Chen',
  'Megabucks Banking',
  'pay jump',
  'company ’ s president',
  'her',
  'CFO'],
 ['It', 'she', 'Megabucks', 'Lotsabucks']]

In [2]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import pandas as pd
# Fetch the NLTK resources requested by this notebook, in the original order.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)


# Grammar and parser for the "NP" chunk rule.
# The rule matches one or more tags of the form V... or NN/NNx and must end on
# an NN/NNx tag.
# It is written as a raw string so the regex escape `\w` reaches NLTK verbatim
# instead of being read as an (invalid) Python string escape, which emits a
# DeprecationWarning/SyntaxWarning on recent Python versions. The resulting
# string value is unchanged.
NP = r"NP: {(<V\w+>|<NN\w?>)+.*<NN\w?>}"
chunker = RegexpParser(NP)

def get_continuous_chunks(text, chunk_func=ne_chunk, verbose=True):
    """Extract the distinct chunk strings found in ``text``.

    The text is tokenized and POS-tagged, then handed to ``chunk_func``.
    Every ``Tree`` child of the result is turned into a space-joined string
    of its leaf tokens; adjacent ``Tree`` children (no plain token between
    them) are merged into a single space-joined string.

    Args:
        text: Raw text to process.
        chunk_func: Callable that takes the list of ``(token, tag)`` pairs and
            returns an iterable whose chunked items are ``Tree`` instances.
            Defaults to ``ne_chunk``.
            NOTE(review): ``ne_chunk`` presumably needs additional NLTK
            resources beyond the two downloaded in this cell -- verify before
            relying on the default.
        verbose: When true (the default, matching the previous behaviour),
            print the chunked structure before extracting chunk strings.

    Returns:
        A list of unique chunk strings in order of first appearance.
    """
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    if verbose:
        print(chunked)

    continuous_chunk = []
    current_chunk = []

    def flush():
        # Emit the pending run of adjacent chunks (if any) and ALWAYS reset it.
        # Previously the reset only happened for first-time entities, so a
        # repeated entity leaked into the next chunk.
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk.clear()

    for subtree in chunked:
        if isinstance(subtree, Tree):
            current_chunk.append(" ".join(token for token, _pos in subtree.leaves()))
        else:
            # A plain (token, tag) item ends the current run of chunks.
            flush()

    # A chunk that ends the input has no following token to trigger a flush,
    # so it was previously dropped.
    flush()

    return continuous_chunk

# Run the extractor on the sample passage, chunking with the regexp grammar's
# parser rather than the function's default chunker.
get_continuous_chunks(
    'Victoria Chen, CFO of Megabucks Banking, saw her pay jump to $2.3 million, as the 38-year-old also became the company’s president. It is widely known that she came to Megabucks from rival Lotsabucks.',
    chunk_func=chunker.parse,
)

(S
  (NP Victoria/NNP Chen/NNP)
  ,/,
  CFO/NNP
  of/IN
  (NP Megabucks/NNP Banking/NNP)
  ,/,
  saw/VBD
  her/PRP
  (NP pay/VB jump/NN)
  to/TO
  $/$
  2.3/CD
  million/CD
  ,/,
  as/IN
  the/DT
  38-year-old/JJ
  also/RB
  became/VBD
  the/DT
  (NP company/NN ’/NNP s/NN president/NN)
  ./.
  It/PRP
  is/VBZ
  widely/RB
  known/VBN
  that/IN
  she/PRP
  came/VBD
  to/TO
  Megabucks/NNS
  from/IN
  rival/JJ
  Lotsabucks/NNP
  ./.)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\matic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['Victoria Chen', 'Megabucks Banking', 'pay jump', 'company ’ s president']