In [13]:
# !python -m spacy download en_core_web_sm

In [4]:
# !pip install spacy

In [16]:
# !pip install sklearn_crfsuite

In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from nltk import pos_tag
import spacy
from spacy import displacy
import sklearn_crfsuite

In [2]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of tuples.

    The input frame must have the columns 'Sentence #', 'Word', 'POS' and
    'Tag', one row per token.

    Attributes:
        n_sent: Initialised to 1; not otherwise used by this class.
        data: The DataFrame passed to the constructor.
        empty: Initialised to False; not otherwise used by this class.
        grouped: pandas Series indexed by the 'Sentence #' value, each element
            being that sentence's list of (word, pos, tag) tuples.
        sentences: Plain list of those per-sentence lists, in the (sorted)
            order produced by ``groupby``.
    """

    def __init__(self, data):
        """Build ``grouped`` and ``sentences`` from ``data``.

        Args:
            data: DataFrame with 'Sentence #', 'Word', 'POS', 'Tag' columns.
                An empty frame yields an empty ``sentences`` list.
        """
        self.n_sent = 1
        self.data = data
        self.empty = False

        # Iterate the groups directly rather than using groupby(...).apply(...):
        # on an empty frame `apply` can hand back an empty DataFrame, and
        # iterating that yields column labels instead of sentences. Direct
        # iteration simply produces no groups in that case.
        sentence_ids = []
        sentences = []
        for sentence_id, rows in self.data.groupby('Sentence #'):
            sentence_ids.append(sentence_id)
            sentences.append(list(zip(rows['Word'].values.tolist(),
                                      rows['POS'].values.tolist(),
                                      rows['Tag'].values.tolist())))

        # dtype=object keeps each element as a Python list of tuples.
        self.grouped = pd.Series(sentences,
                                 index=pd.Index(sentence_ids, name='Sentence #'),
                                 dtype=object)
        self.sentences = list(sentences)

In [3]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i``.

    Args:
        sent: Sequence of token tuples whose first two items are the word
            string and its POS tag string.
        i: Index of the token to describe.

    Returns:
        dict mapping feature names to values: features of the token itself,
        then either 'BOS' or features of the previous token, then either
        'EOS' or features of the next token.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features describing the token itself.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context: flag the beginning of the sentence, else describe the
    # preceding token.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context: flag the end of the sentence, else describe the
    # following token.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the third item (the label) of every 3-tuple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the first item (the token) of every 3-tuple in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [4]:
# Show the kernel's current working directory (displayed as the cell output).
import os
os.getcwd()

'/Users/spenno_fr/Projects/nlp-glg-mas/notebooks'

In [5]:
# Location of the pickled CRF NER model.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable project root so the notebook runs on other machines.
NER_MODEL_PATH = '/Users/spenno_fr/Projects/nlp-glg-mas/models/ner_crf/'
filename = '0.1-maf-crf_ner_model.sav'
#pickle.dump(crf, open(filename, 'wb'))

# Load inside a context manager so the file handle is closed deterministically
# (the bare open(...) passed straight to pickle.load was never closed).
# SECURITY: pickle.load can execute arbitrary code; only load model files that
# come from a trusted source.
with open(NER_MODEL_PATH + filename, 'rb') as model_file:
    crf = pickle.load(model_file)

In [6]:
def prep_query(phrase):
    """Tokenize and POS-tag ``phrase`` into a single-sentence token DataFrame.

    Tokens are runs of word characters/apostrophes, or a single one of the
    punctuation marks ``. , ! ? ;``; all other characters are dropped.

    Args:
        phrase: Raw input text.

    Returns:
        DataFrame with one row per token and the columns 'Sentence #'
        (always 'Sentence: 1'), 'Word', 'POS' (from ``pos_tag``) and 'Tag'
        (always None).
    """
    tokens = re.findall(r"[\w']+|[.,!?;]", phrase)
    tagged = pos_tag(tokens)
    n_tokens = len(tagged)

    columns = {
        'Sentence #': ['Sentence: 1'] * n_tokens,
        'Word': [word for word, _ in tagged],
        'POS': [tag for _, tag in tagged],
        'Tag': [None] * n_tokens,
    }
    return pd.DataFrame(columns)

In [106]:
# s = "Donald Trump is a former host on The Apprentice. He is an American businessman and former President."
# s = 'hello how are you'
# s = 'The Second World War started in 1914 and ended in 1918'
# s = 'The Korean War started in 1939 and ended in 1945'
# s = 'Iraq and Iran were once at war. Saddam Hussein was involved'
# s = 'The World Cup is a quadrennial sporting event. FIFA is the governing body involved.'
# s = 'Biden under pressure over Afghanistan and Covid as approval ratings slide'
# s = 'But the Taliban warned on Monday there would be “consequences” if the US and its allies linger beyond that date.'
# s = 'Thousands of American troops have poured back into the country to oversee the chaotic airlift of foreigners and \
# selected Afghans from Kabul airport, and Biden is being called upon to extend a 31 August deadline for full US withdrawal'
# s = 'As it leaves Afghanistan in chaos, America’s decline mirrors Britain’s a century ago. It may also invite wider \
# conflict, warns a historian'
# s = 'In March the joint study reported that it was “extremely unlikely” that the virus had been released in a laboratory \
# accident. Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant \
# evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger.'

# Test Cases for Presentation:
# s = 'SoftBank needs a Plan B. One year on, prospects for its planned $54bn sale of UK chip designer Arm to Nvidia are souring. Antitrust watchdogs are circling. The European Commission is set to launch a formal competition probe while the UK’s Competition and Markets Authority has dismissed Nvidia’s efforts as insufficient. China, where Arm has its own problems with a rogue joint venture partner, is likely to join the chorus.The deal is far from dead. A handful of Arm clients have rallied round Nvidia. South Korea’s Samsung provides precedent for vertical integration without compromising access to third party clients. Contractual remedies may save the day.'
# Active sample text that is run through the NER pipeline in the cells below.
s = 'In Palermo, Sicily’s capital, 80 percent of the hospitalized Covid patients are unvaccinated, and a vast majority of those in the I.C.U. have not received a vaccine, said Dr. Renato Costa, the city’s Covid emergency commissioner. Similar rates are observed throughout the region.“If we had a higher vaccination rate,” said Dr. Costa, “our hospitals would be emptier.”Local doctors said the drop in vaccination rates during the month of August was related to the summer holidays, a time when it is more difficult to distribute shots to the region, which has among Italy’s lowest income and education levels.'

# Tokenize and POS-tag the sample text into a one-sentence token DataFrame.
x = prep_query(s)

### Preprocess the input query into a cleaned sequence of words

In [107]:
# Group the token DataFrame back into per-sentence lists of (word, pos, tag).
getter_query = SentenceGetter(x)
sentences_query = getter_query.sentences

# One list of feature dicts per sentence, as input for the CRF model.
X_query = list(map(sent2features, sentences_query))
# Words of the first sentence, kept to pair with the predicted tags later.
X_words = [token_tuple[0] for token_tuple in sentences_query[0]]

### Predict the Entities using CRFSuite's Trained NER Model

In [108]:
# Run the loaded CRF model on the query features; pred[0] is used below as the
# tag sequence for the first sentence.
pred = crf.predict(X_query)

### Pair each predicted tag with its original word as a list of (tag, word) tuples

In [109]:
# (tag, word) pairs for the first sentence, in token order.
ents = [(tag, word) for tag, word in zip(pred[0], X_words)]

In [110]:
# Check Output:
ents[:10]

[('O', 'In'),
 ('B-geo', 'Palermo'),
 ('O', ','),
 ('B-geo', 'Sicily'),
 ('O', 's'),
 ('O', 'capital'),
 ('O', ','),
 ('O', '80'),
 ('O', 'percent'),
 ('O', 'of')]

### Filter to only include the entities which have valid (non-O) tags

In [111]:
# Keep only the (tag, word) pairs whose tag is not the outside tag 'O'.
named_ents = list(filter(lambda pair: pair[0] != 'O', ents))

In [112]:
# Check Output:
named_ents

[('B-geo', 'Palermo'),
 ('B-geo', 'Sicily'),
 ('B-org', 'Covid'),
 ('B-org', 'C'),
 ('B-org', 'U'),
 ('B-geo', 'Dr'),
 ('B-per', 'Renato'),
 ('I-per', 'Costa'),
 ('B-geo', 'Dr'),
 ('B-geo', 'Costa'),
 ('B-tim', 'of'),
 ('I-tim', 'August'),
 ('B-tim', 'summer'),
 ('B-geo', 'Italy')]

In [30]:
# import en_core_web_sm
# nlp = en_core_web_sm.load()

In [12]:
# Load spaCy's small English pipeline; in the cells shown here only its
# vocabulary (nlp.vocab) is used, to construct a Doc from the CRF output.
nlp = spacy.load('en_core_web_sm')
# docs = []
# for text, annot in TRAIN_DATA:
#     doc = nlp(text)
#     tags = biluo_tags_from_offsets(doc, annot['entities'])
#     # then convert L->I and U->B to have IOB tags for the tokens in the doc

In [25]:
# X_words

In [22]:
# words = []
# for tag, word in ents:
#     words.append(word)

In [78]:
# doc = nlp(s)

In [26]:
# help(displacy.render)

Help on function render in module spacy.displacy:

render(docs: Union[Iterable[Union[spacy.tokens.doc.Doc, spacy.tokens.span.Span]], spacy.tokens.doc.Doc, spacy.tokens.span.Span], style: str = 'dep', page: bool = False, minify: bool = False, jupyter: Optional[bool] = None, options: Dict[str, Any] = {}, manual: bool = False) -> str
    Render displaCy visualisation.
    
    docs (Union[Iterable[Doc], Doc]): Document(s) to visualise.
    style (str): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Override Jupyter auto-detection.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    RETURNS (str): Rendered HTML markup.
    
    DOCS: https://spacy.io/api/top-level#displacy.render
    USAGE: https://spacy.io/usage/visualizers



In [87]:
# Attempt 
# displacy.render(doc, jupyter=True, style = 'ent')

In [88]:
# help(doc.char_span)

In [89]:
# # import spacy
# # from spacy import displacy

# nlp = spacy.blank('en')
# raw_text = "The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."

# doc = nlp.make_doc(raw_text)
# spans = [[4, 10, "POS"], [838, 853, "ORG"]]   #[870, 888, "POS"], [892, 920, "ORG"], [925, 929, "ENGLEVEL"],          [987, 1002, "SKILL"]
# ents = []
# for span_start, span_end, label in spans:
#     print(span_start, span_end, label)

#     ent = doc.char_span(span_start, span_end, label=label)
    
#     print(ent)
#     if ent is None:
#         continue

#     ents.append(ent)

# doc.ents = ents
# displacy.render(doc, style="ent", jupyter=True)

In [90]:
# len(raw_text)

In [91]:
# doc

In [92]:
# spacy.util.get_words_and_spaces(words, s)

In [121]:
# Upper-cased tag for every token (including 'O'), aligned one-to-one with X_words.
named_ent_tags = [label.upper() for label, _ in ents]

In [122]:
# Check Output
named_ent_tags[:10]

['O', 'B-GEO', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'O']

In [115]:
help(spacy.util.get_words_and_spaces)

Help on function get_words_and_spaces in module spacy.util:

get_words_and_spaces(words: Iterable[str], text: str) -> Tuple[List[str], List[bool]]
    Given a list of words and a text, reconstruct the original tokens and
    return a list of words and spaces that can be used to create a Doc. This
    can help recover destructive tokenization that didn't preserve any
    whitespace information.
    
    words (Iterable[str]): The words.
    text (str): The original text.
    RETURNS (Tuple[List[str], List[bool]]): The words and spaces.



In [116]:
# Check Original Input String
s

'In Palermo, Sicily’s capital, 80 percent of the hospitalized Covid patients are unvaccinated, and a vast majority of those in the I.C.U. have not received a vaccine, said Dr. Renato Costa, the city’s Covid emergency commissioner. Similar rates are observed throughout the region.“If we had a higher vaccination rate,” said Dr. Costa, “our hospitals would be emptier.”Local doctors said the drop in vaccination rates during the month of August was related to the summer holidays, a time when it is more difficult to distribute shots to the region, which has among Italy’s lowest income and education levels.'

In [117]:
# spacy.training.iob_utils.biluo_tags_to_spans

### Create a SpaCy Doc using the custom tags from CRFSuite and the original sequence of input words

In [123]:
from spacy.tokens import Doc
# Build a Doc from the pre-tokenized words and their per-token tag strings
# (e.g. 'B-GEO', 'O'), so displaCy can visualise the CRF predictions.
# NOTE(review): no `spaces` argument is passed, so the original whitespace of
# `s` is not preserved in the Doc — confirm this is acceptable for display.
doc = Doc(nlp.vocab, words= X_words, ents = named_ent_tags)

In [124]:
# X_words

In [125]:
# Render the entity visualisation; with jupyter=False the HTML markup is
# returned as a string (shown as the cell output) instead of being displayed.
displacy.render(doc, style = "ent", jupyter = False)

'<div class="entities" style="line-height: 2.5; direction: ltr">In \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Palermo\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GEO</span>\n</mark>\n , \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Sicily\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">GEO</span>\n</mark>\n s capital , 80 percent of the hospitalized \n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Covid\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem

In [75]:
# [ent for ent in named_ent_tags if len(ent) > 1]