In [13]:
# !python -m spacy download en_core_web_sm

In [4]:
# !pip install spacy

In [16]:
# !pip install sklearn_crfsuite

In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from nltk import pos_tag
import spacy
from spacy import displacy
import sklearn_crfsuite

In [2]:
class SentenceGetter(object):
    """Group a token-level frame into per-sentence lists of (word, POS, tag) triples.

    The frame passed in must provide the columns 'Sentence #', 'Word', 'POS'
    and 'Tag'. After construction:

    * ``grouped`` is the result of grouping by 'Sentence #' and collecting
      each group's rows into a list of triples;
    * ``sentences`` is a plain list of those per-sentence lists, in the order
      the grouped result yields them.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def collect_triples(group):
            # One (word, POS, tag) tuple per row of this sentence's group.
            columns = [group[name].values.tolist() for name in ('Word', 'POS', 'Tag')]
            return list(zip(*columns))

        self.grouped = self.data.groupby('Sentence #').apply(collect_triples)
        self.sentences = list(self.grouped)

In [3]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Each item of ``sent`` carries the word at index 0 and its POS tag at
    index 1. The result always contains features of the token itself; it
    additionally contains features of the previous token (prefixed ``-1:``)
    or a ``'BOS'`` flag, and features of the next token (prefixed ``+1:``)
    or an ``'EOS'`` flag.
    """
    token, tag = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: flag the sentence start, otherwise describe the previous token.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context: flag the sentence end, otherwise describe the next token.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats
def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features
def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels
def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [4]:
import os
os.getcwd()

'/Users/spenno_fr/Projects/nlp-glg-mas/notebooks'

In [5]:
# Location of the pickled CRF NER model.
# NOTE(review): absolute, machine-specific path — consider making it relative
# or configurable before sharing the notebook.
NER_MODEL_PATH = '/Users/spenno_fr/Projects/nlp-glg-mas/models/ner_crf/'
filename = '0.1-maf-crf_ner_model.sav'
#pickle.dump(crf, open(filename, 'wb'))

# Load the model inside a context manager so the file handle is closed
# deterministically instead of being left open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model files from a trusted source.
with open(NER_MODEL_PATH + filename, 'rb') as model_file:
    crf = pickle.load(model_file)

In [6]:
def prep_query(phrase):
    """Turn a raw text phrase into a single-sentence token frame.

    The phrase is split with the regular expression below (runs of word
    characters/apostrophes, or one of ``. , ! ? ;``), the pieces are tagged
    with ``pos_tag``, and the result is returned as a DataFrame with columns
    'Sentence #' (always 'Sentence: 1'), 'Word', 'POS' and 'Tag' (all None).
    """
    tokens = re.findall(r"[\w']+|[.,!?;]", phrase)
    tagged = pos_tag(tokens)
    n_tokens = len(tagged)

    columns = {
        'Sentence #': ['Sentence: 1'] * n_tokens,
        'Word': [token for token, _ in tagged],
        'POS': [tag for _, tag in tagged],
        'Tag': [None] * n_tokens,
    }
    return pd.DataFrame(columns)

In [53]:
# Example queries tried against the CRF model. Previously each one was
# assigned to `s` in turn, so only the final assignment had any effect;
# keeping them in a list makes it explicit which one is active and lets a
# different example be selected by index.
sample_queries = [
    "Donald Trump is a former host on The Apprentice. He is an American businessman and former President.",
    'hello how are you',
    'The Second World War started in 1914 and ended in 1918',
    'The Korean War started in 1939 and ended in 1945',
    'Iraq and Iran were once at war. Saddam Hussein was involved',
    'The World Cup is a quadrennial sporting event. FIFA is the governing body involved.',
    'Biden under pressure over Afghanistan and Covid as approval ratings slide',
    'But the Taliban warned on Monday there would be “consequences” if the US and its allies linger beyond that date.',
    ('Thousands of American troops have poured back into the country to oversee the chaotic airlift of foreigners and '
     'selected Afghans from Kabul airport, and Biden is being called upon to extend a 31 August deadline for full US withdrawal'),
    ('As it leaves Afghanistan in chaos, America’s decline mirrors Britain’s a century ago. It may also invite wider '
     'conflict, warns a historian'),
    ('In March the joint study reported that it was “extremely unlikely” that the virus had been released in a laboratory '
     'accident. Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant '
     'evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger.'),
]

# The last example is the active query, the same value `s` ended up with before.
s = sample_queries[-1]

x = prep_query(s)

In [54]:
# Group the query frame into sentences of (word, POS, tag) triples.
getter_query = SentenceGetter(x)
sentences_query = getter_query.sentences

# One feature sequence per sentence; the words of the first sentence are kept
# so they can be paired with the predicted tags.
X_query = list(map(sent2features, sentences_query))
X_words = [triple[0] for triple in sentences_query[0]]

pred = crf.predict(X_query)

# Pair every predicted tag of the first sentence with its word, then keep
# only the pairs whose tag is not 'O'.
ents = list(zip(pred[0], X_words))
named_ents = [(tag, word) for tag, word in ents if tag != 'O']
named_ents

[('B-tim', 'March'),
 ('B-per', 'Dr'),
 ('I-per', 'Ben'),
 ('I-per', 'Embarek'),
 ('B-gpe', 'Chinese')]

In [30]:
# import en_core_web_sm
# nlp = en_core_web_sm.load()

In [12]:
nlp = spacy.load('en_core_web_sm')
# docs = []
# for text, annot in TRAIN_DATA:
#     doc = nlp(text)
#     tags = biluo_tags_from_offsets(doc, annot['entities'])
#     # then convert L->I and U->B to have IOB tags for the tokens in the doc

In [25]:
# X_words

In [22]:
# Words of the query in order, taken from the (tag, word) pairs in `ents`.
words = [word for _tag, word in ents]

In [31]:
doc = nlp(s)

In [26]:
help(displacy.render)

Help on function render in module spacy.displacy:

render(docs: Union[Iterable[Union[spacy.tokens.doc.Doc, spacy.tokens.span.Span]], spacy.tokens.doc.Doc, spacy.tokens.span.Span], style: str = 'dep', page: bool = False, minify: bool = False, jupyter: Optional[bool] = None, options: Dict[str, Any] = {}, manual: bool = False) -> str
    Render displaCy visualisation.
    
    docs (Union[Iterable[Doc], Doc]): Document(s) to visualise.
    style (str): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Override Jupyter auto-detection.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    RETURNS (str): Rendered HTML markup.
    
    DOCS: https://spacy.io/api/top-level#displacy.render
    USAGE: https://spacy.io/usage/visualizers



In [32]:
# Attempt 
displacy.render(doc, jupyter=True, style = 'ent')

In [42]:
help(doc.char_span)

Help on built-in function char_span:

char_span(...) method of spacy.tokens.doc.Doc instance
    Doc.char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode='strict')
    Create a `Span` object from the slice
            `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
            created.
    
            doc (Doc): The parent document.
            start_idx (int): The index of the first character of the span.
            end_idx (int): The index of the first character after the span.
            label (uint64 or string): A label to attach to the Span, e.g. for
                named entities.
            kb_id (uint64 or string):  An ID from a KB to capture the meaning of a
                named entity.
            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
                the span.
            alignment_mode (str): How character indices are aligned to token
                boundaries. Options: "stric

In [44]:
# Experiment: attach entity spans to a Doc by character offsets and render
# them with displaCy.
# NOTE: this cell rebinds the notebook-level names `nlp`, `doc` and `ents`,
# which other cells in this notebook also use.
# import spacy
# from spacy import displacy

nlp = spacy.blank('en')
raw_text = "The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."

doc = nlp.make_doc(raw_text)
# (start_char, end_char, label) triples. The second span starts past the end
# of raw_text, so char_span yields None for it and it is skipped below.
spans = [[4, 10, "POS"], [838, 853, "ORG"]]   #[870, 888, "POS"], [892, 920, "ORG"], [925, 929, "ENGLEVEL"],          [987, 1002, "SKILL"]
ents = []
for span_start, span_end, label in spans:
    print(span_start, span_end, label)

    # char_span returns None when no valid Span can be created from the offsets.
    ent = doc.char_span(span_start, span_end, label=label)
    
    print(ent)
    if ent is None:
        continue

    ents.append(ent)

# Only the spans that could be created become entities of the doc.
doc.ents = ents
displacy.render(doc, style="ent", jupyter=True)

4 10 POS
Indian
838 853 ORG
None


In [43]:
len(raw_text)

261

In [35]:
doc

The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well.

In [62]:
# NOTE: raises ValueError [E194] for the current `s`. The regex in prep_query
# keeps only word characters, apostrophes and . , ! ? ; so the curly quotes
# (“ ”) are missing from `words`, which then cannot be aligned with the text.
spacy.util.get_words_and_spaces(words, s)

ValueError: [E194] Unable to aligned mismatched text 'In March the joint study reported that it was “extremely unlikely” that the virus had been released in a laboratory accident. Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger.' and words '['In', 'March', 'the', 'joint', 'study', 'reported', 'that', 'it', 'was', 'extremely', 'unlikely', 'that', 'the', 'virus', 'had', 'been', 'released', 'in', 'a', 'laboratory', 'accident', '.', 'Dr', 'Ben', 'Embarek', 'revealed', 'that', 'this', 'conclusion', 'did', 'not', 'come', 'from', 'a', 'balanced', 'assessment', 'of', 'all', 'the', 'relevant', 'evidence', 'but', 'from', 'a', 'steadfast', 'refusal', 'by', 'the', 'Chinese', 'members', 'of', 'the', 'joint', 'study', 'to', 'support', 'anything', 'stronger', '.']'.

In [73]:
# Predicted tags for the first query sentence, one per word. The name is
# actually assigned here because a later cell passes `named_ent_tags` to
# `Doc(...)`. The tags are read straight from `pred` rather than from `ents`,
# since another cell rebinds `ents` to a list of spaCy spans.
named_ent_tags = list(pred[0])
named_ent_tags

['O',
 'B-tim',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [57]:
s

'In March the joint study reported that it was “extremely unlikely” that the virus had been released in a laboratory accident. Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger.'

In [60]:
help(spacy.util.get_words_and_spaces)

Help on function get_words_and_spaces in module spacy.util:

get_words_and_spaces(words: Iterable[str], text: str) -> Tuple[List[str], List[bool]]
    Given a list of words and a text, reconstruct the original tokens and
    return a list of words and spaces that can be used to create a Doc. This
    can help recover destructive tokenization that didn't preserve any
    whitespace information.
    
    words (Iterable[str]): The words.
    text (str): The original text.
    RETURNS (Tuple[List[str], List[bool]]): The words and spaces.



In [64]:
s

'In March the joint study reported that it was “extremely unlikely” that the virus had been released in a laboratory accident. Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger.'

In [None]:
spacy.training.iob_utils.biluo_tags_to_spans

In [69]:
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=words, ents=named_ent_tags)

In [70]:
doc

In March the joint study reported that it was extremely unlikely that the virus had been released in a laboratory accident . Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger . 

In [74]:
displacy.render(doc, style = "ent", jupyter = False)

'<div class="entities" style="line-height: 2.5; direction: ltr">In \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    March\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">tim</span>\n</mark>\n the joint study reported that it was extremely unlikely that the virus had been released in a laboratory accident . \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Dr Ben Embarek\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">per</span>\n</mark>\n revealed that this conclusion did not come from a balanced assessment of all the relevant evidence but from a steadfast refusal by the \n<mark class="entity" style="background: #feca74; padding: 0.45em 0.

In [75]:
# [ent for ent in named_ent_tags if len(ent) > 1]