## Setting up the work enviroment

In [2]:
# Install the plaidml backend
## DO THIS BEFORE IMPORTING KERAS OR TENSOR TO USE PLAIDML
import plaidml.keras

plaidml.keras.install_backend()

In [3]:
# Help MacOS be able to use Keras
import os
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

# Gets rid of the processor warning.
#os.environ['KMP_DUPLICATE_LIB_OK']='True' # haven't tried yet

In [4]:
# libraries
import itertools
import keras
import matplotlib.pyplot as plt
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
import seaborn as sns
from scipy import stats as st
import spacy
from spacy import displacy
from tqdm import tqdm
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [5]:
# display graphs inside notebooks
%matplotlib inline

In [6]:
# Versions
print("keras version:", keras.__version__)
print("nltk version:", nltk.__version__)
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)
print("plaidml version:", plaidml.__version__)
print("re version:", re.__version__)
print("Seaborn version:", sns.__version__)
print("spacy version:", spacy.__version__)

keras version: 2.2.4
nltk version: 3.4.5
numpy version: 1.18.1
pandas version: 1.0.1
plaidml version: 0.7.0
re version: 2.2.1
Seaborn version: 0.10.0
spacy version: 2.0.12


In [7]:
# Directories & Files
os.listdir()

# Datasets directory
directory = "./datasets/"

## Preparing Datasets

In [10]:
# creating the DataFrames dynamically
# 1st step: store the names and filenames of the files as a key-value pair in a dictionary
datasets = {f"{re.sub('.csv', '', filename.lower())}": filename 
            for filename in os.listdir(directory)}

In [11]:
# checking the datasets dict
datasets

{'.ds_store': '.DS_Store', 'ner': 'ner.csv', 'ner_dataset': 'ner_dataset.csv'}

In [12]:
# deleling '.ds_store': '.DS_Store' from the datasets dic
del datasets[".ds_store"]

# checking the datasets dic
datasets

{'ner': 'ner.csv', 'ner_dataset': 'ner_dataset.csv'}

In [13]:
# 2nd step: for each key in the datasets dictionary, create a DF
for name in datasets:
    print(name)
    globals()[name] = pd.read_csv(directory + datasets[name], encoding = "ISO-8859-1", 
                              error_bad_lines = False)

ner


b'Skipping line 281837: expected 25 fields, saw 34\n'


ner_dataset


In [14]:
# checking the head
ner_dataset.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [16]:
# different ways to check the % of nulls in a df/series
print("Abs by Column:\n", ner_dataset.isna().sum())
print("\n% by Column:\n", ner_dataset.isna().mean().round(4) * 100)
print("\n% by Total:", round((ner_dataset.isnull().any(axis = 1).sum() / len(ner_dataset)) 
                                * 100, 2))

Abs by Column:
 Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

% by Column:
 Sentence #    95.43
Word           0.00
POS            0.00
Tag            0.00
dtype: float64

% by Total: 95.43


In [17]:
# checking uniques in Sentence #
ner_dataset["Sentence #"].unique()

# value counts in Sentence #
ner_dataset["Sentence #"].value_counts()

Sentence: 23021    1
Sentence: 31559    1
Sentence: 9781     1
Sentence: 18749    1
Sentence: 3784     1
                  ..
Sentence: 21353    1
Sentence: 4722     1
Sentence: 17992    1
Sentence: 27693    1
Sentence: 38734    1
Name: Sentence #, Length: 47959, dtype: int64

In [18]:
# after looking at it and reading about it, I figured that when Sentence # is NaN, the 
# corresponding Word in that row belongs to the previous sentence. As such, the foward fill
# method is an approriate (ideal even?) way to deal with the NaN's.
ner_dataset

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [19]:
# Foward-filling NaN's in the dataset
ner_dataset.fillna(method = "ffill", inplace = True)

In [20]:
# Checking if there are still NaN's
ner_dataset.isna().sum()
ner_dataset.isnull().sum()

# Checking the ner_dataset
ner_dataset

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [21]:
# checking unique tags value counts
ner_dataset["Tag"].value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [22]:
# checking unique tags value counts %
ner_dataset["Tag"].value_counts() / ner_dataset["Tag"].value_counts().sum()

O        0.846776
B-geo    0.035900
B-tim    0.019391
B-org    0.019210
I-per    0.016452
B-per    0.016203
I-org    0.016006
B-gpe    0.015135
I-geo    0.007071
I-tim    0.006226
B-art    0.000383
B-eve    0.000294
I-art    0.000283
I-eve    0.000241
B-nat    0.000192
I-gpe    0.000189
I-nat    0.000049
Name: Tag, dtype: float64

In [33]:
%%time
# Creating a NLP object // Loading the model
#nlp = spacy.load("en_core_web_sm") # 11MB
#nlp = spacy.load("en_core_web_md") # 91MB
nlp = spacy.load("en_core_web_lg") # 789MB

CPU times: user 7.18 s, sys: 739 ms, total: 7.92 s
Wall time: 7.99 s


## Building the pipeline

In [34]:
# pipeline component names
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


## Training the model

0) Decide to start w/a pre-trained model an do tranfer learning, or build from scratch;

1) Initialize the model weights randomly with nlp.begin_training;

2) Predict a few examples with the current weights by calling nlp.update;

3) Compare prediction with true labels;

4) Calculate how to change weights to improve predictions;

5) Update weights slightly;

6) Go back to 2.

---

If we're not starting with a pre-trained model, we first initialize the weights randomly.

---

![title](training_in_spacy.svg)



Training data: Examples and their annotations.

Text: The input text the model should predict a label for.

Label: The label the model should predict.

Gradient: How to change the weights.

---

The entity recognizer tags words and phrases in context

Each token can only be part of one entity

Examples need to come with context

("iPhone X is coming", {'entities': [(0, 8, 'GADGET')]})

Texts with no entities are also important

("I need a new phone! Any tips?", {'entities': []})

Goal: teach the model to generalize

---

Because the entity recognizer predicts entities in context, it also needs to be trained on entities and their surrounding context.



It's also very important for the model to learn words that aren't entities.

In this case, the list of span annotations will be empty.

Our goal is to teach the model to recognize new entities in similar contexts, even if they weren't in the training data.

## The Training Loop:
The steps of a training loop:

1) Loop for a number of times.

2) Shuffle the training data.

3) Divide the data into batches.

4) Update the model for each batch.

5) Save the updated model.


---



We usually need to perform it several times, for multiple iterations, so that the model can learn from it effectively. If we want to train for 10 iterations, we need to loop 10 times.



To prevent the model from getting stuck in a suboptimal solution, we randomly shuffle the data for each iteration. This is a very common strategy when doing stochastic gradient descent.



Next, we divide the training data into batches of several examples, also known as minibatching. This makes it easier to make a more accurate estimate of the gradient.



Finally, we update the model for each batch, and start the loop again until we've reached the last iteration.



We can then save the model to a directory and use it in spaCy.


---

The training data are the examples we want to update the model with.



The text should be a sentence, paragraph or longer document. For the best results, it should be similar to what the model will see at runtime.



The label is what we want the model to predict. This can be a text category, or an entity span and its type.



The gradient is how we should change the model to reduce the current error. It's computed when we compare the predicted label to the true label.

In [None]:
#

In [37]:
# Training Best Case Practices
"""
Problem 1: Models can "forget" things

— Existing model can overfit on new data
e.g.: if you only update it with WEBSITE, it can "unlearn" what a PERSON is
— Also known as "catastrophic forgetting" problem.

If you're updating an existing model with new data, especially new labels, it can overfit 
and adjust too much to the new examples.

Solution 1: Mix in previously correct predictions

— For example, if you're training WEBSITE, also include examples of PERSON. 
— Run existing spaCy model over data and extract all other relevant entities
"""
# BAD:
TRAINING_DATA = [
    ('Reddit is a website', {'entities': [(0, 6, 'WEBSITE')]})
]

# GOOD:
TRAINING_DATA = [
    ('Reddit is a website', {'entities': [(0, 6, 'WEBSITE')]}),
    ('Obama is a person', {'entities': [(0, 5, 'PERSON')]})
]

"""
Problem 2: Models can't learn everything

— spaCy's models make predictions based on local context
— Model can struggle to learn if decision is difficult to make based on context
— Label scheme needs to be consistent and not too specific
For example: CLOTHING is better than ADULT_CLOTHING and CHILDRENS_CLOTHING

for named entities, the surrounding words are most important.

Solution 2: Plan your label scheme carefully

— Pick categories that are reflected in local context
— More generic is better than too specific
— Use rules to go from generic labels to specific categories

Before you start training and updating models, it's worth taking a step back and planning 
your label scheme.

Try to pick categories that are reflected in the local context and make them more generic 
if possible.

You can always add a rule-based system later to go from generic to specific.
"""
# BAD:
LABELS = ['ADULT_SHOES', 'CHILDRENS_SHOES', 'BANDS_I_LIKE']

# GOOD:
LABELS = ['CLOTHING', 'BAND']

In [None]:
# Example: Building a training loop
import spacy
import random
import json

with open("exercises/gadgets.json") as f:
    TRAINING_DATA = json.loads(f.read())

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, losses=losses)
        print(losses)

In [None]:
# Example loop
TRAINING_DATA = [
    ("How to preorder the iPhone X", {'entities': [(20, 28, 'GADGET')]})
    # And many more examples...
]
# Loop for 10 iterations
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA):
        # Split the batch in texts and annotations
        texts = [text for text, annotation in batch]
        annotations = [annotation for text, annotation in batch]
        # Update the model
        nlp.update(texts, annotations)

# Save the model
nlp.to_disk(path_to_model)

In [None]:
# Example: creating patterns to quickly bootstrap some training data for our model.
"""
spaCy’s rule-based Matcher is a great way to quickly create training data for named entity 
models. A list of sentences is available as the variable TEXTS. You can print it the IPython
shell to inspect it. We want to find all mentions of different iPhone models, so we can 
create training data to teach a model to recognize them as 'GADGET'.
"""
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/iphone.json") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match 'iphone' and 'x'
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches 'iphone' and an optional digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]

# Add patterns to the matcher
matcher.add("GADGET", None, pattern1, pattern2)

In [None]:
# Example: using the match patterns created above to bootstrap a set of training examples.
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/iphone.json") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
"""
Before you train a model with the data, you always want to double-check that your matcher 
didn't identify any false positives. But this process is still much faster than doing 
*everything* manually.
"""

In [None]:
# Import the English language class
#from spacy.lang.en import English

In [None]:
# framework to build custom pipeline components
def custom_component(doc):
    # Do something to the doc here
    return doc

nlp.add_pipe(custom_component) # first, last, before, after

In [None]:
# example
# Define a custom component
def custom_component(doc):
    # Print the doc's length
    print('Doc length:', len(doc))
    # Return the doc object
    return doc

# Add the component first in the pipeline
nlp.add_pipe(custom_component, first=True)

# Print the pipeline component names
print('Pipeline:', nlp.pipe_names)

In [None]:
# example for rule-based entity matching
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label 'ANIMAL'
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the 'ner' component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

In [35]:
#example: 
"""
create an attribute getter that returns a Wikipedia search URL if the span is a person, 
organization, or location
"""
from spacy.tokens import Span

#nlp = spacy.load("en_core_web_sm")

def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using get getter get_wikipedia_url
Span.set_extension("wikipedia_url", getter = get_wikipedia_url) 

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)

over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [None]:
# Best pratices:
#BAD:
docs = [nlp(text) for text in LOTS_OF_TEXTS]

# GOOD:
docs = list(nlp.pipe(LOTS_OF_TEXTS)) # Much faster than calling nlp on each text

#If you only need a tokenized Doc object, you can use the nlp dot make doc method instead, 
# which takes a text and returns a Doc.
#BAD:
doc = nlp("Hello world")

#GOOD:
doc = nlp.make_doc("Hello world!")

In [None]:
# Disabling pipeline components
#Use nlp.disable_pipes to temporarily disable one or more pipes
# Disable tagger and parser
with nlp.disable_pipes('tagger', 'parser'):
    # Process the text and print the entities
    doc = nlp(text)
    print(doc.ents)
    
"""
After the with block, the disabled pipeline components are automatically restored.
In the with block, spaCy will only run the remaining components.
"""

In [None]:
"""
example of efficient text processing. 
iterating over the doc objects yielded by nlp.pipe.
"""
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/tweets.json") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the adjectives
for doc in nlp.pipe(TEXTS):
    print([token.text for token in doc if token.pos_ == "ADJ"])"])
    
"""
and:
"""
import json
import spacy

nlp = spacy.load("en_core_web_sm")

with open("exercises/tweets.json") as f:
    TEXTS = json.loads(f.read())

# Process the texts and print the entities
docs = list(nlp.pipe(TEXTS))
entities = [doc.ents for doc in docs]
print(*entities)

"""

"""
from spacy.lang.en import English

nlp = English()

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people)) # instead of patterns = [nlp(person) for person in people]

In [23]:
# Creating a document to test
doc = nlp("Augusta Ada King, Countess of Lovelace (née Byron; 10 December 1815 – 27 November 1852) was an English mathematician and writer, chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation, and published the first algorithm intended to be carried out by such a machine. As a result, she is sometimes regarded as the first to recognise the full potential of a 'computing machine' and one of the first computer programmers.[2][3][4]"
          "Augusta Byron was the only legitimate child of poet Lord Byron and his wife Lady Byron.[5] All of Byron's other children were born out of wedlock to other women.[6] Byron separated from his wife a month after Ada was born and left England forever four months later. He commemorated the parting in a poem that begins, 'Is thy face like thy mother's my fair child! ADA! sole daughter of my house and heart?'.[7] He died of disease in the Greek War of Independence when Ada was eight years old. Her mother remained bitter and promoted Ada's interest in mathematics and logic in an effort to prevent her from developing her father's perceived insanity. Despite this, Ada remained interested in Byron, naming her two sons Byron and Gordon. Upon her eventual death, she was buried next to him at her request. Although often ill in her childhood, Ada pursued her studies assiduously. She married William King in 1835. King was made Earl of Lovelace in 1838, Ada thereby becoming Countess of Lovelace.2"          
          "Her educational and social exploits brought her into contact with scientists such as Andrew Crosse, Charles Babbage, Sir David Brewster, Charles Wheatstone, Michael Faraday and the author Charles Dickens, contacts which she used to further her education. Ada described her approach 'as poetical science'[8] and herself as an 'Analyst (& Metaphysician)'.[9]"
          "When she was a teenager, her mathematical talents led her to a long working relationship and friendship with fellow British mathematician Charles Babbage, who is known as 'the father of computers'. She was in particular interested in Babbage's work on the Analytical Engine. Lovelace first met him in June 1833, through their mutual friend, and her private tutor, Mary Somerville."
          "Between 1842 and 1843, Ada translated an article by Italian military engineer Luigi Menabrea on the calculating engine, supplementing it with an elaborate set of notes, simply called Notes. These notes contain what many consider to be the first computer program—that is, an algorithm designed to be carried out by a machine. Other historians reject this perspective and point out that Babbage's personal notes from the years 1836/1837 contain the first programs for the engine.[10] Lovelace's notes are important in the early history of computers. She also developed a vision of the capability of computers to go beyond mere calculating or number-crunching, while many others, including Babbage himself, focused only on those capabilities.[11] Her mindset of 'poetical science' led her to ask questions about the Analytical Engine (as shown in her notes) examining how individuals and society relate to technology as a collaborative tool.[6]"
          "She died of uterine cancer in 1852 at the age of 36."
          "source: https://en.wikipedia.org/wiki/Ada_Lovelace")

# Finding entities
for entity in doc.ents:
    print(entity.text, entity.label_)

Augusta Ada King PERSON
Countess of Lovelace PERSON
Byron PERSON
10 December 1815 – 27 November 1852 DATE
English NORP
Charles Babbage's PERSON
the Analytical Engine ORG
first ORDINAL
first ORDINAL
first ORDINAL
one CARDINAL
first ORDINAL
Byron PERSON
Byron PERSON
Lady Byron.[5 PERSON
Byron PERSON
women.[6 ORG
Byron PERSON
a month DATE
Ada PERSON
England GPE
four months later DATE
ADA ORG
the Greek War of Independence EVENT
Ada PERSON
eight years old DATE
Ada PERSON
Ada PERSON
Byron PERSON
two CARDINAL
Byron PERSON
Gordon PERSON
Ada PERSON
William King PERSON
1835 DATE
Earl of Lovelace PERSON
1838 DATE
Ada PERSON
Andrew Crosse PERSON
Charles Babbage PERSON
David Brewster PERSON
Charles Wheatstone PERSON
Michael Faraday PERSON
Charles Dickens PERSON
Ada PERSON
science'[8 PERSON
British NORP
Charles Babbage PERSON
Babbage ORG
the Analytical Engine ORG
Lovelace PERSON
first ORDINAL
June 1833 DATE
Mary Somerville PERSON
Between 1842 and 1843 DATE
Ada PERSON
Italian NORP
Luigi Menabrea PERS

In [3]:
# displaying the doc w/entities
#displacy.serve(doc, style = "ent") # open http://localhost:5000

# color and backgournd (bg) aren't working?
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}

displacy.render(doc, style = "ent", jupyter = True, options = options) # to read directly in jupyter

In [4]:
lovelace = """Augusta Ada King, Countess of Lovelace (née Byron; 10 December 1815 – 27 
November 1852) was an English mathematician and writer, chiefly known for her work on 
Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. 
She was the first to recognise that the machine had applications beyond pure calculation, 
and published the first algorithm intended to be carried out by such a machine. As a result, she is sometimes regarded as the first to recognise the full potential of a 'computing machine' and one of the first computer programmers.[2][3][4]"
Augusta Byron was the only legitimate child of poet Lord Byron and his wife Lady Byron.[5] 
All of Byron's other children were born out of wedlock to other women.[6] Byron separated 
from his wife a month after Ada was born and left England forever four months later. He 
commemorated the parting in a poem that begins, 'Is thy face like thy mother's my fair 
child! ADA! sole daughter of my house and heart?'.[7] He died of disease in the Greek War 
of Independence when Ada was eight years old. Her mother remained bitter and promoted Ada's 
interest in mathematics and logic in an effort to prevent her from developing her father's 
perceived insanity. Despite this, Ada remained interested in Byron, naming her two sons 
Byron and Gordon. Upon her eventual death, she was buried next to him at her request. 
Although often ill in her childhood, Ada pursued her studies assiduously. She married 
William King in 1835. King was made Earl of Lovelace in 1838, Ada thereby becoming Countess 
of Lovelace.2          

Her educational and social exploits brought her into contact with scientists such as Andrew 
Crosse, Charles Babbage, Sir David Brewster, Charles Wheatstone, Michael Faraday and the 
author Charles Dickens, contacts which she used to further her education. Ada described her 
approach 'as poetical science'[8] and herself as an 'Analyst (& Metaphysician)'.[9]

When she was a teenager, her mathematical talents led her to a long working relationship 
and friendship with fellow British mathematician Charles Babbage, who is known as 'the 
father of computers'. She was in particular interested in Babbage's work on the Analytical 
Engine. Lovelace first met him in June 1833, through their mutual friend, and her private 
tutor, Mary Somerville.

Between 1842 and 1843, Ada translated an article by Italian military engineer Luigi 
Menabrea on the calculating engine, supplementing it with an elaborate set of notes, 
simply called Notes. These notes contain what many consider to be the first computer 
program—that is, an algorithm designed to be carried out by a machine. Other historians 
reject this perspective and point out that Babbage's personal notes from the years 
1836/1837 contain the first programs for the engine.[10] Lovelace's notes are important in 
the early history of computers. She also developed a vision of the capability of computers 
to go beyond mere calculating or number-crunching, while many others, including Babbage 
himself, focused only on those capabilities.[11] Her mindset of 'poetical science' led her 
to ask questions about the Analytical Engine (as shown in her notes) examining how 
individuals and society relate to technology as a collaborative tool.[6]

She died of uterine cancer in 1852 at the age of 36.

source: https://en.wikipedia.org/wiki/Ada_Lovelace"""

In [9]:
# Function to Redact Names
def redact_names(text):
    """
    take a text, convert it to an npl object, merge the id'd entities, check if the token is
    type PERSON, if so, Redact, if not leave it be, and return the final text.
    """
    docx = nlp(text)
    redacted_sentences = list()
    """
    for ent in docx.ents:
        ent.merge()
    """
    
    [ent.merge() for ent in docx.ents]
    
    for token in docx:
        if token.ent_type_ == "PERSON":
            redacted_sentences += "[REDACTED]"
        
        else:
            redacted_sentences += token.string
            
    return "".join(redacted_sentences)

In [10]:
lovelace

'Augusta Ada King, Countess of Lovelace (née Byron; 10 December 1815 – 27 \nNovember 1852) was an English mathematician and writer, chiefly known for her work on \nCharles Babbage\'s proposed mechanical general-purpose computer, the Analytical Engine. \nShe was the first to recognise that the machine had applications beyond pure calculation, \nand published the first algorithm intended to be carried out by such a machine. As a result, she is sometimes regarded as the first to recognise the full potential of a \'computing machine\' and one of the first computer programmers.[2][3][4]"\nAugusta Byron was the only legitimate child of poet Lord Byron and his wife Lady Byron.[5] \nAll of Byron\'s other children were born out of wedlock to other women.[6] Byron separated \nfrom his wife a month after Ada was born and left England forever four months later. He \ncommemorated the parting in a poem that begins, \'Is thy face like thy mother\'s my fair \nchild! ADA! sole daughter of my house and 

In [11]:
# Redacting Names
redact_names(lovelace)

'[REDACTED], [REDACTED](née [REDACTED]; 10 December 1815 – 27 \nNovember 1852) was an English mathematician and writer, chiefly known for her work on \n[REDACTED]proposed mechanical general-purpose computer, the Analytical Engine. \nShe was the first to recognise that the machine had applications beyond pure calculation, \nand published the first algorithm intended to be carried out by such a machine. As a result, she is sometimes regarded as the first to recognise the full potential of a \'computing machine\' and one of the first computer programmers.[2][3][4]"\n[REDACTED]was the only legitimate child of poet Lord [REDACTED]and his wife [REDACTED]] \nAll of [REDACTED]\'s other children were born out of wedlock to other women.[6] [REDACTED]separated \nfrom his wife a month after [REDACTED]was born and left England forever four months later. He \ncommemorated the parting in a poem that begins, \'Is thy face like thy mother\'s my fair \nchild! ADA! sole daughter of my house and heart?\'.

In [13]:
# Can I visualize it with the redaction? -> yes I can
#displacy.render(nlp(redact_names(lovelace)), style = "ent", jupyter = True, options = options)

displacy.serve(nlp(redact_names(lovelace)), style = "ent") # open http://localhost:5000


[93m    Serving on port 5000...[0m
    Using the 'ent' visualizer



127.0.0.1 - - [13/Mar/2020 22:30:58] "GET / HTTP/1.1" 200 21404
127.0.0.1 - - [13/Mar/2020 22:30:59] "GET / HTTP/1.1" 200 21404



    Shutting down server on port 5000.



In [18]:
import joblib

In [19]:
trained_crf = joblib.load("my_crf_trained_model.pkl")

In [20]:
my_model = spacy.load(trained_crf)



OSError: [E050] Can't find model 'CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=10, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.