In [1]:
import random
from pathlib import Path
import spacy
from spacy.training.example import Example
from spacy.pipeline.tagger import Tagger
nlp = spacy.load("en_core_web_sm")

In [20]:
def lowercase_sentences(text):
    """Lower-case `text` one sentence at a time.

    The text is run through the module-level ``nlp`` pipeline. Every sentence
    yielded by ``doc.sents`` is lower-cased and followed by a single space, so
    the result ends with a trailing space whenever at least one sentence exists.
    """
    # One "<sentence> " chunk per detected sentence, concatenated in order.
    return "".join(f"{sentence.text.lower()} " for sentence in nlp(text).sents)

In [3]:
def remove_punctuation(text):
    """Return `text` with punctuation tokens dropped.

    Tokens come from the module-level ``nlp`` pipeline; every token whose
    ``is_punct`` flag is set is discarded and the remaining token texts are
    joined with single spaces.
    """
    kept_words = (tok.text for tok in nlp(text) if not tok.is_punct)
    return " ".join(kept_words)

In [4]:
def remove_stopwords(text):
    """Return `text` with stop-word tokens dropped.

    Tokens come from the module-level ``nlp`` pipeline; every token whose
    ``is_stop`` flag is set is skipped and the remaining token texts are
    joined with single spaces.
    """
    content_words = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        content_words.append(tok.text)
    return " ".join(content_words)

In [5]:
# Lower-case the four example sentences in one pass. Only lower-casing is
# applied here; punctuation and stop-word removal are not run on these texts.
sent1, sent2, sent3, sent4 = [
    lowercase_sentences(raw_sentence)
    for raw_sentence in (
        "USTHB is an Algerian university",
        "LRIA is a research institute",
        "Cars are fast",
        "Cats and pillows",
    )
]

In [6]:


# Custom tag set for the toy tagger: each key is a tag label and each value is
# a dict of descriptive attributes for that label.
# Only the keys are consumed by train_pos_tagger (they are registered as tagger
# labels); the attribute dicts are never read in this notebook.
# NOTE(review): 'ORG' is not a Universal POS value — harmless while the values
# stay unused, but confirm before feeding this map to anything else.
TAG_MAP = {
    'N': {'pos': 'NOUN'},
    'V': {'pos': 'VERB'},
    'J': {'pos': 'ADJ'},
    'ORG': {'pos': 'ORG'},
    'NNS': {'pos': 'NOUN', 'number': 'plural'}  # New tag for plural nouns
}



# Training examples as (text, {'tags': [...]}) pairs. Each tag list is meant to
# hold one tag per token of its text, in order (tokens as produced by
# nlp.make_doc inside train_pos_tagger). sent1..sent4 are the lower-cased
# sentences prepared in the previous cell.
TRAIN_DATA = [
    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
    ("Eat blue ham", {'tags': ['V', 'J', 'N']}),
    (sent1, {'tags': ['ORG', 'V', 'N', 'J', 'N']}),
    (sent2, {'tags': ['ORG', 'V','N', 'N', 'N']}),
    (sent3, {'tags': ['NNS', 'V', 'J']}),
    (sent4, {'tags': ['NNS', 'N', 'NNS']})
]

def _tag_and_report(nlp, raw_text, output_dir):
    """Preprocess `raw_text`, then print the tags predicted for it.

    When `output_dir` is a path, the pipeline is first saved there and loaded
    back, so the printed tags come from the serialized model. When it is None,
    the in-memory pipeline is used directly and nothing is written to disk.
    """
    # The preprocessing helpers read the module-level `nlp`, not the pipeline
    # passed in here.
    test_text = remove_stopwords(remove_punctuation(lowercase_sentences(raw_text)))
    print(test_text)

    tagging_nlp = nlp
    if output_dir is not None:
        # parents/exist_ok: also works for nested or already-existing folders.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        tagging_nlp = spacy.load(output_dir)

    print('Tags', [(t.text, t.tag_) for t in tagging_nlp(test_text)])


def train_pos_tagger(lang='en', output_dir=None, n_iter=40):
    """Train a small tagger on TRAIN_DATA and print its tags for two test texts.

    Args:
        lang: Language code passed to ``spacy.blank`` for the new pipeline.
        output_dir: Optional directory (str or Path). When given, the trained
            pipeline is saved there and reloaded before each test prediction.
            When None, nothing is saved and the in-memory pipeline is tested
            (previously the default value crashed on ``output_dir.exists()``).
        n_iter: Number of passes over the training data.

    Returns:
        None. Results are reported through ``print``.
    """
    # Local pipeline; it shadows the module-level `nlp` inside this function only.
    nlp = spacy.blank(lang)

    # Create the tagger and register every custom label (only TAG_MAP's keys are used).
    tagger = nlp.add_pipe('tagger')
    for tag in TAG_MAP:
        tagger.add_label(tag)

    # spaCy v3 name for the deprecated begin_training(); it also creates the
    # optimizer that nlp.update() falls back to when no `sgd` is passed.
    nlp.initialize()

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its order.
    train_data = list(TRAIN_DATA)
    for _ in range(n_iter):
        random.shuffle(train_data)
        losses = {}

        for text, annotations in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, {"tags": annotations['tags']})
            nlp.update([example], drop=0.5, losses=losses)

    if output_dir is not None:
        output_dir = Path(output_dir)

    for raw_text in ("i go to USTHB", "a cat and cars are fast"):
        _tag_and_report(nlp, raw_text, output_dir)


# NOTE(review): absolute, machine-specific path — this call only works where
# that folder can be created; consider a path relative to the notebook.
train_pos_tagger(output_dir="C:/Users/Lenovo/Documents/M1S1/TAL/TP")



usthb
Saved model to C:\Users\Lenovo\Documents\M1S1\TAL\TP
Loading from C:\Users\Lenovo\Documents\M1S1\TAL\TP
Tags [('usthb', 'ORG')]
cat cars fast
Saved model to C:\Users\Lenovo\Documents\M1S1\TAL\TP
Loading from C:\Users\Lenovo\Documents\M1S1\TAL\TP
Tags [('cat', 'NNS'), ('cars', 'V'), ('fast', 'J')]


## Training our own NER Tagger

In [7]:
import plac
import random
from pathlib import Path
import spacy

# Load the pretrained English pipeline into the module-level `nlp`.
nlp = spacy.load("en_core_web_sm")
# Training data: (text, annotations) pairs. Each entry under 'entities' is a
# (start_char, end_char, label) triple, e.g. text[7:17] == "Shaka Khan".
# NOTE(review): the next cell defines the same TRAIN_DATA again and rebinds
# `nlp`, so this cell looks redundant — confirm before removing it.
TRAIN_DATA = [
('Who is Shaka Khan?', {
'entities': [(7, 17, 'PERSON')]
}),
('I like London and Berlin.', {
'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
})
]

In [8]:
import random
import warnings
from pathlib import Path
import spacy
from spacy.training.example import Example

# Training data: (text, annotations) pairs. Every entity is a
# (start_char, end_char, label) triple of character offsets into the text.
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]


def _print_predictions(model):
    """Print the entities and per-token entity info `model` predicts on TRAIN_DATA."""
    for text, _ in TRAIN_DATA:
        doc = model(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


# Initialize a blank English pipeline.
# NOTE: this rebinds the module-level `nlp`, so functions defined earlier that
# read the global `nlp` use this pipeline from here on.
nlp = spacy.blank("en")

# Add the built-in NER component, or reuse it when the pipeline already has
# one (previously `ner` stayed undefined in that case).
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.add_pipe('ner', last=True)

# Register every label that occurs in the training annotations
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities", []):
        ner.add_label(ent[2])

# Get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Only train NER
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    # Show warnings for misaligned entity spans once
    warnings.filterwarnings("once", category=UserWarning, module='spacy')

    # Initialize the weights (spaCy v3 name for the deprecated begin_training())
    nlp.initialize()
    for itn in range(100):  # Specify the number of training iterations
        # In-place shuffle: the prediction printouts below follow this order.
        random.shuffle(TRAIN_DATA)
        losses = {}
        # The data set is tiny, so every iteration is one batch with all examples
        examples = [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in TRAIN_DATA
        ]
        nlp.update(examples, drop=0.5, losses=losses)
        # print("Losses", losses)

# Test the trained model
_print_predictions(nlp)

# Save the model to an output directory
output_dir = Path("output_model")
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
_print_predictions(nlp2)


Entities [('Shaka Khan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), ('Khan', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Saved model to output_model
Loading from output_model
Entities [('Shaka Khan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), ('Khan', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]


In [11]:
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

# new entity label
LABEL = "ANIMAL"

# training data: each sentence is listed with the (start, end) character
# offsets of its mentions; an empty list means the sentence has no entity.
# Every offset pair is expanded into a (start, end, LABEL) entity triple.
TRAIN_DATA = [
    (sentence, {"entities": [(start, end, LABEL) for start, end in spans]})
    for sentence, spans in (
        ("Horses are too tall and they pretend to care about your feelings", [(0, 6)]),
        ("Do they bite?", []),
        ("horses are too tall and they pretend to care about your feelings", [(0, 6)]),
        ("horses pretend to care about your feelings", [(0, 6)]),
        ("they pretend to care about your feelings, those horses", [(48, 54)]),
        ("horses?", [(0, 6)]),
    )
]

def main(output_dir, n_iter=30):
    """Train a blank English NER pipeline on TRAIN_DATA for the LABEL entity.

    After training, the pipeline is tested on one sentence, saved to
    `output_dir`, loaded back, and tested again; results are printed.

    Args:
        output_dir: Directory (str or Path) the trained pipeline is saved to.
            It is created, including missing parents, when it does not exist.
        n_iter: Number of passes over the training data.

    Returns:
        None.
    """
    random.seed(0)

    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")

    # Add entity recognizer to model
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label(LABEL)  # add new entity label to entity recognizer

    # spaCy v3 name for the deprecated begin_training(); it also creates the
    # optimizer that nlp.update() falls back to when no `sgd` is passed.
    nlp.initialize()

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy: the module-level TRAIN_DATA keeps its order, so a
    # re-run does not start from whatever order a previous run left behind.
    train_data = list(TRAIN_DATA)

    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        sizes = compounding(1.0, 4.0, 1.001)

        # batch up the examples using spaCy's minibatch
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=sizes):
                # convert the (text, annotations) pairs into Example objects
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=0.35, losses=losses)
            # print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.meta["name"] = "animal"  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)

if __name__ == "__main__":
    main("output_model", n_iter=30)


Created blank 'en' model
Entities in 'Do you like horses?'
ANIMAL horses
Saved model to output_model
Loading from output_model
ANIMAL horses


In [26]:
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

# new entity label
LABEL = "COLOR"

# training data: each sentence is listed with the (start, end) character
# offsets of its colour word. Sentences are lower-cased as they are stored;
# the offsets stay valid because lower-casing these ASCII texts keeps length.
TRAIN_DATA = [
    (sentence.lower(), {"entities": [(start, end, LABEL)]})
    for sentence, start, end in (
        ("The sky was a beautiful shade of blue on a clear summer day", 33, 37),
        ("She painted her room a vibrant shade of red to create a bold accent wall", 40, 43),
        ("The leaves on the trees turned a brilliant gold in the autumn", 43, 47),
        ("His favorite color is green, and he loves spending time in nature", 22, 27),
        ("The dress she wore to the party was a stunning shade of royal blue", 62, 66),
    )
]

def main(output_dir, n_iter=30):
    """Train a blank English NER pipeline on TRAIN_DATA for the LABEL entity.

    After training, the pipeline is tested on one preprocessed sentence, saved
    to `output_dir`, loaded back, and tested again; results are printed.

    Args:
        output_dir: Directory (str or Path) the trained pipeline is saved to.
            It is created, including missing parents, when it does not exist.
        n_iter: Number of passes over the training data.

    Returns:
        None.
    """
    random.seed(0)

    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")

    # Add entity recognizer to model
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label(LABEL)  # add new entity label to entity recognizer

    # spaCy v3 name for the deprecated begin_training(); it also creates the
    # optimizer that nlp.update() falls back to when no `sgd` is passed.
    nlp.initialize()

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy: the module-level TRAIN_DATA keeps its order, so a
    # re-run does not start from whatever order a previous run left behind.
    train_data = list(TRAIN_DATA)

    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        sizes = compounding(1.0, 4.0, 1.001)

        # batch up the examples using spaCy's minibatch
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=sizes):
                # convert the (text, annotations) pairs into Example objects
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=0.35, losses=losses)
            # print("Losses", losses)

    # test the trained model
    # NOTE: remove_stopwords / remove_punctuation run on the module-level `nlp`
    # (whatever pipeline that name is bound to when this is called), not on the
    # pipeline trained above.
    test_text = remove_punctuation(remove_stopwords("Do you like blue?")).lower()
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.meta["name"] = "color"  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)

if __name__ == "__main__":
    main("output_model", n_iter=50)


Created blank 'en' model
Entities in 'like blue'
COLOR blue
Saved model to output_model
Loading from output_model
COLOR blue
