In [1]:
import plac
import random
from pathlib import Path
import spacy
from spacy.training import Example

## POS Tagging

In [5]:
TAG_MAP = {
    'N': 'NOUN',
    'V': 'VERB',
    'J':'ADJ',
    'NNS' : 'NOUN'
}

TRAIN_DATA = [
    ("I like green eggs", {'tags': ['N', 'V', 'J', 'NNS']}), 
    ("Eat blue ham", {'tags': ['V', 'J', 'N']}), 
]

In [8]:
def main(lang='en', output_dir=None, n_iter=3):

    nlp = spacy.blank(lang) #Créer un modèle spacy vide pour l'anglais
    tagger = nlp.add_pipe("tagger") #Créer un modèle (tagger) qui attribue des tags aux mots d'un texte 

    for item in TAG_MAP:
        tagger.add_label(item) #Ajouter des tags aux tagger

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(losses)

    test_text = "I love cats"
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])


In [9]:
main()

{'tagger': 5.29118800163269}
{'tagger': 4.324729919433594}
{'tagger': 2.966841697692871}
Tags [('I', 'N', ''), ('love', 'V', ''), ('cats', 'NNS', '')]


## NER Tagging

In [10]:
# training data
TRAIN_DATA = [
('Who is Shaka Khan?', {
    'entities': [(7, 17, 'PERSON')]
}),
('I like London and Berlin.', {
    'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
}),
 ('I am learning Python', {
    'entities': [(14, 20, 'TECH')]
})
]

In [11]:
def main(model=None, output_dir=None, n_iter=100):
    
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
        
        if 'ner' not in nlp.pipe_names:
            ner = nlp.add_pipe('ner', last=True)
    # otherwise, get it so we can add labels
        else:
            ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    
    # get names of other pipes to disable them during training 
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update(
                    [example],
                    drop=0.5, # dropout-make it harder to memorise data
                    sgd=optimizer, # callable to update weights
                    losses=losses)
            print(losses)
            
    # test the trained model
    doc = nlp('I love Python')
    print()
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [12]:
main()

Created blank 'en' model
{'ner': 13.661589443683624}
{'ner': 12.780549049377441}
{'ner': 11.718456029891968}
{'ner': 10.261194914579391}
{'ner': 8.436154402792454}
{'ner': 7.5132438987493515}
{'ner': 6.195048194378614}
{'ner': 6.048486293293536}
{'ner': 5.976720348466188}
{'ner': 6.699109001085162}
{'ner': 5.371995489054825}
{'ner': 5.552186889311997}
{'ner': 5.176415246911347}
{'ner': 4.257470318378182}
{'ner': 4.86982959206216}
{'ner': 4.734175256508024}
{'ner': 4.251351436658297}
{'ner': 3.8278260016995773}
{'ner': 3.421245729901784}
{'ner': 3.133431753663899}
{'ner': 5.880396975063604}
{'ner': 4.73313140026039}
{'ner': 4.033965459280807}
{'ner': 3.34562244249355}
{'ner': 3.362248985795304}
{'ner': 1.792185184955997}
{'ner': 1.662567946971194}
{'ner': 4.646769463630051}
{'ner': 1.810014566163602}
{'ner': 2.8037084164944583}
{'ner': 2.328709575316255}
{'ner': 0.8061871854944813}
{'ner': 1.8794366380681624}
{'ner': 1.3388374271534043}
{'ner': 0.0940337553091017}
{'ner': 0.672811179859