# Basics

References:
* https://spacy.io/usage/training

## Command-line interface

* `python -m spacy convert input_file [output_dir]`: Convert input_file to spaCy's JSON training format

* `python -m spacy train lang output_dir train_file dev_file`: Train a model and save it to output_dir

* `python -m spacy debug-data lang train_file dev_file`: Validate the format of the training and development data

__Example__:

```bash
# Fetch the Spanish AnCora treebank (CoNLL-U format).
git clone https://github.com/UniversalDependencies/UD_Spanish-AnCora

# Convert the train and dev splits to spaCy's JSON training format.
mkdir -p ancora-json
python -m spacy convert UD_Spanish-AnCora/es_ancora-ud-train.conllu ancora-json
python -m spacy convert UD_Spanish-AnCora/es_ancora-ud-dev.conllu ancora-json

# Shell assignments take no spaces around '=', and the variables must be
# expanded with '$' when they are used.
train_file=ancora-json/es_ancora-ud-train.json
dev_file=ancora-json/es_ancora-ud-dev.json

# Validate the data; debug-data expects the language code as its first argument.
python -m spacy debug-data es "$train_file" "$dev_file" --verbose

# Train a Spanish model and write it to ./models.
mkdir -p models
python -m spacy train es models "$train_file" "$dev_file"
```

## Training NER

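* `optimizer = nlp.begin_training()`: start training a new model (resets and randomly initializes the weights)
* `optimizer = nlp.resume_training()`: continue training an existing model without resetting its weights
* `optimizer = nlp.entity.create_optimizer()`: create an optimizer from the entity recognizer component directly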

In [34]:
import spacy
import random
from spacy.util import minibatch, compounding

# Label of the new entity type that is added to the entity recognizer.
NEW_LABEL = "ANIMAL"

# Each example is (text, {"entities": [(start_char, end_char, label), ...]});
# end offsets are exclusive. The first two examples use labels other than
# NEW_LABEL, and "Do they bite?" has no entities at all.
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),

    ("Horses are too tall and they pretend to care about your feelings", {"entities": [(0, 6, NEW_LABEL)]}),
    ("Do they bite?", {"entities": []}),
    ("horses are too tall and they pretend to care about your feelings", {"entities": [(0, 6, NEW_LABEL)]}),
    ("horses pretend to care about your feelings", {"entities": [(0, 6, NEW_LABEL)]}),
    ("they pretend to care about your feelings, those horses", {"entities": [(48, 54, NEW_LABEL)]}),
    ("horses?", {"entities": [(0, 6, NEW_LABEL)]}),
]

model = 'en'  # model to load; set to None to start from a blank English pipeline
n_iter = 100  # number of passes over TRAIN_DATA
output_dir = './models'  # directory the trained pipeline is saved to

# Load the model:
if model is not None:
    nlp = spacy.load(model)
else:
    nlp = spacy.blank("en")

# Get or create the pipe 'ner':
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

# Add labels to ner (an example without an "entities" key contributes none):
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities", []):
        ner.add_label(ent[2])

# Disable other pipes during training only. Using disable_pipes as a context
# manager restores them afterwards, so the test below and the saved model
# still contain the full pipeline instead of just 'ner'.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    # Set an optimizer.
    if model is None:
        # If we're training a new model, reset and initialize the weights
        # of the model randomly.
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # Train the model:
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}  # filled in by nlp.update for this pass
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.2, losses=losses, sgd=optimizer)
        # Uncomment to follow the loss per pass:
        # print("Epoch{}: Losses {}".format(itn, losses))


# Test the model:
test_text = "Shaka Khan lives in Berlin. Do you like horses?"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)


# Save the model:
nlp.to_disk(output_dir)

# Test the saved model (uncomment to run):
# nlp2 = spacy.load(output_dir)
# for text, _ in TRAIN_DATA:
#     doc = nlp2(text)
#     print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
#     print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Output:

```
Entities in 'Shaka Khan lives in Berlin. Do you like horses?'
PERSON Shaka Khan
LOC Berlin
ANIMAL horses
```
