# 04 - Model Tuning 🔧

## Problem

How can we generate short descriptions including a list of popular beer styles from brewery reviews with natural language processing (NLP)? 

NOTE: This is similar to how Google Maps provides short blurbs for businesses such as “From scratch, Northern Italian dining.” 

For example, "Spacious warehouse brewery with daily food trucks. Allows dogs. Features IPAs, Hazy IPAs, and high-gravity stouts."

## Notebook Objectives

* Evaluate at least 3 models with metrics

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import json
import time
import math
from pprint import pprint

import random
random.seed(42)

# Progress bar
# from tqdm import tqdm, trange
from tqdm.notebook import tqdm, trange

# NLP Toolkit
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding, decaying

# Set default plot size
plt.rcParams['figure.figsize'] = (10, 5)

%matplotlib inline

In [2]:
# Q: unused?
def load_doccano_data(file_path):
    '''Load Doccano JSONL annotations as a list of spaCy-style training tuples.

    Every non-blank line of the file is parsed as one JSON record. A record is
    skipped only when it has no labels AND no annotation approver; every other
    record becomes ``(text, {'entities': [(start, end, label), ...]})``.

    Args:
        file_path: Path to the Doccano JSONL export.

    Returns:
        List of ``(text, {'entities': [...]})`` tuples in file order.
    '''
    data = []
    with open(file_path, encoding='utf-8') as file:
        # Iterate the handle directly instead of materialising every line first
        for line in file:
            # A blank line (e.g. a trailing newline) is not a JSON record
            if not line.strip():
                continue
            annot_data = json.loads(line)
            raw_labels = annot_data.get('labels') or []

            # Ignore annotation data without labels and without approval.
            # A missing 'annotation_approver' key is treated as "not approved".
            if not raw_labels and annot_data.get('annotation_approver') is None:
                continue

            # Generate list of entity tuples (first three fields of each label)
            ents = [tuple(entity[:3]) for entity in raw_labels]

            # Append tuples of raw text and entities
            data.append((annot_data['text'], {'entities': ents}))
    print(f"Number of examples: {len(data)}")
    return data

In [3]:
# Q: unused?
def save_spacy_train_data(data, output_path):
    '''Write the string representation of the spaCy data to a file.

    Args:
        data: Object to persist; it is serialised with ``str(data)``.
        output_path: Destination file path (overwritten if it exists).
    '''
    # The original opened the undefined name `file_path`, which raised
    # NameError on every call; the destination is the `output_path` parameter.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(str(data))
    print(f"Wrote spaCy training data to {output_path}")

In [4]:
def load_doccano_labels(file_path):
    '''Return the label names stored in a Doccano label export.

    The file is parsed as JSON and iterated; the ``'text'`` value of every
    item is collected in file order.
    '''
    with open(file_path) as handle:
        label_records = json.load(handle)

    names = []
    for record in label_records:
        names.append(record['text'])
    return names

In [5]:
def update_model_with_labels(model, labels, output_path):
    '''Load a spaCy model, add NER labels to it and save the result to disk.

    Args:
        model: Name or path handed to ``spacy.load``.
        labels: Iterable of label names added to the pipeline's "ner" component.
        output_path: Directory the updated pipeline is written to.
    '''
    pipeline = spacy.load(model)
    print(f"Loaded model '{model}'.")

    # Labels are registered on the existing entity recognizer component
    entity_recognizer = pipeline.get_pipe("ner")
    for new_label in labels:
        entity_recognizer.add_label(new_label)
        print(f"Added label '{new_label}'.")

    # Persist the whole pipeline, not just the NER component
    pipeline.to_disk(output_path)
    print(f"Wrote base model to {output_path}.")

In [6]:
def doccano_to_ner_jsonl(file_path, output_path, labels):
    '''Convert Doccano JSONL to NER JSONL.

    Every non-blank input line is parsed as one JSON record. Records with no
    labels and no annotation approver are dropped. For the rest, each Doccano
    label ``[start, end, label]`` whose label name is in ``labels`` becomes
    ``{"start": ..., "end": ..., "label": ...}`` and the record is written as
    ``{"entities": [...], "text": ...}``, one JSON object per line.

    Args:
        file_path: Path to the Doccano JSONL export.
        output_path: Path of the NER JSONL file to write (overwritten).
        labels: Collection of label names to keep; other labels are discarded.
    '''
    data = []
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            # A blank line (e.g. a trailing newline) is not a JSON record
            if not line.strip():
                continue
            annot_data = json.loads(line)

            # Look the key up on the parsed record; the original tested the raw
            # line text for the substring "labels" instead.
            raw_labels = annot_data.get("labels") or []

            # Ignore annotations without labels and no approval.
            # A missing 'annotation_approver' key is treated as "not approved".
            if not raw_labels and annot_data.get('annotation_approver') is None:
                continue

            # Reorganize the entities, keeping only the requested labels
            entities = [
                {"start": entity[0], "end": entity[1], "label": entity[2]}
                for entity in raw_labels
                if entity[2] in labels
            ]

            # Append line
            data.append(json.dumps({"entities": entities, "text": annot_data["text"]}))

    # Write NER JSONL file: one record per line, nothing at all when no record
    # survived (the original wrote a single blank line in that case)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(record + "\n" for record in data)

    print(f"Wrote NER JSONL to {output_path}")

In [7]:
def train_dev_split_ner_jsonl(file_path, train_size=0.8):
    '''Split a JSONL file into shuffled train and dev files and save them.

    The records are shuffled with the module-level ``random`` generator, so the
    split is reproducible only if the caller seeds ``random`` beforehand.

    Args:
        file_path: Path to the JSONL file to split.
        train_size: Fraction of records assigned to the training file; the
            count is ``floor(train_size * number_of_records)``.

    Returns:
        Tuple ``(train_file_path, dev_file_path)`` of the files written. The
        names are derived from ``file_path`` by inserting ``-train-<n>`` /
        ``-dev-<n>`` before the trailing ``.jsonl``.
    '''
    # Read JSONL lines. Blank lines are not records, and every record is given
    # exactly one trailing newline so that a final line lacking one cannot be
    # glued onto another record once the order is shuffled.
    with open(file_path, encoding='utf-8') as file:
        data = [line.rstrip("\n") + "\n" for line in file if line.strip()]

    # Randomize data
    random.shuffle(data)

    # Calculate split point
    n = math.floor(train_size * len(data))

    # Split the data
    train_data = data[:n]
    dev_data = data[n:]

    # Generate filenames based on type and sample size. Only a trailing
    # '.jsonl' is treated as the extension: str.replace would also rewrite
    # '.jsonl' inside directory names, and would return the input path itself
    # (overwriting the source file) when the extension is absent.
    suffix = '.jsonl'
    stem = file_path[:-len(suffix)] if file_path.endswith(suffix) else file_path
    train_file_path = f'{stem}-train-{len(train_data)}{suffix}'
    dev_file_path = f'{stem}-dev-{len(dev_data)}{suffix}'

    # Write files
    with open(train_file_path, 'w', encoding='utf-8') as file:
        file.write("".join(train_data))
    print(f"Wrote NER JSONL training data to {train_file_path} ({len(train_data)} samples)")

    with open(dev_file_path, 'w', encoding='utf-8') as file:
        file.write("".join(dev_data))
    print(f"Wrote NER JSONL dev data to {dev_file_path} ({len(dev_data)} samples)")

    return train_file_path, dev_file_path

In [8]:
# Shared variables
# These names are notebook globals read by the cells below (including the
# %%bash magics, which receive them as positional arguments).
# Doccano export containing the annotated reviews
annotations_filename = "../data/annotations/final-training-set/reviews-annotated-ner-200-p2.jsonl"
# Doccano label definitions; only each label's 'text' field is used
labels_filename = "../data/annotations/final-training-set/labels.json"
# Directory passed to `spacy convert` as the output location
output_path = "../data/annotations/final-training-set/"
labels = load_doccano_labels(labels_filename)
print(labels)

['BEER_STYLE', 'LOCATION', 'BREWERY', 'FEATURE']


## Save Base Models for Transfer Learning

### `en_core_web_sm`

In [9]:
# Save a copy of en_core_web_sm with the project labels added to its NER component.
# sm_model_path is passed as the base model (-b) of the v2 training run further down.
sm_model = "en_core_web_sm"
sm_model_path = "../models/base_model_sm"
update_model_with_labels(sm_model, labels, sm_model_path)

Loaded model 'en_core_web_sm'.
Added label 'BEER_STYLE'.
Added label 'LOCATION'.
Added label 'BREWERY'.
Added label 'FEATURE'.
Wrote base model to ../models/base_model_sm.


### `en_core_web_md`

In [10]:
# Save a copy of en_core_web_md with the project labels added to its NER component.
# md_model_path is passed as the base model (-b) of the v3 and v4 training runs further down.
md_model = "en_core_web_md"
md_model_path = "../models/base_model_md"
update_model_with_labels(md_model, labels, md_model_path)

Loaded model 'en_core_web_md'.
Added label 'BEER_STYLE'.
Added label 'LOCATION'.
Added label 'BREWERY'.
Added label 'FEATURE'.
Wrote base model to ../models/base_model_md.


## Split Train and Dev

In [11]:
# Convert Doccano JSONL to NER JSONL
# The output is written next to the input with a '-ner' suffix before the extension;
# entities whose label is not in `labels` are dropped by the converter.
ner_jsonl_file_path = annotations_filename.replace('.jsonl','-ner.jsonl')
doccano_to_ner_jsonl(annotations_filename, ner_jsonl_file_path, labels)

Wrote NER JSONL to ../data/annotations/final-training-set/reviews-annotated-ner-200-p2-ner.jsonl


In [12]:
# Split data into training and dev
# train_size=0.8 puts floor(0.8 * n) shuffled records in the training file and the rest in dev.
# The shuffle uses the global `random` state seeded in the imports cell.
train_file_path, dev_file_path = train_dev_split_ner_jsonl(ner_jsonl_file_path, train_size=0.8)

Wrote NER JSONL training data to ../data/annotations/final-training-set/reviews-annotated-ner-200-p2-ner-train-160.jsonl (160 samples)
Wrote NER JSONL dev data to ../data/annotations/final-training-set/reviews-annotated-ner-200-p2-ner-dev-40.jsonl (40 samples)


In [13]:
%%bash -s "$train_file_path" "$output_path"
# Convert the training NER JSONL (arg 1) to spaCy's JSON training format in directory arg 2.
# Positional parameters are quoted so paths containing spaces or glob characters
# reach spaCy as single arguments.
python -m spacy convert "$1" "$2" --lang en

[38;5;2m✔ Generated output file (160 documents):
../data/annotations/final-training-set/reviews-annotated-ner-200-p2-ner-train-160.json[0m


In [14]:
%%bash -s "$dev_file_path" "$output_path"
# Convert the dev NER JSONL (arg 1) to spaCy's JSON training format in directory arg 2.
# Positional parameters are quoted so paths containing spaces or glob characters
# reach spaCy as single arguments.
python -m spacy convert "$1" "$2" --lang en

[38;5;2m✔ Generated output file (40 documents):
../data/annotations/final-training-set/reviews-annotated-ner-200-p2-ner-dev-40.json[0m


In [15]:
# Paths of the files generated by the two `spacy convert` cells above: the recorded
# output shows the same file names with a '.json' extension in the same directory.
spacy_train_file_path = train_file_path.replace('.jsonl', '.json')
spacy_dev_file_path = dev_file_path.replace('.jsonl', '.json')

## Model v1 - Blank Model

In [16]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path"
# Validate the train (arg 1) and dev (arg 2) corpora for an NER-only pipeline
# starting from a blank English model.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy debug-data en "$1" "$2" --pipeline ner

[1m
[2K[38;5;2m✔ Corpus is loadable[0m
[1m
Training pipeline: ner
Starting with blank model 'en'
160 training docs
40 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[38;5;3m⚠ Low number of examples to train from a blank model (160)[0m
[1m
[38;5;4mℹ 14768 total words in the data (2755 unique)[0m
[38;5;4mℹ No word vectors present in the model[0m
[1m
[38;5;4mℹ 4 new labels, 0 existing labels[0m
0 missing values (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities consisting of or starting/ending with punctuation[0m
[1m
[38;5;2m✔ 6 checks passed[0m


In [60]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path" "../models/v1"
# Train an NER-only pipeline from a blank English model.
# arg 1 = training data, arg 2 = dev data, arg 3 = output directory for the model.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy train en "$3" "$1" "$2" -p ner

[38;5;2m✔ Created output directory: ../models/v1[0m
Training pipeline: ['ner']
Starting with blank model 'en'
Counting training words (limit=0)

Itn  NER Loss   NER P   NER R   NER F   Token %  CPU WPS
---  ---------  ------  ------  ------  -------  -------
  1   2308.440  100.000   0.654   1.299  100.000    29609
  2   1444.723  43.396  15.033  22.330  100.000    23903
  3   3534.244  52.055  24.837  33.628  100.000    29594
  4   1086.787  49.474  30.719  37.903  100.000    28335
  5   1353.483  54.545  31.373  39.834  100.000    29403
  6    926.315  56.863  37.908  45.490  100.000    26456
  7    685.207  58.036  42.484  49.057  100.000    28319
  8    618.671  56.522  42.484  48.507  100.000    28431
  9    635.116  58.824  45.752  51.471  100.000    28669
 10    430.208  57.258  46.405  51.264  100.000    27540
 11    484.619  57.480  47.712  52.143  100.000    31129
 12    332.482  58.065  47.059  51.986  100.000    24187
 13    466.704  57.724  46.405  51.449  100.000    241

                                                       

In [63]:
%%bash -s "$spacy_dev_file_path" "../models/v1/model-best" "../reports/evaluations/"
# Evaluate the best v1 model (arg 2) on the dev data (arg 1) and render up to 10
# displaCy visualisations as HTML into directory arg 3.
# NOTE(review): every model's evaluation cell targets the same displaCy directory;
# confirm earlier renders are not overwritten.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy evaluate "$2" "$1" --displacy-path "$3" --displacy-limit 10

[1m

Time      0.16 s
Words     4226  
Words/s   27195 
TOK       100.00
POS       0.00  
UAS       0.00  
LAS       0.00  
NER P     58.40 
NER R     47.71 
NER F     52.52 
Textcat   0.00  

[38;5;2m✔ Generated 10 parses as HTML[0m
../reports/evaluations




## Model v2 - Transfer Learning with `en_core_web_sm`

In [19]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path" "$sm_model_path"
# Validate the train (arg 1) and dev (arg 2) corpora for an NER-only pipeline
# that starts from the saved en_core_web_sm base model (arg 3).
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy debug-data en "$1" "$2" --base-model "$3" --pipeline ner

[1m
[2K[38;5;2m✔ Corpus is loadable[0m
[1m
Training pipeline: ner
Starting with base model '../models/base_model_sm'
160 training docs
40 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[1m
[38;5;4mℹ 14768 total words in the data (2755 unique)[0m
[38;5;4mℹ No word vectors present in the model[0m
[1m
[38;5;4mℹ 0 new labels, 4 existing labels[0m
0 missing values (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities consisting of or starting/ending with punctuation[0m
[1m
[38;5;2m✔ 6 checks passed[0m


In [22]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path" "$sm_model_path" "../models/v2"
# Train the NER component starting from the saved en_core_web_sm base model.
# arg 1 = training data, arg 2 = dev data, arg 3 = base model, arg 4 = output directory.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy train en "$4" "$1" "$2" -b "$3" -p ner

[38;5;2m✔ Created output directory: ../models/v2[0m
Training pipeline: ['ner']
Starting with base model '../models/base_model_sm'
Extending component from base model 'ner'
Counting training words (limit=0)

Itn  NER Loss   NER P   NER R   NER F   Token %  CPU WPS
---  ---------  ------  ------  ------  -------  -------
  1   1435.517   0.000   0.000   0.000  100.000    22321
  2   1511.661  57.895  14.379  23.037  100.000    25264
  3    941.415  57.955  33.333  42.324  100.000    24755
  4    821.753  61.616  39.869  48.413  100.000    18656
  5    741.171  61.111  43.137  50.575  100.000    24624
  6    711.275  60.504  47.059  52.941  100.000    22703
  7    614.466  62.295  49.673  55.273  100.000    21533
  8    499.789  62.121  53.595  57.544  100.000    21639
  9    512.802  62.319  56.209  59.107  100.000    22672
 10    393.317  62.774  56.209  59.310  100.000    18354
 11    294.463  62.500  55.556  58.824  100.000    25319
 12    257.461  61.702  56.863  59.184  100.000   

                                                       

In [59]:
%%bash -s "$spacy_dev_file_path" "../models/v2/model-best" "../reports/evaluations/"
# Evaluate the best v2 model (arg 2) on the dev data (arg 1) and render up to 10
# displaCy visualisations as HTML into directory arg 3.
# NOTE(review): every model's evaluation cell targets the same displaCy directory;
# confirm earlier renders are not overwritten.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy evaluate "$2" "$1" --displacy-path "$3" --displacy-limit 10

[1m

Time      0.46 s
Words     4226  
Words/s   9212  
TOK       100.00
POS       0.00  
UAS       0.00  
LAS       0.00  
NER P     62.76 
NER R     59.48 
NER F     61.07 
Textcat   0.00  

[38;5;2m✔ Generated 10 parses as HTML[0m
../reports/evaluations


## Model v3 - Transfer Learning with `en_core_web_md`

In [25]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path" "$md_model_path"
# Validate the train (arg 1) and dev (arg 2) corpora for an NER-only pipeline
# that starts from the saved en_core_web_md base model (arg 3).
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy debug-data en "$1" "$2" --base-model "$3" --pipeline ner

[1m
[2K[38;5;2m✔ Corpus is loadable[0m
[1m
Training pipeline: ner
Starting with base model '../models/base_model_md'
160 training docs
40 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[1m
[38;5;4mℹ 14768 total words in the data (2755 unique)[0m
[38;5;4mℹ 20000 vectors (684830 unique keys, 300 dimensions)[0m
[38;5;3m⚠ 75 words in training data without vectors (0.01%)[0m
[1m
[38;5;4mℹ 0 new labels, 4 existing labels[0m
0 missing values (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities consisting of or starting/ending with punctuation[0m
[1m
[38;5;2m✔ 6 checks passed[0m


In [26]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path" "$md_model_path" "../models/v3"
# Train the NER component starting from the saved en_core_web_md base model.
# arg 1 = training data, arg 2 = dev data, arg 3 = base model, arg 4 = output directory.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy train en "$4" "$1" "$2" -b "$3" -p ner

[38;5;2m✔ Created output directory: ../models/v3[0m
Training pipeline: ['ner']
Starting with base model '../models/base_model_md'
Extending component from base model 'ner'
Counting training words (limit=0)

Itn  NER Loss   NER P   NER R   NER F   Token %  CPU WPS
---  ---------  ------  ------  ------  -------  -------
  1   1418.108   0.000   0.000   0.000  100.000    19614
  2   1064.884  61.765  13.725  22.460  100.000    29505
  3    976.129  58.824  32.680  42.017  100.000    29090
  4    823.943  64.000  41.830  50.593  100.000    27963
  5    730.364  63.810  43.791  51.938  100.000    27426
  6    639.481  62.609  47.059  53.731  100.000    28939
  7    546.394  58.197  46.405  51.636  100.000    29730
  8    467.865  56.738  52.288  54.422  100.000    26055
  9    508.672  57.343  53.595  55.405  100.000    29452
 10    506.857  58.741  54.902  56.757  100.000    29517
 11    369.919  60.993  56.209  58.503  100.000    27793
 12    285.529  60.140  56.209  58.108  100.000   

                                                       

In [64]:
%%bash -s "$spacy_dev_file_path" "../models/v3/model-best" "../reports/evaluations/"
# Evaluate the best v3 model (arg 2) on the dev data (arg 1) and render up to 10
# displaCy visualisations as HTML into directory arg 3.
# NOTE(review): every model's evaluation cell targets the same displaCy directory;
# confirm earlier renders are not overwritten.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy evaluate "$2" "$1" --displacy-path "$3" --displacy-limit 10

[1m

Time      0.56 s
Words     4226  
Words/s   7585  
TOK       100.00
POS       0.00  
UAS       0.00  
LAS       0.00  
NER P     62.94 
NER R     58.82 
NER F     60.81 
Textcat   0.00  

[38;5;2m✔ Generated 10 parses as HTML[0m
../reports/evaluations


## Model v4 - Transfer Learning with `en_core_web_md` and 50 epochs

In [50]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path" "$md_model_path"
# Validate the train (arg 1) and dev (arg 2) corpora against the en_core_web_md
# base model (arg 3); same command and inputs as the v3 check.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy debug-data en "$1" "$2" --base-model "$3" --pipeline ner

[1m
[2K[38;5;2m✔ Corpus is loadable[0m
[1m
Training pipeline: ner
Starting with base model '../models/base_model_md'
160 training docs
40 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[1m
[38;5;4mℹ 14768 total words in the data (2755 unique)[0m
[38;5;4mℹ 20000 vectors (684830 unique keys, 300 dimensions)[0m
[38;5;3m⚠ 75 words in training data without vectors (0.01%)[0m
[1m
[38;5;4mℹ 0 new labels, 4 existing labels[0m
0 missing values (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities consisting of or starting/ending with punctuation[0m
[1m
[38;5;2m✔ 6 checks passed[0m


In [51]:
%%bash -s "$spacy_train_file_path" "$spacy_dev_file_path" "$md_model_path" "../models/v4"
# Train the NER component from the saved en_core_web_md base model, with -n 50
# to request 50 training iterations (the "50 epochs" of this section's heading).
# arg 1 = training data, arg 2 = dev data, arg 3 = base model, arg 4 = output directory.
# NOTE(review): the recorded output below stops at iteration 12 and repeats the v3
# log values; confirm this cell was re-run to completion.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy train en "$4" "$1" "$2" -b "$3" -p ner -n 50

[38;5;2m✔ Created output directory: ../models/v4[0m
Training pipeline: ['ner']
Starting with base model '../models/base_model_md'
Extending component from base model 'ner'
Counting training words (limit=0)

Itn  NER Loss   NER P   NER R   NER F   Token %  CPU WPS
---  ---------  ------  ------  ------  -------  -------
  1   1418.108   0.000   0.000   0.000  100.000    25167
  2   1064.884  61.765  13.725  22.460  100.000    25311
  3    976.129  58.824  32.680  42.017  100.000    29120
  4    823.943  64.000  41.830  50.593  100.000    27769
  5    730.364  63.810  43.791  51.938  100.000    29287
  6    639.481  62.609  47.059  53.731  100.000    28351
  7    546.394  58.197  46.405  51.636  100.000    29786
  8    467.865  56.738  52.288  54.422  100.000    27194
  9    508.672  57.343  53.595  55.405  100.000    28181
 10    506.857  58.741  54.902  56.757  100.000    28757
 11    369.919  60.993  56.209  58.503  100.000    29690
 12    285.529  60.140  56.209  58.108  100.000   

                                                       

In [65]:
%%bash -s "$spacy_dev_file_path" "../models/v4/model-best" "../reports/evaluations/"
# Evaluate the best v4 model (arg 2) on the dev data (arg 1) and render up to 10
# displaCy visualisations as HTML into directory arg 3.
# NOTE(review): the recorded NER P/R/F below are identical to the v2 evaluation;
# confirm the saved output is not stale.
# NOTE(review): every model's evaluation cell targets the same displaCy directory;
# confirm earlier renders are not overwritten.
# Positional parameters are quoted so paths containing spaces are not word-split.
python -m spacy evaluate "$2" "$1" --displacy-path "$3" --displacy-limit 10

[1m

Time      0.47 s
Words     4226  
Words/s   9073  
TOK       100.00
POS       0.00  
UAS       0.00  
LAS       0.00  
NER P     62.76 
NER R     59.48 
NER F     61.07 
Textcat   0.00  

[38;5;2m✔ Generated 10 parses as HTML[0m
../reports/evaluations


## Evaluate Model (manually)

In [38]:
# Set up colors and options for displaCy
# One highlight color per project NER label; `options` is passed to
# displacy.render in the cells below.
colors = {"FEATURE": "#80CBC4", "BEER_STYLE": "#FDD835", "LOCATION": "#C5CAE9", "BREWERY": "#AED581"}
options = {"colors": colors} 

In [54]:
# Load saved model
# (the model-best directory under the v4 training output; `nlp` is used by the cells below)
nlp = spacy.load("../models/v4/model-best")

In [55]:
# Test trained model with real data
# Each line of the file is treated as a separate input (assumes one review per
# line; TODO confirm) and keeps its trailing newline.
with open('../data/raw/reviews_san-diego_half-door-brewing-co.txt') as file:
    reviews = file.readlines()
    
# Run the loaded pipeline on every line and render the predicted entities
# with the label colors configured in `options`
for review in reviews:
    doc = nlp(review)
    displacy.render(doc, style="ent", options=options)

In [56]:
# Test trained model
# Same manual check as the previous cell, on a second review file. Each line is
# treated as a separate input (assumes one review per line; TODO confirm).
with open('../data/raw/reviews_san-diego_alesmith.txt') as file:
    reviews = file.readlines()
    
# Run the loaded pipeline on every line and render the predicted entities
# with the label colors configured in `options`
for review in reviews:
    doc = nlp(review)
    displacy.render(doc, style="ent", options=options)