## **03 Train Model**
This notebook takes the processed reviews and trains a spaCy NLP named entity recognition (NER) model to identify brewery features from user review text.

### **Notebook Objectives**
1. Generate a training dataset from the review data compiled in the previous notebook
2. Label the features in the training data using Docanno
3. Import the annotated training data into the spaCy pipeline
4. Train the NER model
5. Evaluate the model

In [29]:
from pathlib import Path
import random
import math
import pandas as pd
import numpy as np
import json
from collections import Counter
random.seed(11)

import spacy
from spacy.tokens import DocBin
from spacy import displacy

In [2]:
# Import brewery review data into dataframe
filepath = Path('../assets/brewery_reviews_R1.csv') 
reviews_df = pd.read_csv(filepath)
reviews_df.head()

Unnamed: 0,name,rating,review
0,10th_District_Brewing_Company,5.0,"Good Local Beer!. Good beer, brewed right on s..."
1,10th_District_Brewing_Company,5.0,Micro Brewery with diverse collections . This ...
2,10th_District_Brewing_Company,5.0,Great Beer. I come here often for some good be...
3,10th_District_Brewing_Company,4.0,Nice local SE MASS Brewery. Definitely worth t...
4,10th_District_Brewing_Company,4.0,Great local micro brewery. The beer here is ve...


In [3]:
# Extract random sets of reviews for training data
# Incrementally retrain the model as more training sets are annotated
reviews = reviews_df['review'].to_numpy()
set_size = 100
total_sets = 6
np.random.seed(12)
random_reviews = np.random.choice(reviews, size=set_size*total_sets, replace=False)
reviews_split = np.array_split(random_reviews, total_sets)
for num in range(total_sets):
    filepath = Path(f'../assets/data/training_data_{set_size}_{num}.txt')
    with open(filepath, 'w+') as f:
        f.write('\n'.join(reviews_split[num]))

In [18]:
# Load the training data from the Doccano export
filepath = Path("../assets/data/processed/training_100_0.jsonl")
labeled_data = []

with open(filepath) as file:
    lines = file.readlines()
    for train_data in lines:
        data = json.loads(train_data)
        if len(data['label']) == 0: continue
        labeled_data.append((data['text'], data['label']))

In [19]:
# Split the labeled data into training and evaluation sets
random.shuffle(labeled_data)
split = 0.8
n = math.floor(len(labeled_data) * split)
train_data = labeled_data[:n]
eval_data = labeled_data[n:]

# Inspect split point
if train_data[-1] == eval_data[0]:
    print('Integer rounding error. Overlap between training and eval data')

In [20]:
# Prepare the training and evaluation data for spacy
def create_docbin(data):
    nlp = spacy.blank("en")
    db = DocBin()
    for text, annotations in data:
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)
        if ents != []:
            doc.ents = ents
        db.add(doc)
    return db

train_db = create_docbin(train_data)
eval_db = create_docbin(eval_data)

# convert to spacy format
train_db.to_disk(Path("../models/train.spacy"))
eval_db.to_disk(Path("../models/eval.spacy"))

In [25]:
%%bash
# Train model
python -m spacy train ../configs/config.cfg --output ../models --paths.train ../models/train.spacy --paths.dev ../models/eval.spacy

[38;5;2m✔ Created output directory: models[0m
[38;5;4mℹ Saving to output directory: models[0m
[38;5;4mℹ Using CPU[0m
[1m


[2023-01-31 18:20:13,573] [INFO] Set up nlp object from config
[2023-01-31 18:20:13,577] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-01-31 18:20:13,579] [INFO] Created vocabulary
[2023-01-31 18:20:13,580] [INFO] Finished initializing nlp object
[2023-01-31 18:20:13,992] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     94.50    0.00    0.00    0.00    0.00
  3     200        285.22   2378.65   48.82   59.62   41.33    0.49
  7     400        149.21    560.12   56.52   61.90   52.00    0.57
 11     600        233.19    247.70   54.05   54.79   53.33    0.54
 17     800       3070.56    175.84   56.52   61.90   52.00    0.57
 23    1000        156.21     80.28   60.29   67.21   54.67    0.60
 31    1200        155.04     63.41   54.41   60.66   49.33    0.54
 41    1400         78.43     22.06   53.15   55.88   50.67    0.53
 52    1600         71.58     23.18   59.70   67.80   53.33    0.60
 66    1800        202.82     49.28   54.55   63.16   48.00    0.55
 84    2000        283.49     49.87   54.93   58.21   52.0

In [26]:
# Test trained model on example text
nlp = spacy.load(Path('../models/model-best'))
doc = nlp("The Jacks Abbey of Hopkinton . Great atmosphere and beer selection. After renovations it’s spacious and serve food as well. You order your food separate from your beer. Two separate stations. Food is OK. My fish and chips had dry-over-fried chips, and “healthy” tasting coleslaw but fish was good. ")
print(doc.ents)
for ent in doc.ents:
    print(ent.label_, ent.text)

(Hopkinton, food, food, Food)
SETTING Hopkinton
FEATURE food
FEATURE food
FEATURE Food


In [27]:
# Test trained model on one brewery review set
nlp = spacy.load(Path('../models/model-best'))
# filename = Path('../assets/text/Stone_Cow_Brewery-g41454-d10847344-Reviews-Stone_Cow_Brewery-Barre_Massachusetts.txt')
# filename = Path('../assets/text/Lamplighter_Brewing_Co.-g60890-d11884912-Reviews-Lamplighter_Brewing_Company-Cambridge_Massachusetts.txt')
filename = Path('../assets/text/Tree_House_Brewery-g41495-d12700065-Reviews-Tree_House_Brewing_Company-Charlton_Massachusetts.txt')

with open(filename) as f:
    reviews = f.read()

# Extract entity matches in lists
doc = nlp(reviews)
setting = []
feature = []
beer = []
ux = []
for ent in doc.ents:
    if ent.label_ == 'SETTING':
        setting.append(ent.text)
    elif ent.label_ == 'FEATURE':
        feature.append(ent.text)
    elif ent.label_ == 'BEER':
        beer.append(ent.text)
    elif ent.label_ == 'UX':
        ux.append(ent.text)

print(setting, feature, beer, ux)

['cult-like following', 'Green', 'frustrating', 'relaxed', 'Boston', 'Relaxing', 'Crisp', '*', 'facility.-', 'Main Street'] ['beautiful', 'outdoor', 'pizza', 'organized', 'cans only', 'outside', 'cans according to', 'food', 'seltzer', 'coffee', 'outside', 'coffee', 'Outdoor', 'Live music', 'food', 'cans (around', 'cans),', 'cans that day', 'Parking', 'beautiful', 'system', 'organized', 'beautiful', 'coffee', 'outside', 'nuts', 'Pulling', 'outside', 'cans to go', 'cans only because', 'Realize', 'Stouts', 'coffee', 'Jan', 'Refreshing', 'Satisfying', 'DIPAs', 'food', 'food', 'Lots of beer', 'dogs', 'dog', 'Beautiful', 'organized', 'food', 'dog', 'food', 'dog', 'brand', 'Food trucks', 'cans of beer', 'stouts', 'cans per customer', 'Cans', 'dogs', 'ice', 'outside', 'growlers', 'band', 'outdoor', 'live band', 'music', 'stand', 'outside', 'cans to take', 'cans. But', 'stouts', 'stouts'] ['IPAs', 'IPAs', 'New England', 'YES', 'IPA', 'Stout', 'IPA', 'IPA', 'APAs', 'IPAs', 'IPA', 'New England', 

In [30]:
# Get entity match counts
feature = [s.lower() for s in feature]
setting = [s.lower() for s in setting]
beer = [s.lower() for s in beer]
ux = [s.lower() for s in ux]

for label in [feature, setting, beer, ux]:
    c = Counter(label)
    print(c)

Counter({'outside': 6, 'food': 6, 'beautiful': 4, 'coffee': 4, 'stouts': 4, 'outdoor': 3, 'organized': 3, 'dog': 3, 'dogs': 2, 'pizza': 1, 'cans only': 1, 'cans according to': 1, 'seltzer': 1, 'live music': 1, 'cans (around': 1, 'cans),': 1, 'cans that day': 1, 'parking': 1, 'system': 1, 'nuts': 1, 'pulling': 1, 'cans to go': 1, 'cans only because': 1, 'realize': 1, 'jan': 1, 'refreshing': 1, 'satisfying': 1, 'dipas': 1, 'lots of beer': 1, 'brand': 1, 'food trucks': 1, 'cans of beer': 1, 'cans per customer': 1, 'cans': 1, 'ice': 1, 'growlers': 1, 'band': 1, 'live band': 1, 'music': 1, 'stand': 1, 'cans to take': 1, 'cans. but': 1})
Counter({'cult-like following': 1, 'green': 1, 'frustrating': 1, 'relaxed': 1, 'boston': 1, 'relaxing': 1, 'crisp': 1, '*': 1, 'facility.-': 1, 'main street': 1})
Counter({'ipas': 6, 'ipa': 5, 'new england': 2, 'yes': 1, 'stout': 1, 'apas': 1, 'dipas': 1, 'esb': 1, 'new england ipas': 1, 'cans': 1})
Counter({'friendly': 1, 'helpful': 1, 'knowledgeable': 1})


In [31]:
# Verify POS tagger with base model
# TODO combine base model knowledge with custom model
nlp = spacy.load("en_core_web_sm")
doc = nlp("The Jacks Abbey of Hopkinton . Great atmosphere and beer selection. After renovations it’s spacious and serve food as well. You order your food separate from your beer. Two separate stations. Food is OK. My fish and chips had dry-over-fried chips, and “healthy” tasting coleslaw but fish was good. ")

for ent in doc.ents[:10]:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

for token in doc[:10]:
    print(token.text, token.pos_)

The Jacks Abbey 0 15 ORG
Hopkinton 19 28 GPE
Two 169 172 CARDINAL
The DET
Jacks PROPN
Abbey PROPN
of ADP
Hopkinton PROPN
. PUNCT
Great ADJ
atmosphere NOUN
and CCONJ
beer NOUN


In [36]:
# Test displacy for model evaluation
nlp = spacy.load(Path('../models/model-best'))
with open('../assets/text/Aeronaut_Brewing_Company-g41818-d8126258-Reviews-Aeronaut_Brewing_Co-Somerville_Massachusetts.txt') as file:
    reviews = file.readlines()
    
for review in reviews[10:20]:
    doc = nlp(review)
    displacy.render(doc, style="ent")