# Text Classification
A notebook that trains, evaluates, and tries out a spaCy model for text classification.

### Imports

In [11]:
import random
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from spacy.training import (
    offsets_to_biluo_tags,
    biluo_to_iob
)
from utilities import (
    jsonl_to_list,
    safe_make_dir,
    split_train_val_test
)

# Force spaCy to allocate data and run operations on the GPU for this kernel.
# NOTE(review): per the spaCy docs this raises an error when no GPU is
# available, so the notebook stops here on CPU-only machines — confirm that
# is the intended behaviour.
spacy.require_gpu()

True

### Load the Training Data & Split
You can generate a fresh training dataset with the `step_1_generate_training_data.ipynb` notebook.

In [12]:
# Load the labelled examples (one JSON record per line)
data = jsonl_to_list('./data/training.jsonl')

# Seed Python's RNG so the shuffle below produces the same order on every
# Restart & Run All; without it the train/dev/test membership (and therefore
# the reported metrics) changes from run to run.
SEED = 42
random.seed(SEED)

# Randomly shuffle the data (in place)
random.shuffle(data)

# Split the data into training, dev, and test sets
# NOTE(review): if split_train_val_test draws randomness from a source other
# than the `random` module, it needs to be seeded separately — confirm.
train, dev, test = split_train_val_test(data)

### Convert the datasets into DocBins
spaCy cannot train directly on the raw JSONL records, so each dataset is first converted to a binary format that spaCy can read, using the `DocBin` structure:

In [13]:
# Make sure the output directory for the DocBin files and the model exists
safe_make_dir('./training')

# Read the category records and keep only their label names
category_records = jsonl_to_list('./data/text_categories.jsonl')
categories = [record['label'] for record in category_records]

def convert(data, outfile):
    """Serialize labelled examples into a spaCy ``DocBin`` file.

    Every item is tokenized with a blank English pipeline and annotated with
    a ``doc.cats`` dictionary that has one key per entry of the module-level
    ``categories`` list: 1 for the item's own label and 0 for all others.

    Args:
        data: Iterable of mappings that each provide a ``'text'`` entry and a
            ``'label'`` entry.
        outfile: Path the resulting ``DocBin`` is written to.

    Raises:
        ValueError: If an item's label is not in ``categories``. Previously
            such a label was silently added as an extra key, so that document
            ended up with a different label set from every other document.
    """
    nlp = spacy.blank('en')
    db = DocBin()
    for index, item in enumerate(data):
        label = item['label']
        if label not in categories:
            raise ValueError(
                f"Item {index} has label {label!r}, which is not one of the "
                f"known categories: {categories}"
            )
        doc = nlp.make_doc(item['text'])
        # Exactly one category is switched on per document.
        doc.cats = {
            category: 1 if category == label else 0
            for category in categories
        }
        db.add(doc)
    db.to_disk(outfile)

# Write one DocBin file per dataset split
for split_items, split_path in (
    (train, './training/train.spacy'),
    (dev, './training/dev.spacy'),
    (test, './training/test.spacy'),
):
    convert(split_items, split_path)

Ignoring: Directory ./training already exists


### Generate Config File for Model Training

In [14]:
# Generate an auto-filled training config whose pipeline contains only the
# `textcat` component, and save it to ./training/config.cfg.
%run -m spacy init config --pipeline textcat ./training/config.cfg

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
training/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Train the Model

In [15]:
%run -m spacy train ./training/config.cfg --paths.train ./training/train.spacy  --paths.dev ./training/dev.spacy --output ./training/textcat_model

[38;5;2m✔ Created output directory: training/textcat_model[0m
[38;5;4mℹ Saving to output directory: training/textcat_model[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.07       51.85    0.52
  0     200         12.55       99.75    1.00
  1     400          8.54      100.00    1.00
  1     600          4.61      100.00    1.00
  2     800          1.94      100.00    1.00
  3    1000          0.69      100.00    1.00
  5    1200          0.26      100.00    1.00
  6    1400          0.14      100.00    1.00
  8    1600          0.08      100.00    1.00
 11    1800          0.05      100.00    1.00
 14    2000          0.04      100.00    1.00
[38;5;2m✔ Saved pipeline to output directory[0m
training/te

### Model Evaluation

In [16]:
%run -m spacy evaluate ./training/textcat_model/model-best/ --output ./training/metrics.json ./training/test.spacy

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   100.00
SPEED               158893

[1m

                  P        R        F
ISOLATE      100.00   100.00   100.00
HIDE         100.00   100.00   100.00
QUANTIFY     100.00   100.00   100.00
ZOOM_IN      100.00   100.00   100.00
PAN_RIGHT    100.00   100.00   100.00
PAN_LEFT     100.00   100.00   100.00
LOOK_LEFT    100.00   100.00   100.00
PAN_UP       100.00   100.00   100.00
PAN_DOWN     100.00   100.00   100.00
LOOK_DOWN    100.00   100.00   100.00
ZOOM_OUT     100.00   100.00   100.00
LOOK_UP      100.00   100.00   100.00
LOOK_RIGHT   100.00   100.00   100.00

[1m

             ROC AUC
ISOLATE         1.00
HIDE            1.00
QUANTIFY        1.00
ZOOM_IN         1.00
PAN_RIGHT       1.00
PAN_LEFT        1.00
LOOK_LEFT       1.00
PAN_UP          1.00
PAN_DOWN        1.00
LOOK_DOWN       1.00
ZOOM_OUT        1.00
LOOK_UP         1.00
LOOK_RI

### Load the Model

In [17]:
# Load the `model-best` pipeline from the training output directory
best_model_dir = './training/textcat_model/model-best'
nlp = spacy.load(best_model_dir)

In [None]:
# Try the model on a long request with several clauses and print the score
# it assigns to each category.
text = "Show me all single pole light switches and all other electrical equipment that James Bond installed in the kitchen area on Level 1 from May to June."
doc = nlp(text)
category_scores = doc.cats
print(category_scores)

In [None]:
# Try the model on a short, direct command and print the score it assigns to
# each category.
text = "Hide the electrical equipment."
doc = nlp(text)
category_scores = doc.cats
print(category_scores)