# Text Classification
A machine-learning notebook for training a spaCy text classification model.

### Imports

In [None]:
import random
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from spacy.training import (
    offsets_to_biluo_tags,
    biluo_to_iob
)
from utilities import (
    jsonl_to_list,
    split_train_val_test
)

# Force spaCy to run on the GPU. Per the spaCy documentation, require_gpu()
# raises an error when no GPU is available instead of falling back to the CPU,
# so this cell fails fast on CPU-only machines.
spacy.require_gpu()

# Load the Natural Language Pipeline
# nlp = spacy.load('en_core_web_trf')

### Load the Training Data & Split
You can generate a fresh training dataset with the `training_data.ipynb` notebook.

In [None]:
# Load the labelled examples from the JSONL training file
data = jsonl_to_list('./data/training.jsonl')
# Randomly shuffle the data in place before splitting.
# NOTE(review): no random seed is set in the visible cells, so the resulting
# split differs from run to run.
random.shuffle(data)

# Split the shuffled data into training, dev, and test sets.
# NOTE(review): the split proportions are decided inside
# split_train_val_test (utilities module) — check there for the ratios.
train, dev, test = split_train_val_test(data)

### Convert the datasets into DocBins
spaCy cannot read the raw data directly in this format, so we convert it to a binary format that spaCy can work with, using the `DocBin` structure:

In [None]:
import os

# Create the output directory for the serialized datasets. The standard
# library is used so this cell does not depend on a project helper being
# imported; exist_ok=True keeps the cell safe to re-run.
os.makedirs('./training', exist_ok=True)

# Load the category records and keep only each record's 'label' value
categories = [
    entry['label']
    for entry in jsonl_to_list('./data/text_categories.jsonl')
]

def convert(data, outfile):
    """Serialize labelled examples into a spaCy DocBin file.

    Each item in `data` is read through its 'text' and 'label' keys. The text
    is turned into a Doc with a blank English pipeline, every entry of the
    notebook-level `categories` list is given a score of 0, and the item's
    own label is given a score of 1. The collected docs are written to
    `outfile`.
    """
    blank_nlp = spacy.blank('en')
    doc_bin = DocBin()
    for example in data:
        doc = blank_nlp.make_doc(example['text'])
        # Start from an all-zero score for every known category, then flag
        # the example's own label.
        scores = dict.fromkeys(categories, 0)
        scores[example['label']] = 1
        doc.cats = scores
        doc_bin.add(doc)
    doc_bin.to_disk(outfile)

# Write each split to its own DocBin file, in train / dev / test order
for split_examples, split_path in (
    (train, './training/train.spacy'),
    (dev, './training/dev.spacy'),
    (test, './training/test.spacy'),
):
    convert(split_examples, split_path)

In [None]:
# Load the trained pipeline from disk.
# NOTE(review): no cell in this notebook writes ./training/textcat_model, so
# the model is presumably trained separately (e.g. with the `spacy train`
# CLI) before this cell is run — confirm.
nlp = spacy.load('./training/textcat_model/model-best')

In [None]:
# Run the loaded pipeline on a long, multi-clause sample query and print the
# category values it stores on the doc.
text = "Show me all single pole light switches and all other electrical equipment that James Bond installed in the kitchen area on Level 1 from May to June."
doc = nlp(text)
print(doc.cats)

In [None]:
# Run the loaded pipeline on a short sample command and print the category
# values it stores on the doc.
text = "Hide the electrical equipment."
doc = nlp(text)
print(doc.cats)