# Training a Sentiment Classifier from a Pre-trained Transformer Language Model

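Install the necessary libraries from the command line. The code below uses the spaCy 2.x API (`nlp.create_pipe`, `nlp.resume_training`), so install spaCy 2.x together with the `spacy-transformers` release that provides the `en_trf_bertbaseuncased_lg` model: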
```
pip install spacy
pip install spacy-transformers
python -m spacy download en_trf_bertbaseuncased_lg
```

## Import pre-trained language model

In [2]:
import spacy
from spacy.util import minibatch
import en_trf_bertbaseuncased_lg
import random
import torch
    
# Load the pre-trained BERT (base, uncased) pipeline once; `nlp` is reused by
# the exploration cell below.
nlp = spacy.load("en_trf_bertbaseuncased_lg")

In [14]:
# Explore text similarity feature of pre-trained language model
# The same word ("virus") is compared across sentences: two about health and
# one about computing. The indices below pick out the "virus" token in each
# sentence, assuming the tokenizer splits these sentences on whitespace.
# NOTE(review): "authoirities" in text_2 is misspelled; left untouched because
# the recorded similarity scores were produced with this exact string.

text_1 = nlp("Dangerous virus has been spreading across the world amid health concerns.")
text_2 = nlp("Health authoirities warned about the virus in January.")
text_3 = nlp("Computer virus can be bad for economy as hacking cases increases across the world.")
# health "virus" vs. health "virus"
print(text_1[1].similarity(text_2[5]))
# health "virus" vs. computer "virus"
print(text_1[1].similarity(text_3[1]))

0.8573929
0.7108537


## Data preparation

In [8]:
import pandas as pd
from helper_functions import preprocess, remove_emojis

# Load the labelled tweets and keep only the columns used downstream.
df = pd.read_csv('data/tweets/tweets_labeled.csv', encoding='utf-8')
df = df[['id', 'label', 'text']]
# Drop unlabelled rows. .copy() makes the result an independent frame so the
# column assignments below do not write into a slice of the original frame
# (which would raise SettingWithCopyWarning).
df = df[df['label'].notnull()].copy()
print(df.shape)

# Normalise the text: newlines -> spaces, cleaning via helper_functions.preprocess,
# then lowercase (the language model used later is the uncased BERT).
df['text'] = df['text'].replace(r'\n', ' ', regex=True)
df['text'] = df['text'].apply(preprocess)
df['text'] = df['text'].str.lower()

# Map the numeric sentiment labels (-1 / 0 / 1) to short string names.
df['label'] = df['label'].astype(int).astype(str)
df['label'] = df['label'].map({'-1': 'neg', '0': 'neu', '1': 'pos'})
# .map() silently turns any value missing from the dictionary into NaN;
# fail here instead of training on rows without a label.
assert df['label'].notnull().all(), "unexpected label value: expected only -1, 0 or 1"

X = df[['text', 'id']]
y = df[['label']]

from sklearn.model_selection import train_test_split
# Stratify on the label so both splits keep the same class proportions.
# random_state pins the shuffle so Restart & Run All reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                stratify=y,
                                                test_size=0.20,
                                                random_state=0)

df_validation = pd.concat([X_test, y_test], axis=1)[['id', 'label', 'text']].reset_index(drop=True)
df_train = pd.concat([X_train, y_train], axis=1)[['id', 'label', 'text']].reset_index(drop=True)

# Sanity check: the class distribution should be (nearly) identical in both splits.
print(df_train.label.value_counts(normalize=True))
print(df_validation.label.value_counts(normalize=True))

# df_train.to_csv('train.tsv', encoding='utf-8', index=False, sep='\t')
# df_validation.to_csv('test.tsv', encoding='utf-8', index=False, sep='\t')

(1358, 3)
neg    0.480663
neu    0.281768
pos    0.237569
Name: label, dtype: float64
neg    0.481618
neu    0.279412
pos    0.238971
Name: label, dtype: float64


In [9]:
# Convert the train and validation frames to the format the model expects:
# a list of (text, {"cats": {label_name: 0.0 or 1.0, ...}}) tuples.
def _to_textcat_examples(frame):
    """Turn a frame with 'label' and 'text' columns into text-classification examples.

    The 'label' column is one-hot encoded into float columns named
    ``label_<value>``; each row then becomes ``(text, {"cats": {...}})`` where
    the inner dict holds that row's one-hot columns.
    """
    one_hot = pd.get_dummies(frame[['label', 'text']], columns=['label'], dtype='float')
    examples = []
    for record in one_hot.to_dict(orient='records'):
        # Everything left after removing the text is the one-hot label dict.
        text = record.pop('text')
        examples.append((text, {"cats": record}))
    return examples


TRAIN_DATA = _to_textcat_examples(df_train)
EVAL_DATA = _to_textcat_examples(df_validation)

## Training sentiment classification model

### Define functions

In [8]:
import re
import random
import json
from pathlib import Path
from collections import Counter
import spacy
import torch
from spacy.util import minibatch
import tqdm
import unicodedata
import wasabi
from spacy_transformers.util import cyclic_triangular_rate
import en_trf_bertbaseuncased_lg


def read_inputs(train_data):
    """Split (text, annotation) pairs into cleaned texts and their category dicts.

    Each item of ``train_data`` is unpacked as ``(text, gold)``; the text is
    passed through ``preprocess_text`` and ``gold["cats"]`` is collected
    alongside it. Returns ``(texts, cats)`` as two parallel lists.
    """
    cleaned_texts, category_dicts = [], []
    for raw_text, annotation in train_data:
        cleaned_texts.append(preprocess_text(raw_text))
        category_dicts.append(annotation["cats"])
    return cleaned_texts, category_dicts


def make_sentence_examples(nlp, texts, labels):
    """Expand documents into per-sentence examples that inherit the document's labels.

    Every text is tokenized with ``nlp.make_doc`` and segmented by the
    pipeline's "sentencizer" component; each resulting sentence is emitted with
    the category dict of the document it came from. Returns two parallel lists:
    sentence strings and their (shared, not copied) category dicts.
    """
    sentence_texts = []
    sentence_labels = []
    for document_text, document_cats in zip(texts, labels):
        tokenized = nlp.make_doc(document_text)
        segmented = nlp.get_pipe("sentencizer")(tokenized)
        pieces = [span.text for span in segmented.sents]
        sentence_texts.extend(pieces)
        # One reference to the document-level labels per extracted sentence.
        sentence_labels.extend([document_cats] * len(pieces))
    return sentence_texts, sentence_labels


# Matches runs of two or more whitespace characters.
white_re = re.compile(r"\s\s+")


def preprocess_text(text):
    """Normalise raw text before it is handed to the pipeline.

    Steps, in order: replace literal ``<s>`` / ``</s>`` with placeholder tags,
    collapse runs of 2+ whitespace characters to one space and strip the ends,
    then NFD-decompose the string and drop combining marks (category "Mn"),
    which removes accents from letters.
    """
    for literal_tag, placeholder in (("<s>", "<open-s-tag>"), ("</s>", "<close-s-tag>")):
        text = text.replace(literal_tag, placeholder)
    collapsed = white_re.sub(" ", text).strip()
    decomposed = unicodedata.normalize("NFD", collapsed)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept_chars)


def evaluate(nlp, texts, cats, pos_label):
    """Compute precision, recall and F-score for a single label.

    Runs ``nlp.pipe`` over ``texts`` and compares ``doc.cats`` with the gold
    dict at the same position in ``cats``. Only the ``pos_label`` category is
    scored; 0.5 is the decision threshold for both the predicted score and the
    gold value. A progress bar advances by the whitespace-split word count of
    each processed document.

    Returns a dict with keys ``textcat_p``, ``textcat_r`` and ``textcat_f``.
    """
    true_pos = 0.0
    false_pos = 0.0
    false_neg = 0.0
    true_neg = 0.0
    word_total = sum(len(entry.split()) for entry in texts)
    with tqdm.tqdm(total=word_total, leave=False) as progress:
        for index, doc in enumerate(nlp.pipe(texts, batch_size=8)):
            expected = cats[index]
            for label, score in doc.cats.items():
                # Ignore labels without a gold value and every label except pos_label.
                if label not in expected or label != pos_label:
                    continue
                if score >= 0.5:
                    if expected[label] >= 0.5:
                        true_pos += 1.0
                    elif expected[label] < 0.5:
                        false_pos += 1.0
                elif score < 0.5:
                    if expected[label] < 0.5:
                        true_neg += 1
                    elif expected[label] >= 0.5:
                        false_neg += 1
            progress.update(len(doc.text.split()))
    # The small epsilon keeps the divisions defined when a denominator is zero.
    precision = true_pos / (true_pos + false_pos + 1e-8)
    recall = true_pos / (true_pos + false_neg + 1e-8)
    f_score = (
        0.0
        if (precision + recall) == 0
        else 2 * (precision * recall) / (precision + recall)
    )
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


def train_model(
    model="en_trf_bertbaseuncased_lg",
    train_data=None,
    eval_data=None,
    output_dir=None,
    n_iter=5,
    n_texts=100,
    batch_size=8,
    learn_rate=2e-5,
    max_wpb=1000,
    use_test=False,
    pos_label=None,
):
    """Fine-tune a ``trf_textcat`` classifier on top of a pre-trained pipeline.

    Training runs until the best F-score on ``eval_data`` has not improved for
    ``patience`` (3) consecutive checkpoints; a checkpoint is taken every
    ``eval_every`` (100) update steps and the stop condition is checked at the
    end of each epoch.

    Parameters
    ----------
    model : str
        Name or path passed to ``spacy.load``. If loading raises ``OSError``
        the ``en_trf_bertbaseuncased_lg`` package is loaded instead.
    train_data, eval_data : iterable of (text, {"cats": {label: float}})
        Training and evaluation examples. Both are required and must be
        non-empty.
    output_dir : str or Path, optional
        Directory the trained pipeline is written to (created, including
        parents, if missing). Nothing is saved when ``None``.
    batch_size : int
        Number of examples per update.
    learn_rate : float
        Centre of the cyclic learning-rate schedule, which ranges from
        ``learn_rate / 3`` to ``learn_rate * 3``.
    max_wpb : int
        Passed to the text categorizer as ``words_per_batch``.
    pos_label : str, optional
        Label whose precision/recall/F is reported. Defaults to the first
        label in sorted order.
    n_iter, n_texts, use_test
        Accepted for interface compatibility; not used by this function.

    Raises
    ------
    ValueError
        If ``train_data`` or ``eval_data`` is ``None`` or yields no examples.
    """
    # Validate before doing anything expensive.
    if train_data is None or eval_data is None:
        raise ValueError("train_model() requires both train_data and eval_data")

    train_texts, train_cats = read_inputs(train_data)
    eval_texts, eval_cats = read_inputs(eval_data)
    if not train_texts:
        # With no batches no checkpoint is ever recorded, so the training loop
        # below would never meet its stop condition.
        raise ValueError("train_data is empty; nothing to train on")
    if not eval_texts:
        # The final smoke test indexes eval_texts[0]; failing there would
        # discard the trained model before it is saved.
        raise ValueError("eval_data is empty; cannot evaluate the model")

    spacy.util.fix_random_seed(0)
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True so nested paths such as "models/sentiment-classifier"
        # work when the parent directory does not exist yet.
        output_dir.mkdir(parents=True, exist_ok=True)

    try:
        nlp = spacy.load(model)
    except OSError as err:
        # spacy.load raises OSError (IOError) when the model cannot be found;
        # fall back to the model package imported at the top of the notebook.
        print(f"Could not load '{model}' ({err}); falling back to en_trf_bertbaseuncased_lg")
        nlp = en_trf_bertbaseuncased_lg.load()

    print(nlp.pipe_names)
    print(f"Loaded model '{model}'")
    textcat = nlp.create_pipe(
        "trf_textcat",
        config={"architecture": "softmax_last_hidden", "words_per_batch": max_wpb},
    )

    labels = set()
    for cats in train_cats + eval_cats:
        labels.update(cats)
    # use the first label (in sorted order) as the positive label if one isn't
    # provided
    for label in sorted(labels):
        if not pos_label:
            pos_label = label
        textcat.add_label(label)

    print("Labels:", textcat.labels)
    print("Positive label for evaluation:", pos_label)
    nlp.add_pipe(textcat, last=True)
    print(f"Using {len(train_texts)} training docs, {len(eval_texts)} evaluation")
    split_training_by_sentence = False
    if split_training_by_sentence:
        # If we're using a model that averages over sentence predictions (we are),
        # there are some advantages to just labelling each sentence as an example.
        # It means we can mix the sentences into different batches, so we can make
        # more frequent updates. It also changes the loss somewhat, in a way that's
        # not obviously better -- but it does seem to work well.
        train_texts, train_cats = make_sentence_examples(nlp, train_texts, train_cats)
        print(f"Extracted {len(train_texts)} training sents")
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
    # Initialize the TextCategorizer, and create an optimizer.
    optimizer = nlp.resume_training()
    optimizer.alpha = 0.001
    optimizer.trf_weight_decay = 0.005
    optimizer.L2 = 0.0
    # Period of the cyclic schedule; at least 1 so tiny datasets do not produce
    # a zero period.
    lr_period = max(1, 2 * len(train_data) // batch_size)
    learn_rates = cyclic_triangular_rate(learn_rate / 3, learn_rate * 3, lr_period)
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))

    eval_every = 100  # update steps between evaluation checkpoints
    patience = 3  # checkpoints without improvement before stopping
    results = []  # (f_score, step, epoch) per checkpoint
    epoch = 0
    step = 0
    pbar = tqdm.tqdm(total=eval_every, leave=False)
    try:
        while True:
            # Train and evaluate
            losses = Counter()
            random.shuffle(train_data)
            for batch in minibatch(train_data, size=batch_size):
                optimizer.trf_lr = next(learn_rates)
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
                pbar.update(1)
                if step and (step % eval_every) == 0:
                    pbar.close()
                    # Evaluate with the optimizer's averaged parameters.
                    with nlp.use_params(optimizer.averages):
                        scores = evaluate(nlp, eval_texts, eval_cats, pos_label)
                    results.append((scores["textcat_f"], step, epoch))
                    print(
                        "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                            losses["trf_textcat"],
                            scores["textcat_p"],
                            scores["textcat_r"],
                            scores["textcat_f"],
                        )
                    )
                    pbar = tqdm.tqdm(total=eval_every, leave=False)
                step += 1
            epoch += 1
            # Stop once the best checkpoint is at least `patience` checkpoints old.
            if results:
                _, best_step, _ = max(results)
                if ((step - best_step) // eval_every) >= patience:
                    break
    finally:
        # Always release the active progress bar, including on interruption.
        pbar.close()

    msg = wasabi.Printer()
    table_widths = [2, 4, 6]
    msg.info("Best scoring checkpoints")
    msg.row(["Epoch", "Step", "Score"], widths=table_widths)
    msg.row(["-" * width for width in table_widths])
    for ckpt_score, ckpt_step, ckpt_epoch in sorted(results, reverse=True)[:10]:
        msg.row([ckpt_epoch, ckpt_step, "%.2f" % (ckpt_score * 100)], widths=table_widths)

    # Test the trained model
    test_text = eval_texts[0]
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)


### Execute training job

In [10]:
# Keyword arguments for train_model(). TRAIN_DATA / EVAL_DATA come from the
# data-preparation section above.
params = {
    "model": "en_trf_bertbaseuncased_lg",
    "train_data": TRAIN_DATA,
    "eval_data": EVAL_DATA,
    # Directory the fine-tuned pipeline is saved to.
    "output_dir": "models/sentiment-classifier",
    # NOTE(review): train_model() accepts use_test, n_texts and n_iter but its
    # body never reads them, so these three values have no effect.
    "use_test": False,
    "batch_size": 8,
    "learn_rate": 2e-5,
    "max_wpb": 1000,
    "n_texts": 100,
    "n_iter": 5,
    # Label whose precision / recall / F-score is reported during training.
    "pos_label": "label_pos",
}


train_model(**params)

  0%|          | 0/100 [00:00<?, ?it/s]

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']
Loaded model 'en_trf_bertbaseuncased_lg'
Labels: ('label_neg', 'label_neu', 'label_pos')
Positive label for evaluation: label_pos
Using 1086 training docs, 272 evaluation
Training the model...
LOSS 	  P  	  R  	  F  


  0%|          | 0/100 [00:00<?, ?it/s]             

0.726	0.771	0.831	0.800


  0%|          | 0/100 [00:00<?, ?it/s]             

0.177	0.831	0.831	0.831


  0%|          | 0/100 [00:00<?, ?it/s]             

0.051	0.775	0.846	0.809


  0%|          | 0/100 [00:00<?, ?it/s]             

0.221	0.818	0.831	0.824


  0%|          | 0/100 [00:00<?, ?it/s]             

0.019	0.812	0.862	0.836


 15%|█▌        | 15/100 [01:10<06:26,  4.55s/it]

[38;5;4mℹ Best scoring checkpoints[0m
Epoch   Step   Score 
--   ----   ------
3    500    84.62 
4    600    83.58 
1    200    83.08 
5    700    82.71 
2    400    82.44 
5    800    82.09 
2    300    80.88 
0    100    80.00 
tributes paid to the muchloved amp compassionate bury gp dr saad aldubbaisi who has died with coronavirus the iraqiborn gp graduated from the university of baghdad amp had worked in the bury area for 20 years rest in peace {'label_neg': 2.5065755835385062e-05, 'label_neu': 3.924364864360541e-05, 'label_pos': 0.9999356269836426}
Saved model to /home/jupyter/projects/sentiment-classifier
Loading from /home/jupyter/projects/sentiment-classifier
tributes paid to the muchloved amp compassionate bury gp dr saad aldubbaisi who has died with coronavirus the iraqiborn gp graduated from the university of baghdad amp had worked in the bury area for 20 years rest in peace {'label_neg': 2.5065755835385062e-05, 'label_neu': 3.924364864360541e-05, 'label_pos': 0.999935626

### Review model output

In [10]:
import re
import torch
import unicodedata

# Review model output

def read_inputs(train_data):
    """Split (text, annotation) pairs into cleaned texts and their category dicts.

    Each item of ``train_data`` is unpacked as ``(text, gold)``; the text is
    passed through ``preprocess_text`` and ``gold["cats"]`` is collected
    alongside it. Returns ``(texts, cats)`` as two parallel lists.

    NOTE: same behaviour as the ``read_inputs`` defined in the "Define
    functions" section; presumably repeated so this section can be run after a
    kernel restart.
    """
    cleaned_texts, category_dicts = [], []
    for raw_text, annotation in train_data:
        cleaned_texts.append(preprocess_text(raw_text))
        category_dicts.append(annotation["cats"])
    return cleaned_texts, category_dicts


# Matches runs of two or more whitespace characters.
white_re = re.compile(r"\s\s+")


def preprocess_text(text):
    """Normalise raw text before it is handed to the pipeline.

    Steps, in order: replace literal ``<s>`` / ``</s>`` with placeholder tags,
    collapse runs of 2+ whitespace characters to one space and strip the ends,
    then NFD-decompose the string and drop combining marks (category "Mn"),
    which removes accents from letters.

    NOTE: same behaviour as the ``preprocess_text`` defined in the "Define
    functions" section.
    """
    for literal_tag, placeholder in (("<s>", "<open-s-tag>"), ("</s>", "<close-s-tag>")):
        text = text.replace(literal_tag, placeholder)
    collapsed = white_re.sub(" ", text).strip()
    decomposed = unicodedata.normalize("NFD", collapsed)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept_chars)

# Same directory that was passed to train_model() as "output_dir".
output_dir =  "models/sentiment-classifier"
# Apply the same text cleaning to the evaluation examples as during training.
eval_texts, eval_cats = read_inputs(EVAL_DATA)
i = 0 # index of the evaluation example to inspect (fixed at 0; change it to try another)
test_text = eval_texts[i] 

In [11]:
import spacy

# Reload the pipeline saved in output_dir and classify the selected evaluation
# text; doc.cats maps each label to its predicted score.
nlp = spacy.load(output_dir)
doc = nlp(test_text)
print(test_text, doc.cats)
print('\n')

hundreds of ventilators the uk govt bought from china to relieve a major shortage during the covid19 pandemic are the wrong type and could kill patients senior doctors have warned {'label_neg': 0.9998445510864258, 'label_neu': 0.00012154843716416508, 'label_pos': 3.384325827937573e-05}




In [8]:
# Test the trained model with text of your choice
# (a positively worded example; the printed doc.cats holds one score per label)
test_text = "There are some good news coming as coronavirus vaccine research advances."
doc = nlp(test_text)
print(test_text, doc.cats)

There are some good news coming as coronavirus vaccine research advances. {'label_neg': 4.7637462557759136e-05, 'label_neu': 0.00017945301078725606, 'label_pos': 0.9997729659080505}


In [9]:
# Test the trained model with text of your choice
# (a negatively worded counterpart of the previous example)
test_text = "There are some bad news coming as coronavirus vaccine research fails."
doc = nlp(test_text)
print(test_text, doc.cats)

There are some bad news coming as coronavirus vaccine research fails. {'label_neg': 0.9993046522140503, 'label_neu': 0.0006404068553820252, 'label_pos': 5.4913893109187484e-05}
