# GA Capstone
## Classification Modeling

The goal here is to create a binary classification model that labels a piece of text as Shakespearean (1) or not (0).

Much of the below is adapted from the [Hugging Face Text Classification Tutorial](https://huggingface.co/docs/transformers/tasks/sequence_classification) and the notebook linked therein.

### Imports and Preliminaries

In [2]:
# IMPORTS
# Datasets for dataset formatting
from datasets import Dataset, DatasetDict

# tokenizer and collator
from transformers import AutoTokenizer, DataCollatorWithPadding

# model and optimizer
from transformers import TFAutoModelForSequenceClassification, create_optimizer

# support
import numpy as np
import os
import re
import random
import json

from utilities.utilities import load_config, get_dataset_from_config
from utilities.utilities import train_test_val_split, split_text_and_labels

In [3]:
# load config from json
# CONFIG_FILE is handed to the project helper load_config; the result is kept
# in config_vars, which the later cells read for their model/data settings.
CONFIG_FILE = 'config.json'

config_vars = load_config(CONFIG_FILE)
# bare expression so the notebook displays the loaded settings
config_vars

{'MODEL_DIR': '../models/',
 'DATA_DIR': '../data/',
 'CAUSAL_N_EPOCHS': 4,
 'CLASS_N_EPOCHS': 8,
 'BATCH_SIZE': 16,
 'CAUSAL_MODEL': 'distilgpt2',
 'CLASS_MODEL': 'distilbert-base-uncased',
 'MODEL_NAME': 'shakespeare',
 'DATA_SHAKESPEARE': ['shakespeare-sonnets.clean.txt', 'shakespeareplays.txt'],
 'DATA_OTHER': ['belloc_hilaire-sonnets_and_verse.clean.txt',
  'blake_william-poems.clean.txt',
  'browning_elizabeth-sonnets_from_the_portuguese.clean.txt',
  'daniel_samuel_and_constable_henry-elizabethan_sonnet_cycles.clean.txt',
  'donne_john-poetry_vol_1.clean.txt',
  'drayton_michael_et_al-elizabethan_sonnet_cycles.clean.txt',
  'farjeon_eleanor-sonnets_and_poems.clean.txt',
  'keats_john-poems_1820.clean.txt',
  'lodge_thomas_and_fletcher_giles-elizabethan_sonnet_cycles.clean.txt',
  'lovell_robert_and_southey_robert-poems.clean.txt',
  'milton_john-poetical_works.clean.txt',
  'seward_anna-sonnets-and-odes.clean.txt',
  'shelley_percy-complete_poetic_works.clean.txt',
  'wilde_osca

In [4]:
# Classifier settings, read from config_vars with a fallback default for any
# key the config does not define (dict.get does the lookup once).

# pretrained model designator
MODEL_TYPE = config_vars.get('CLASS_MODEL', 'distilbert-base-uncased')

# model batch size
BATCH_SIZE = config_vars.get('BATCH_SIZE', 16)

# model num epochs
N_EPOCHS = config_vars.get('CLASS_N_EPOCHS', 8)

# config key N_SAMPLES; passed as `limit` when the data is loaded
SAMPLE = config_vars.get('N_SAMPLES', 1)

In [5]:
# directories, etc.
# Each value comes from config_vars, falling back to the default shown when
# the key is absent.
MODEL_DIR = config_vars.get('MODEL_DIR', '../models/')
MODEL_NAME = config_vars.get('MODEL_NAME', 'shakespeare')
# where the fine-tuned model is saved/loaded: <MODEL_DIR>/<name>.<type>.<epochs>
MODEL_FULL_PATH = os.path.join(MODEL_DIR, f'{MODEL_NAME}.{MODEL_TYPE}.{N_EPOCHS}')

DATA_DIR = config_vars.get('DATA_DIR', '../data/')

### Data Loading and Preparation

In [6]:
# load data and split into sentences
# get_dataset_from_config is a project helper; SAMPLE is passed as its `limit`
data = get_dataset_from_config(config_vars, limit=SAMPLE)
# preview: size and first two (text, label) pairs of data[0] and of data[1]
len(data[0]), data[0][:2], len(data[1]), data[1][:2]

(36809,
 [('Lift up your hearts in Gumber, laugh the Weald And you my mother the Valley of Arun sing.',
   0),
  ('Here am I homeward from my wandering Here am I homeward and my heart is healed.',
   0)],
 76578,
 [('From fairest creatures we desire increase, That thereby beauty’s rose might never die, But as the riper should by time decease, His tender heir might bear his memory:',
   1),
  ('But thou, contracted to thine own bright eyes, Feed’st thy light’s flame with self-substantial fuel, Making a famine where abundance lies, Thyself thy foe, to thy sweet self too cruel:',
   1)])

In [7]:
# create train-test-val-split
# adding a validation split for the NN model
# train_test_val_split is a project helper; its result is indexed below by
# the keys 'train', 'test' and 'val'
data_ttvs = train_test_val_split(data)
# preview the first two (text, label) pairs of each split
data_ttvs['train'][:2], data_ttvs['test'][:2], data_ttvs['val'][:2]

([('I would they could.', 1), ('Farewell therefore, Hero.', 1)],
 [('I will bestow a breakfast to make you friends, and we’ll be all three sworn brothers to France.',
   1),
  ('Since, Jupiter, our son is good, Take off his miseries.', 1)],
 [('Some six months since, my lord.', 1), ('Brought you Caesar home?', 1)])

In [8]:
# format into DatasetDict format
# each split's (text, label) pairs go through split_text_and_labels and then
# Dataset.from_dict, in the order train, test, val
dataset = DatasetDict({
    split_name: Dataset.from_dict(split_text_and_labels(data_ttvs[split_name]))
    for split_name in ('train', 'test', 'val')
})
# per-split handles, still available to the cells further down
train_data, test_data, val_data = dataset['train'], dataset['test'], dataset['val']
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 100917
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 11337
    })
    val: Dataset({
        features: ['text', 'labels'],
        num_rows: 1133
    })
})

### Tokenization and Prepping Collator

In [9]:
# tokenizer belonging to the pretrained checkpoint named by MODEL_TYPE
tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)

def tokenizer_func(text):
    """Tokenize the 'text' entry of a dataset batch.

    Used as the mapping function for ``dataset.map(..., batched=True)``.

    Args:
        text: mapping whose 'text' entry holds the string(s) to tokenize.

    Returns:
        The tokenizer's encoding of ``text['text']``. Inputs longer than the
        model's maximum sequence length are truncated so an unusually long
        passage cannot exceed what the model accepts.
    """
    return tokenizer(text['text'], truncation=True)

In [10]:
# apply tokenizer_func to the dataset in batches (batched=True)
tokenized_data = dataset.map(tokenizer_func, batched=True)

  0%|          | 0/101 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [11]:
# padding collator built around the tokenizer; return_tensors='tf' requests
# TensorFlow tensors from it
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

### Modeling

In [12]:
# instantiate model
# a previously saved fine-tuned model at MODEL_FULL_PATH takes precedence;
# otherwise start from the pretrained MODEL_TYPE checkpoint
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_FULL_PATH if os.path.exists(MODEL_FULL_PATH) else MODEL_TYPE
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [13]:
# prep train, test and val sets for model
# the three splits are prepared identically (same batch size and collator),
# except that only the training split is shuffled
tf_train_set, tf_test_set, tf_val_set = (
    model.prepare_tf_dataset(
        tokenized_data[split_name],
        shuffle=(split_name == 'train'),
        batch_size=BATCH_SIZE,
        collate_fn=collator
    )
    for split_name in ('train', 'test', 'val')
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# set up optimizer
# number of whole batches in the training split, then scaled by epoch count
# to get the total number of training steps handed to create_optimizer
batches_per_epoch = len(tokenized_data['train']) // BATCH_SIZE
total_train_steps = int(N_EPOCHS * batches_per_epoch)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)

In [15]:
# compile and fit model
# training is skipped entirely when something already exists at MODEL_FULL_PATH
if not os.path.exists(MODEL_FULL_PATH):
    # no loss argument is passed; see the compile() notice in the cell output:
    # the model's internal loss computation is used
    model.compile(optimizer=optimizer)
    model.fit(tf_train_set, validation_data=tf_val_set, epochs = N_EPOCHS)
    # the directory is created and the model saved only after fit() returns,
    # so an interrupted fit() leaves nothing on disk
    os.makedirs(MODEL_FULL_PATH)
    model.save_pretrained(MODEL_FULL_PATH)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
 710/6307 [==>...........................] - ETA: 1:34:34 - loss: 0.0299

KeyboardInterrupt: 

In [16]:
# Save the model in its current state to MODEL_FULL_PATH.
# exist_ok=True lets this cell run when the directory is already there (for
# example on a re-run); a bare makedirs would raise FileExistsError.
os.makedirs(MODEL_FULL_PATH, exist_ok=True)
model.save_pretrained(MODEL_FULL_PATH)

### Test

In [None]:
def get_class_from_output(output):
    """Return, for each row of ``output.logits``, the index of its largest logit."""
    logits = output.logits
    predicted = np.argmax(logits, axis=1)
    return predicted

def get_probs_from_output(output, c=1):
    """Return the predicted probability of class ``c`` for each row.

    The logits are normalised with a softmax across the class axis, so the
    per-class probabilities of a row sum to 1. (An element-wise sigmoid would
    treat each logit on its own and ignore the competing class.)

    Args:
        output: object exposing ``logits`` as a 2-D array-like, one row per
            example and one column per class.
        c: index of the class whose probability is returned (default 1).

    Returns:
        1-D array holding, for each row, the probability assigned to class ``c``.
    """
    logits = np.asarray(output.logits)
    # subtract each row's maximum before exponentiating so large logits
    # cannot overflow; the softmax value is unchanged by this shift
    exp_shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
    return (exp_shifted / exp_shifted.sum(axis=1, keepdims=True))[:, c]

NUM_TESTS = 10

# first NUM_TESTS texts and their true labels from the test split
test_text, test_class = (
    test_data[column][:NUM_TESTS] for column in ('text', 'labels')
)

# tokenize the texts as one padded TensorFlow batch and run the model on it
tests_tokened = tokenizer(test_text, padding=True, return_tensors='tf')
outputs = model(tests_tokened)
pred_class, pred_prob = get_class_from_output(outputs), get_probs_from_output(outputs)

# one tuple per example: (text, true label, predicted class, predicted probability)
list(zip(test_text, test_class, pred_class, pred_prob))

### Conclusion

Classification works pretty well! Woohoo!