## Imports and Preliminaries

In [19]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification
from transformers import AdamWeightDecay, create_optimizer
from datasets import Dataset, DatasetDict

import os
import re

In [2]:
model_type = 'albert-base-v2'

DIR_DATA = '../data/'

In [3]:
text_file_path = os.path.join(DIR_DATA, 'shakespeare-sonnets.clean.txt')
with open(text_file_path, 'r') as f:
    lines = [line.strip() for line in f.readlines() if line.strip()]

lines = lines[:-1]
len(lines), lines[:4]

(2154,
 ['From fairest creatures we desire increase,',
  'That thereby beautys rose might never die,',
  'But as the riper should by time decease,',
  'His tender heir might bear his memory:'])

## Preprocessing

In [4]:
lines1 = lines[0::2]
lines2 = lines[1::2]
len(lines1), len(lines2)

(1077, 1077)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_type)
tokenizer.eos_token = "[EOS]"
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(*lines[:2])

In [6]:
inputs

{'input_ids': [2, 37, 1768, 1430, 6733, 95, 3150, 1839, 15, 3, 30, 8535, 3679, 18, 1092, 530, 243, 1327, 15, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))

['[CLS]', '▁from', '▁fair', 'est', '▁creatures', '▁we', '▁desire', '▁increase', ',', '[SEP]', '▁that', '▁thereby', '▁beauty', 's', '▁rose', '▁might', '▁never', '▁die', ',', '[SEP]']


In [9]:
split = -4
lines_train = {'l1': lines1[:split], 'l2': lines2[:split]}
lines_test = {'l1': lines1[split:], 'l2': lines2[split:]}
lines_test

{'l1': ['Which many legions of true hearts had warmed;',
  'Was, sleeping, by a virgin hand disarmed.',
  'Which from Loves fire took heat perpetual,',
  'For men diseased; but I, my mistress thrall,'],
 'l2': ['And so the general of hot desire',
  'This brand she quenched in a cool well by,',
  'Growing a bath and healthful remedy,',
  'Came there for cure and this by that I prove,']}

In [10]:
# create Dataset and DatasetDict instances - I think this is needed for model
train_dataset = Dataset.from_dict(lines_train)
test_dataset = Dataset.from_dict(lines_test)
datasets = DatasetDict({'train': train_dataset, 'test': test_dataset})
datasets

DatasetDict({
    train: Dataset({
        features: ['l1', 'l2'],
        num_rows: 1073
    })
    test: Dataset({
        features: ['l1', 'l2'],
        num_rows: 4
    })
})

In [11]:
def tokenize_func(text):
    return tokenizer(text['l1'], text['l2'], return_tensors='np')

In [14]:
tokened_data = datasets.map(tokenize_func, batched=True, remove_columns=['l1','l2'])
print(tokenizer.convert_ids_to_tokens(tokened_data['train'][0]['input_ids']))

  0%|          | 0/2 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/1 [00:00<?, ?ba/s]

['[CLS]', '▁from', '▁fair', 'est', '▁creatures', '▁we', '▁desire', '▁increase', ',', '[SEP]', '▁that', '▁thereby', '▁beauty', 's', '▁rose', '▁might', '▁never', '▁die', ',', '[SEP]']


## Modeling

In [17]:
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')
def collate_func(text):
    return collator(text)

collated_data = tokened_data.map(collate_func, batched=True)
for data in collated_data['train'][0].values():
    print(data)
    
print(tokenizer.convert_ids_to_tokens(collated_data['train'][0]['input_ids']))

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

[2, 37, 1768, 1430, 6733, 95, 3150, 1839, 15, 3, 30, 8535, 3679, 18, 1092, 530, 243, 1327, 15, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['[CLS]', '▁from', '▁fair', 'est', '▁creatures', '▁we', '▁desire', '▁increase', ',', '[SEP]', '▁that', '▁thereby', '▁beauty', 's', '▁rose', '▁might', '▁never', '▁die', ',', '[SEP]', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>']


In [23]:
# adapted from https://huggingface.co/docs/transformers/tasks/sequence_classification#train
batch_size = 16
num_epochs = 4
batches_per_epoch = len(tokened_data['train']) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
model = TFAutoModelForSequenceClassification.from_pretrained(model_type, num_labels=2)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
model.compile(optimizer=optimizer)

All model checkpoint layers were used when initializing TFAlbertForSequenceClassification.

Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [24]:
# convert data to special format for tf model

tf_train_set = model.prepare_tf_dataset(tokened_data['train'], shuffle=True, collate_fn=collator)
tf_test_set = model.prepare_tf_dataset(tokened_data['test'], shuffle=False, collate_fn=collator)
tf_train_set

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(8, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(8, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, None), dtype=tf.int64, name=None)}>

In [25]:
model.fit(tf_train_set, validation_data=tf_test_set, epochs=4)

Epoch 1/4


TypeError: in user code:

    File "/Users/ffomezolam/.pyenv/versions/3.10.4/envs/transformers/lib/python3.10/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/Users/ffomezolam/.pyenv/versions/3.10.4/envs/transformers/lib/python3.10/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/ffomezolam/.pyenv/versions/3.10.4/envs/transformers/lib/python3.10/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/Users/ffomezolam/.pyenv/versions/3.10.4/envs/transformers/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1440, in train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/Users/ffomezolam/.pyenv/versions/3.10.4/envs/transformers/lib/python3.10/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 576, in minimize
        grads_and_vars = self._compute_gradients(
    File "/Users/ffomezolam/.pyenv/versions/3.10.4/envs/transformers/lib/python3.10/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 634, in _compute_gradients
        grads_and_vars = self._get_gradients(
    File "/Users/ffomezolam/.pyenv/versions/3.10.4/envs/transformers/lib/python3.10/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 510, in _get_gradients
        grads = tape.gradient(loss, var_list, grad_loss)

    TypeError: Argument `target` should be a list or nested structure of Tensors, Variables or CompositeTensors to be differentiated, but received None.
