<a href="https://colab.research.google.com/github/zinzin2312/abschlussarbeit/blob/main/distil_gpt2_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[sentencepiece]
!pip install datasets 

from datasets import load_dataset
import transformers

In [None]:
# Pipeline: DistilGPT-2 generates text, DistilBERT classifies it
from transformers import (
    DistilBertTokenizer,
    GPT2Tokenizer,
    TFDistilBertForSequenceClassification,
    TFGPT2LMHeadModel,
)

# Checkpoint names of the two pre-trained models (generator, classifier)
ckpt_gen, ckpt_class = "distilgpt2", "distilbert-base-uncased"

# Each model gets the tokenizer that belongs to its own checkpoint
tokenizer_class = DistilBertTokenizer.from_pretrained(ckpt_class)
tokenizer_gen = GPT2Tokenizer.from_pretrained(ckpt_gen)

In [None]:
# Load the pre-trained weights for both models.
# To use a checkpoint out-of-the-box it has to be wrapped in the head class that
# matches the task: a language-modelling head for the generator and a
# sequence-classification head for the classifier.
model_gen = TFGPT2LMHeadModel.from_pretrained(ckpt_gen)

# The classification head is created with two output labels
model_class = TFDistilBertForSequenceClassification.from_pretrained(ckpt_class, num_labels=2)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification 

In [None]:
# model_gen.config

In [None]:
# Display the id -> label-name mapping stored in the classifier's config
model_class.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [None]:
# sms_spam dataset: https://huggingface.co/datasets/sms_spam
# Structure : sms_spam dict{'label', 'sms'}
# Length: 5574

sms_spam = load_dataset('sms_spam', split='train')

# Keep only short messages and cap the subset size to keep experiments fast
max_word_nr = 20
limit_size = 20

# Filter texts and labels together as pairs. Filtering only the texts and then
# slicing the labels from the unfiltered dataset would pair each message with
# the label of a different row.
short_pairs = [
    (text, label)
    for text, label in zip(sms_spam['sms'], sms_spam['label'])
    if len(text.split(" ")) <= max_word_nr
][:limit_size]
sms = [text for text, _ in short_pairs]
sms_labels = [label for _, label in short_pairs]

Reusing dataset sms_spam (/root/.cache/huggingface/datasets/sms_spam/plain_text/1.0.0/53f051d3b5f62d99d61792c91acefe4f1577ad3e4c216fb0ad39e30b9f20019c)


In [None]:
def create_batch(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Args:
        lst: Any sliceable sequence with a length (list, str, range, ...).
        n: Maximum size of each chunk.

    Yields:
        ``lst[start:start + n]`` for start = 0, n, 2n, ...; the last chunk
        may be shorter than ``n``.
    """
    chunk_starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in chunk_starts)
        
# Usage
# list(create_batch(range(10, 75), 10))

In [None]:
# Prepare the prompts for text generation

# GPT-2 has no pad token, so reuse the end-of-sequence token for padding
tokenizer_gen.pad_token = tokenizer_gen.eos_token
# A decoder-only model continues each row from its last position, so the batch
# must be padded on the LEFT; with right padding the continuation would be
# generated after a run of pad tokens.
tokenizer_gen.padding_side = "left"
input_gen = tokenizer_gen(sms, padding=True, truncation=True, return_tensors="tf")
# Show only the batch shape instead of dumping the full tensors
print(input_gen['input_ids'].shape)

# Generation strategies are documented at:
# https://huggingface.co/transformers/main_classes/model.html?highlight=generate#transformers.generation_tf_utils.TFGenerationMixin.generate

# max_length = model_gen.config.n_ctx
# Upper bound on the length of each returned sequence (prompt included)
max_length = 50
# NOTE: currently not passed to any generate() call in this notebook
num_return_sequences = 3

In [None]:
# Greedy search: always pick the most likely next token

greedy_output = model_gen.generate(
    input_gen['input_ids'],
    # The batch is padded, so pass the mask to keep pad tokens from being attended to
    attention_mask=input_gen['attention_mask'],
    # Set explicitly; otherwise generate() falls back to eos and logs a notice
    pad_token_id=tokenizer_gen.eos_token_id,
    max_length=max_length
)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [None]:
# Beam search

beam_outputs = model_gen.generate(
    input_gen['input_ids'],
    # The batch is padded, so pass the mask to keep pad tokens from being attended to
    attention_mask=input_gen['attention_mask'],
    # Set explicitly; otherwise generate() falls back to eos and logs a notice
    pad_token_id=tokenizer_gen.eos_token_id,
    max_length=max_length,
    num_beams=5,
    # Forbid any 2-gram from occurring twice in a generated sequence.
    # Use with caution: this also blocks legitimate repeats, e.g. the name of
    # a city that appears several times in an article.
    no_repeat_ngram_size=2,
    # Alternative: penalize repeated tokens
    # repetition_penalty = 1.5,
    early_stopping=True
)


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [None]:
import tensorflow as tf
# Fix TensorFlow's global random seed so the sampling-based generation is
# repeatable; choosing a different seed value yields different samples
tf.random.set_seed(5)

In [None]:
# Sampling

# Activate sampling and deactivate top-k filtering by setting top_k to 0.
# Lower temperatures sharpen the distribution; as the temperature approaches 0
# sampling behaves like greedy search.
sample_output = model_gen.generate(
    input_gen['input_ids'],
    # The batch is padded, so pass the mask to keep pad tokens from being attended to
    attention_mask=input_gen['attention_mask'],
    # Set explicitly; otherwise generate() falls back to eos and logs a notice
    pad_token_id=tokenizer_gen.eos_token_id,
    do_sample=True,
    max_length=max_length,
    top_k=0,
    temperature=0.7
)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [None]:
# Top-K Sampling

sample_top_k_outputs = model_gen.generate(
    input_gen['input_ids'],
    # The batch is padded, so pass the mask to keep pad tokens from being attended to
    attention_mask=input_gen['attention_mask'],
    # Set explicitly; otherwise generate() falls back to eos and logs a notice
    pad_token_id=tokenizer_gen.eos_token_id,
    do_sample=True,
    max_length=max_length,
    # Sample only among the 50 most likely next tokens
    top_k=50,
    num_return_sequences=1
)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [None]:
# Top-P Sampling


# Deactivate top-k filtering and sample only from the smallest set of tokens
# whose cumulative probability reaches 92%
sample_top_p_outputs = model_gen.generate(
    input_gen['input_ids'],
    # The batch is padded, so pass the mask to keep pad tokens from being attended to
    attention_mask=input_gen['attention_mask'],
    # Set explicitly; otherwise generate() falls back to eos and logs a notice
    pad_token_id=tokenizer_gen.eos_token_id,
    do_sample=True,
    max_length=max_length,
    top_p=0.92,
    top_k=0,
    num_return_sequences=1
)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [None]:
# tokenizer_gen.decode(greedy_output[0], skip_special_tokens=True)
# print("Output:\n" + 100 * '-')

# for i, beam_output in enumerate(beam_outputs):
#   print("{}: {}".format(i, tokenizer_gen.decode(beam_output, skip_special_tokens=True)))
# print("Output:\n" + 100 * '-')


# print(tokenizer_gen.decode(sample_output[0], skip_special_tokens=True))
# print("Output:\n" + 100 * '-')


# for i, sample_output in enumerate(sample_top_k_outputs):
#   print("{}: {}".format(i, tokenizer_gen.decode(sample_output, skip_special_tokens=True)))
# print("Output:\n" + 100 * '-')


# for i, so in enumerate(sample_top_p_outputs):
#   print("{}: {}".format(i, tokenizer_gen.decode(so, skip_special_tokens=True)))

# Turn the top-p samples back into plain text, dropping special tokens
outputs = sample_top_p_outputs
syn_sms = [
    tokenizer_gen.decode(token_ids, skip_special_tokens=True)
    for token_ids in outputs
]

In [None]:
# Aliases used by the training / evaluation cells:
# `data` is the list of original messages, `syn_data` the decoded generated texts
data = sms
syn_data = syn_sms

In [None]:
# F1 score metric for a binary classifier that outputs one logit per class
class F1_metric(tf.keras.metrics.Metric):
    """Streaming binary F1 score built on Keras ``Precision`` and ``Recall``.

    The positive class is label ``1``. When ``y_pred`` holds per-class scores
    of shape ``(batch, num_classes)`` and ``y_true`` holds sparse integer
    labels, the scores are first reduced to class ids with ``argmax``;
    ``Precision``/``Recall`` cannot consume raw logits whose shape differs
    from the labels. Inputs that already match in shape are passed through
    unchanged.
    """

    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        # The F1 score is derived from these two streaming metrics
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch of labels and predictions."""
        y_true = tf.convert_to_tensor(y_true)
        y_pred = tf.convert_to_tensor(y_pred)

        true_rank = y_true.shape.rank
        labels_are_sparse = true_rank == 1 or (true_rank == 2 and y_true.shape[-1] == 1)
        scores_per_class = y_pred.shape.rank == 2 and y_pred.shape[-1] not in (None, 1)
        if labels_are_sparse and scores_per_class:
            # Reduce per-class logits/probabilities to the predicted class id
            # and flatten the labels so both tensors have shape (batch,)
            y_pred = tf.argmax(y_pred, axis=-1)
            y_true = tf.reshape(y_true, [-1])

        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def reset_state(self):
        """Reset both underlying metrics."""
        self.precision.reset_state()
        self.recall.reset_state()

    def result(self):
        """Return the harmonic mean of precision and recall (0 if both are 0)."""
        precision = self.precision.result()
        recall = self.recall.result()
        # divide_no_nan returns 0 instead of dividing by zero
        return tf.math.divide_no_nan(2 * precision * recall, precision + recall)


In [None]:
import numpy as np

# Learning rate scheduling
from tensorflow.keras.optimizers.schedules import PolynomialDecay

batch_size = 8
num_epochs = 20
# Total number of optimizer steps = steps per epoch * number of epochs.
# Keras also runs the final, smaller batch when the data does not divide evenly
# by the batch size, so steps per epoch must be rounded UP. Rounding down would
# make the schedule hit the end learning rate of 0 before training finishes.
steps_per_epoch = (len(data) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
    )

# Adam optimizer driven by the decaying learning-rate schedule
from tensorflow.keras.optimizers import Adam
opt = Adam(learning_rate=lr_scheduler)

# The classifier outputs logits, so the loss must be computed from logits.
# Always check that the loss matches what the model outputs (logits vs. probs).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# model_class.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Compile the model
model_class.compile(optimizer=opt,
                    loss=loss,
                    # Metrics reported during training; must be passed as a list
                    metrics=['accuracy']
                    )

In [None]:
# print(syn_sms[0])
# tokenizer_class.convert_ids_to_tokens(batch['input_ids'][0])

In [None]:
# Train the model with OG dataset
# Tokenize the original messages into a plain dict of padded TF tensors
og_batch = dict(tokenizer_class(data, padding=True, truncation=True, return_tensors="tf"))
labels = tf.convert_to_tensor(sms_labels)

# model_class.train_on_batch(batch, labels)

model_class.fit(
    og_batch,
    labels,
    # NOTE(review): validation reuses the exact training inputs and labels, so
    # the reported val_* values only mirror the training fit; a held-out split
    # is needed to measure generalization
    validation_data=(og_batch, labels),
    batch_size=batch_size,
    epochs=num_epochs
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1e1016afd0>

In [None]:
# Making predictions using og data

# Pass the whole encoding (input_ids AND attention_mask), exactly as in
# training, so that padding positions are masked out during inference
preds = model_class.predict(og_batch)['logits']
# Softmax is monotonic, so the argmax of the probabilities equals the argmax of the logits
probs = tf.nn.softmax(preds)
class_preds = np.argmax(probs, axis=1)
# print(preds.shape, class_preds.shape)
print(class_preds)
print(labels)

[0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1]
tf.Tensor([0 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1], shape=(20,), dtype=int32)


In [None]:
# Making predictions using syn data

syn_batch = dict(tokenizer_class(syn_data, padding=True, truncation=True, return_tensors="tf"))

# Pass the whole encoding (input_ids AND attention_mask), exactly as in
# training, so that padding positions are masked out during inference
syn_preds = model_class.predict(syn_batch)['logits']
syn_probs = tf.nn.softmax(syn_preds)
syn_class_preds = np.argmax(syn_probs, axis=1)
# Print the predictions for the synthetic texts (not the ones for the original data)
print(syn_class_preds)
print(labels)

[0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1]
tf.Tensor([0 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1], shape=(20,), dtype=int32)


In [None]:
# Benchmarking our model using metrics

from datasets import load_metric

# The GLUE "mrpc" metric configuration reports accuracy and F1
metric = load_metric("glue", "mrpc")
# Score the class predictions made on the synthetic texts against the labels
# of the original messages
metric.compute(predictions=syn_class_preds, references=labels)

{'accuracy': 0.85, 'f1': 0.7692307692307693}