In [12]:
# # Keras NLP / Hugging Face NLP Assignment
#
# **a) Inference with a pretrained classifier**
# **b) Fine-tuning a pretrained backbone (sentiment analysis)**
# **c) Building & training your own Transformer from scratch**
#
# We’ll use both Hugging Face’s 🤗 Transformers and TensorFlow / Keras Hub.

# %%
# ## 0) Install dependencies
!pip install -q tensorflow tensorflow-text keras_nlp transformers datasets

# %%
# ## 1) Inference with a pretrained classifier
# Here we’ll use Hugging Face’s pipeline for sentiment analysis.

from transformers import pipeline

# Name the checkpoint and revision explicitly. When no model is given the pipeline
# falls back to a task default and warns that this is not recommended; the values
# below are the ones that default resolved to in the recorded run.
SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
SENTIMENT_REVISION = "714eb0f"

sentiment = pipeline(
    "sentiment-analysis",
    model=SENTIMENT_MODEL,
    revision=SENTIMENT_REVISION,
)
examples = [
    "I absolutely loved the new Batman movie!",
    "The service at the restaurant was terrible..."
]
# Classify each sentence and print it next to the pipeline's result.
for text in examples:
    print(text, "→", sentiment(text))
# %%
# %%
# ## 2) Fine-tuning a pretrained backbone for sentiment analysis
# We’ll grab a small subset of IMDB from 🤗 Datasets and fine-tune a BERT model in TensorFlow.
# NOTE(review): this fragment repeats the preamble of the following cell (same
# imports, same `num_batches_to_take` value). Within the visible notebook it is
# where the `tf_keras` module name is first imported; the following cell uses
# `tf_keras.losses` and `Adam`.

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
# Requires the packages from the pip-install cell to be installed first.
from datasets import load_dataset
# `tf_keras` supplies the optimizer imported on the next line.
import tf_keras
from tf_keras.optimizers import Adam

# Number of batches to take for training and validation. The following cell
# assigns the same name again with the same value.
num_batches_to_take = 10


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


I absolutely loved the new Batman movie! → [{'label': 'POSITIVE', 'score': 0.999868631362915}]
The service at the restaurant was terrible... → [{'label': 'NEGATIVE', 'score': 0.9994834661483765}]


In [13]:
# %%
# ## 2) Fine-tuning a pretrained backbone for sentiment analysis
# We’ll grab a small subset of IMDB from 🤗 Datasets and fine-tune a BERT model in TensorFlow.

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
# Ensure this cell is run AFTER the cell with !pip install
from datasets import load_dataset

# 2.1 Load data
# The IMDB train split on the Hub is ordered by label, so a leading slice such as
# "train[:5%]" taken *before* shuffling contains a single class. Shuffle the full
# split first, then keep the same number of examples as before
# (5% of 25,000 = 1,250) so that both classes are represented.
ds = load_dataset("imdb", split="train").shuffle(seed=42).select(range(1250))
# Seed the split as well, so the train/validation partition is reproducible.
ds = ds.train_test_split(test_size=0.2, seed=42)
train_ds = ds["train"]
val_ds   = ds["test"]

# 2.2 Tokenizer & TF Dataset
model_name = "bert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(model_name)


def encode(batch):
    """Tokenize the "text" field of a batch.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer returns for the batch.
    """
    tokenizer_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch["text"], **tokenizer_kwargs)


# Tokenize both splits in batches.
train_ds = train_ds.map(encode, batched=True)
val_ds   = val_ds.map(encode, batched=True)

# Expose only the model inputs and the label, formatted as TensorFlow tensors.
for split_ds in (train_ds, val_ds):
    split_ds.set_format(
        type="tf",
        columns=["input_ids", "token_type_ids", "attention_mask", "label"],
    )

# 2.3 Wrap the tokenized splits as batched tf.data pipelines
FEATURE_KEYS = ["input_ids", "token_type_ids", "attention_mask"]


def _make_example_generator(dataset):
    """Return a zero-argument generator function yielding (inputs, label) pairs.

    The dataset is bound when this factory is called, so the returned generator
    keeps reading the same split even if a global such as `train_ds` is reassigned
    later (section 3 rebinds `train_ds` / `val_ds` to different data).

    Args:
        dataset: an iterable of examples, each indexable by the feature keys and "label".

    Returns:
        A callable suitable for `tf.data.Dataset.from_generator`.
    """
    def generate():
        for example in dataset:
            # Separate the model inputs from the label.
            inputs = {key: example[key] for key in FEATURE_KEYS}
            yield inputs, example["label"]

    return generate


def _to_tf_dataset(generator, batch_size=32):
    """Build a batched tf.data.Dataset from a generator of (inputs, label) pairs."""
    signature = (
        {key: tf.TensorSpec(shape=(None,), dtype=tf.int32) for key in FEATURE_KEYS},
        tf.TensorSpec(shape=(), dtype=tf.int64),  # one scalar label per example
    )
    dataset = tf.data.Dataset.from_generator(generator, output_signature=signature)
    return dataset.batch(batch_size)


# Kept under their original names (still zero-argument callables).
train_generator = _make_example_generator(train_ds)
val_generator = _make_example_generator(val_ds)

tf_train = _to_tf_dataset(train_generator)
tf_val = _to_tf_dataset(val_generator)

# 2.4 Model, optimizer and loss
# Import `tf_keras` in this cell: the loss below is referenced through the module
# name, which otherwise is only defined if an earlier cell happened to import it.
import tf_keras
from tf_keras.optimizers import Adam

# Pretrained encoder with a two-class classification head.
bert = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
opt  = Adam(learning_rate=2e-5)

bert.compile(
    optimizer=opt,
    # from_logits=True: the loss is told to expect raw logits rather than probabilities.
    loss=tf_keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# 2.5 Fine-tune
# Only the first `num_batches_to_take` batches of each pipeline are used.
num_batches_to_take = 10

train_subset = tf_train.take(num_batches_to_take)
val_subset = tf_val.take(num_batches_to_take)

bert.fit(train_subset, validation_data=val_subset, epochs=2)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x7a29e0d5e3d0>

In [14]:
# %%
# ## 3) Build & train your own Transformer from scratch
# Adapted from the Keras example: https://keras.io/examples/nlp/text_classification_with_transformer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# 3.1 Prepare a toy text dataset (here we reuse IMDB)
maxlen = 200        # sequence length passed to pad_sequences
vocab_size = 20000  # passed as num_words to the IMDB loader
batch_size = 32

(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)

# Bring every review to the same length so the arrays can be batched.
pad_sequences = keras.preprocessing.sequence.pad_sequences
x_train, x_val = pad_sequences(x_train, maxlen=maxlen), pad_sequences(x_val, maxlen=maxlen)

# One batched tf.data pipeline per split, built the same way for both.
train_ds, val_ds = (
    tf.data.Dataset.from_tensor_slices(split).batch(batch_size)
    for split in ((x_train, y_train), (x_val, y_val))
)

# 3.2 Transformer block definition
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection and
    layer normalization.

    Args:
        embed_dim: size of the last input dimension; also used as the attention
            `key_dim` and as the output width of the feed-forward network.
        num_heads: number of attention heads.
        ff_dim: width of the hidden feed-forward layer.
        rate: dropout rate applied after attention and after the feed-forward network.
        **kwargs: forwarded to `layers.Layer` (for example `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1   = layers.Dropout(rate)
        self.dropout2   = layers.Dropout(rate)

    def call(self, x, training=False):
        """Apply the block to `x` and return a tensor produced by the final layer norm.

        `training` is passed on to the attention and dropout sub-layers.
        """
        # Self-attention: the input is used as both query and value.
        attn_output = self.att(x, x, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1        = self.layernorm1(x + attn_output)  # residual + norm
        ffn_output  = self.ffn(out1)
        ffn_output  = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)       # residual + norm

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

# 3.3 Build a simple Transformer classifier
embed_dim = 32
num_heads = 2
ff_dim    = 32

inputs = layers.Input(shape=(maxlen,))

# Layers applied in order between the input and the output unit.
hidden_layers = [
    layers.Embedding(vocab_size, embed_dim),
    TransformerBlock(embed_dim, num_heads, ff_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
]

x = inputs
for layer in hidden_layers:
    # No explicit `training` argument is passed here.
    x = layer(x)

# Single sigmoid unit, paired with the binary cross-entropy loss below.
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

# 3.4 Train
model.fit(train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.7255 - loss: 0.5003 - val_accuracy: 0.8704 - val_loss: 0.3009
Epoch 2/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.9194 - loss: 0.2047 - val_accuracy: 0.8645 - val_loss: 0.3302
Epoch 3/3
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.9563 - loss: 0.1271 - val_accuracy: 0.8506 - val_loss: 0.4250


<keras.src.callbacks.history.History at 0x7a28d81a7b90>