# TAHLR Week 13: A Tour of Transformer Applications

Code notebook for the TAHLR course at ISAW (Fall 2023), based on Tunstall et al. 2022, *Natural Language Processing with Transformers*, Ch. 1 ("Hello Transformers") and Ch. 2 ("Text Classification").

In [None]:
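# Installs (uncomment to run)
# NB: later cells also import torch, scikit-learn (sklearn) and umap-learn (umap); install them too if missing
# !pip install "transformers[sentencepiece]" datasets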

# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## A Tour of Transformer Applications

In [None]:
# Sample customer complaint; every pipeline demo below takes this string as its input.
text = """Dear Amazon, last week I ordered an Optimus Prime action figure from your online store in Germany. Unfortunately, when I opened the package, I discovered to my horror that I had been sent an action figure of Megatron instead! As a lifelong enemy of the Decepticons, I hope you can understand my dilemma. To resolve the issue, I demand an exchange of Megatron for the Optimus Prime figure I ordered. Enclosed are copies of my records concerning this purchase. I expect to hear from you soon. Sincerely, Bumblebee."""

In [None]:
# classification
# Build a text-classification pipeline. No model is named here, so whichever
# checkpoint the library uses by default for this task is loaded.

from transformers import pipeline

classifier = pipeline("text-classification")


In [None]:
# NB: "By default, the text-classification pipeline uses a model that’s designed for sentiment analysis"
# Run the classifier on the sample letter and display the result as a table.
# NOTE: the name `outputs` is reassigned by each of the demo cells that follow.

outputs = classifier(text)
pd.DataFrame(outputs)

In [None]:
# ner tagging (named-entity recognition)
# aggregation_strategy="simple" is forwarded to the pipeline; presumably it
# merges word pieces into whole entity spans (confirm in the transformers docs).

ner_tagger = pipeline("ner",aggregation_strategy="simple")
outputs = ner_tagger(text)
pd.DataFrame(outputs)

In [None]:
# (extractive) question answering
# The reader receives a question plus the letter as its context.

reader = pipeline("question-answering")
question = "What does the customer want?"
outputs = reader(question=question, context=text)
# The result is wrapped in a list before building the table, presumably
# because a single answer comes back as one record rather than a list.
pd.DataFrame([outputs])


In [None]:
# summarization
# Summarize the letter in at most 45 tokens.
#
# `min_length` is passed explicitly and kept below `max_length`. With only
# `max_length=45`, the minimum length comes from the default checkpoint's
# generation settings; if that minimum exceeds 45, the length constraints are
# infeasible, the library warns, and the summary is cut off mid-sentence.

summarizer = pipeline("summarization")
outputs = summarizer(text, min_length=10, max_length=45,
                     clean_up_tokenization_spaces=True)
print(outputs[0]['summary_text'])

In [None]:
# translation
# English-to-German translation; unlike the cells above, the checkpoint is
# named explicitly.

translator = pipeline("translation_en_to_de",
                      model="Helsinki-NLP/opus-mt-en-de")
# NOTE(review): min_length=100 presumably keeps the translation of the whole
# letter from being cut short — confirm against the generation docs.
outputs = translator(text, clean_up_tokenization_spaces=True, min_length=100)
print(outputs[0]['translation_text'])

In [None]:
# text generation
# Let a language model continue a customer-service reply that is appended to
# the original letter.

from transformers import set_seed

# Text generation may sample tokens, in which case the continuation changes on
# every run. Fixing the seed makes the printed output reproducible.
set_seed(42)

generator = pipeline("text-generation")
response = "Dear Bumblebee, I am sorry to hear that your order was mixed up."
prompt = text + "\n\nCustomer service response:\n" + response
# NB: max_length bounds the total sequence, i.e. prompt tokens plus new tokens.
outputs = generator(prompt, max_length=200)
print(outputs[0]['generated_text'])

## More text classification

In [None]:
# Load dataset
# Fetch the dataset registered under the name "emotion"; the cells below read
# its "train" and "validation" splits.

from datasets import load_dataset

emotions = load_dataset("emotion")

In [None]:
# observe data
# Print the size of the training split and its first example.

train_ds = emotions["train"]
print(len(train_ds))
print(train_ds[0])

In [None]:
# add to dataframe
# Switch the dataset's output format to pandas so that slicing the whole train
# split yields a DataFrame. The format is reset again in a later cell.

emotions.set_format(type="pandas")
df = emotions["train"][:]
df.head()

In [None]:
# add labels

def label_int2str(row):
    """Translate an integer class id into its label name.

    Looks up the ``label`` feature of the train split of the notebook-level
    ``emotions`` dataset and delegates to that feature's ``int2str`` method.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

# Add a human-readable label column next to the integer `label` column.
df["label_name"] = df["label"].apply(label_int2str)
df.head()


In [None]:
# look at class balance
# Horizontal bar chart of how many training examples carry each label,
# ordered from least to most frequent.

df["label_name"].value_counts(ascending=True).plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# look at text length
# Count whitespace-separated words per text and compare the distributions
# across labels with a box plot (outliers hidden via showfliers=False).

df["Words Per Tweet"] = df["text"].str.split().apply(len)
df.boxplot("Words Per Tweet", by="label_name", grid=False,
          showfliers=False, color="black")
# Blank out the super-title and x-label on the figure
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# reset format
# Undo the pandas output format set earlier, so the dataset is back in its
# default format for the tokenization cells that follow.

emotions.reset_format()

In [None]:
# look at tokenization
# Load the tokenizer that belongs to the checkpoint named in `model_ckpt`.

from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# example sentence
# Show the first training example; the next cells tokenize its text.

emotions['train'][0]

In [None]:
# example tokenization: encode the first training text
# The printed encoding includes `input_ids`, which the next cell reads.

encoded_text = tokenizer(emotions['train'][0]['text'])
print(encoded_text)

In [None]:
# example tokenization: map the input ids back to their token strings

tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# example reconstructed text
# Join the token strings back into a single string.

print(tokenizer.convert_tokens_to_string(tokens))

In [None]:
# helper function

def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer.

    Padding and truncation are both switched on. The tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
# map to dataset
# Apply `tokenize` to every split in batched mode.
# NOTE(review): batch_size=None presumably makes each split one single batch,
# so padding=True pads all texts to a common length — confirm in datasets docs.

emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
# get transformer model
# Load the pretrained model for the checkpoint and move it to the GPU when
# CUDA is available, otherwise to the CPU.

import torch
from transformers import AutoModel

# Same checkpoint name as the one used to load the tokenizer
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
# set format
# Expose only the listed columns, in "torch" format, so that
# `extract_hidden_states` can call `.to(device)` on each batch value.

emotions_encoded.set_format("torch",
                            columns=["input_ids", "attention_mask", "label"])

In [None]:
# extract hidden states
# NB: distilbert is a masked-language model; we want to get the last hidden state and replace the "head" (i.e. the final layer) with a six-class classifier

def extract_hidden_states(batch):
    """Return the first-position vector of the last hidden state for a batch.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: mapping of column name to value; only the keys listed in
            ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        A dict with the single key ``"hidden_state"`` holding a NumPy array.
    """
    # Keep only the fields the model accepts and move them to `device`
    # (which may be the CPU, not necessarily a GPU).
    accepted = tokenizer.model_input_names
    inputs = {}
    for name, value in batch.items():
        if name in accepted:
            inputs[name] = value.to(device)
    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    # Index 0 on the second axis selects the vector for the [CLS] token
    cls_vectors = hidden[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

# Run the extraction over every split in batched mode; this adds a
# "hidden_state" column, which the train/val split cell reads.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

In [None]:
# get train/val split
# Build NumPy feature matrices (hidden states) and target vectors (labels)
# from the "train" and "validation" splits.

X_train = np.array(emotions_hidden["train"]["hidden_state"])
X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])
y_valid = np.array(emotions_hidden["validation"]["label"])
# Display both shapes as a quick sanity check
X_train.shape, X_valid.shape

In [None]:
# Visualize the hidden states in 2 dimensions with UMAP; plot
# One hexbin panel per emotion label shows where that class falls in the
# 2D projection of the training hidden states.

from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

# Scale features to [0,1] range
X_scaled = MinMaxScaler().fit_transform(X_train)
# Initialize and fit UMAP. UMAP is stochastic, so the seed is fixed to make
# the figure reproducible across runs.
mapper = UMAP(n_components=2, metric="cosine", random_state=42).fit(X_scaled)
# Create a DataFrame of 2D embeddings
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train

fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
# NB: `labels` is reused by the confusion-matrix cell at the end of the notebook
labels = emotions["train"].features["label"].names

for i, (label, cmap) in enumerate(zip(labels, cmaps)):
    # Rows whose integer label equals this panel's index
    df_emb_sub = df_emb.query(f"label == {i}")
    axes[i].hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap,
                   gridsize=20, linewidths=(0,))
    axes[i].set_title(label)
    # Hide the tick marks on both axes
    axes[i].set_xticks([])
    axes[i].set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
# train classifier on hidden states
# Fit a logistic-regression classifier on the training hidden states and
# report its score on the validation set.

from sklearn.linear_model import LogisticRegression

# `max_iter` is raised to give the solver more iterations to converge
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)
lr_clf.score(X_valid, y_valid)

In [None]:
# compare with dummy classifier
# Baseline using the "most_frequent" strategy; its validation score is the
# number the logistic-regression score above should be compared against.

from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid)

In [None]:
# plot confusion matrix

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a normalized confusion matrix for the given predictions.

    Args:
        y_preds: predicted class labels.
        y_true: ground-truth class labels.
        labels: display names used for the matrix's tick labels.

    The matrix is computed with ``normalize="true"`` and shown on a 6x6-inch
    figure with a blue colour map, two-decimal cell values and no colour bar.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    figure, axis = plt.subplots(figsize=(6, 6))
    cm_display = ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=labels,
    )
    cm_display.plot(cmap="Blues", values_format=".2f", ax=axis, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

# Predict on the validation set with the logistic-regression classifier and
# plot the result. NB: `labels` is the list of label names defined in the
# UMAP visualization cell.
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels)