[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1L0RAspJNYSjP6chXPvGprxbiVvSrFXE4?usp=sharing)

# Generating images from text

Based on HuggingFace notebook (https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)

In [None]:
!nvidia-smi

## Generate an image

In [None]:
!pip install diffusers==0.2.4
!pip install transformers scipy ftfy datasets
!pip install "ipywidgets>=7,<8"

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

from huggingface_hub import notebook_login

notebook_login()

In [None]:
import torch
from diffusers import StableDiffusionPipeline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# make sure you're logged in with `huggingface-cli login`
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, use_auth_token=True).to(device) 

In [None]:
from torch import autocast

# Text prompt that conditions the image generation.
prompt = "A headshot of a man in his twenties with short dark hair"

# Run the pipeline inside torch's autocast context; the result is indexed with
# "sample" and only the first generated image is kept.
with autocast("cuda"):
  image = pipe(prompt)["sample"][0]

# NOTE(review): the file name does not describe the prompt above — presumably a
# leftover from the upstream notebook; rename it if the file is consumed elsewhere.
image.save("astronaut_rides_horse.png")
image

## Keep generating the same image

In [None]:
generator = torch.Generator(device).manual_seed(1)

with autocast('cuda'):
  image = pipe(prompt, generator=generator)['sample'][0]

image

## Generate a grid of images

In [None]:
from PIL import Image

def image_grid(imgs, rows, cols):
    """Paste equally sized images into one ``rows`` x ``cols`` grid image.

    Images are laid out in row-major order: image ``i`` is placed in row
    ``i // cols`` and column ``i % cols``. The cell size is taken from the
    first image.

    Args:
        imgs: Sequence of PIL images; its length must equal ``rows * cols``.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)``, where ``(w, h)`` is
        the size of ``imgs[0]``.

    Raises:
        AssertionError: If ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows * cols, (
        f"expected {rows * cols} images for a {rows}x{cols} grid, got {len(imgs)}"
    )

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))

    for i, img in enumerate(imgs):
        # Row-major placement: the quotient selects the row, the remainder the column.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col * w, row * h))
    return grid

num_cols = 2
num_rows = 2

# Build the batch under a new name so the original `prompt` string stays intact
# and re-running this cell does not nest the list one level deeper each time.
prompts = [prompt] * num_cols

# Each pipeline call produces one row of `num_cols` images.
all_images = []
for _ in range(num_rows):
  with autocast("cuda"):
    images = pipe(prompts)["sample"]
  all_images.extend(images)

grid = image_grid(all_images, rows=num_rows, cols=num_cols)
grid

# Tokenizers for Text

## Working with the Hugging Face library

**We want to use the same weights for our model and tokenizer. How can we use the BERT uncased checkpoint ('bert-base-uncased') for our tokenizer?**

In [None]:
from transformers import AutoTokenizer
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

**How can we determine how large the vocabulary is?**

In [None]:
# Preview a handful of (token, id) pairs; printing the whole vocabulary floods the cell output.
print(list(tokenizer.vocab.items())[:10])
print(f'The vocabulary size is {len(tokenizer.vocab)}')

**Convert the following sentence into**
1. Tokens
2. Numerical IDs

In [None]:
sentence = 'I like NLP'
print(sentence)
tokens = tokenizer.tokenize(sentence)
print(tokens)
ids = tokenizer.encode(sentence)
print(ids)
print(tokenizer.decode(ids))


In [None]:
print(f'{tokenizer.cls_token} -> {tokenizer.cls_token_id}')
print(f'{tokenizer.sep_token} -> {tokenizer.sep_token_id}')

In [None]:
'😀' in tokenizer.vocab

In [None]:
sentence = 'I like NLP😀'
tokenizer.tokenize(sentence)

In [None]:
first_sentence = 'I like NLP.'
second_sentence = 'What about you?'
input = tokenizer(first_sentence, second_sentence, return_tensors='pt')
input

In [None]:
input['input_ids']

In [None]:
input['token_type_ids']

In [None]:
input['attention_mask']

In [None]:
first_sentence = 'I like NLP.'
second_sentence = 'What are your thoughts on the subject?'
input = tokenizer([first_sentence, second_sentence], padding=True, return_tensors='pt')
input['attention_mask']

# Text classification - IMDB Dataset

## Datasets library

In [None]:
from datasets import list_datasets
# Displaying every dataset name floods the output; show the count and a short preview instead.
available_datasets = list_datasets()
print(f'{len(available_datasets)} datasets available')
available_datasets[:10]

In [None]:
from datasets import load_dataset

imdb = load_dataset("imdb")
imdb

- Similar to a python dictionary, where each key corresponds to a different split

In [None]:
imdb['train'][0]

In [None]:
imdb['test'][:3]

In [None]:
imdb['train'] = imdb['train'].shuffle(seed=1).select(range(2000))
imdb['train']

In [None]:
# Fix the seed so the train/validation split is identical on every run,
# matching the seeded shuffles used elsewhere in this notebook.
imdb_train_validation = imdb['train'].train_test_split(train_size=0.8, seed=1)
imdb_train_validation

In [None]:
imdb_train_validation['test']

In [None]:
imdb_train_validation['validation'] = imdb_train_validation.pop('test')
imdb_train_validation

In [None]:
imdb.update(imdb_train_validation)
imdb

In [None]:
imdb['test'] = imdb['test'].shuffle(seed=1).select(range(400))
imdb['test']

In [None]:
imdb['unsupervised'][:3]

In [None]:
imdb.pop('unsupervised')
imdb

## Overview of IMDB Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Use the fully qualified option name rather than relying on abbreviated-key matching.
pd.set_option('display.max_colwidth', 250)

In [None]:
imdb.set_format('pandas')
df = imdb['train'][:]
df.sample(frac=1 ,random_state=1).head(10)

In [None]:
df.loc[0, 'text']

In [None]:
# Replace HTML line breaks with a space (not an empty string) so the words on
# either side of a break are not glued together, which would skew the word
# counts computed below. `regex=False` treats the tag as a literal string.
df['text'] = df.text.str.replace('<br />', ' ', regex=False)
df.loc[0, 'text']

In [None]:
df.label.value_counts()

In [None]:
df["Words per review"] = df["text"].str.split().apply(len)
df.boxplot("Words per review", by="label", grid=False, showfliers=False,
           color="black")
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# 0 is negative
# 1 is positive
df[df.text.str.len() < 200]

In [None]:
imdb.reset_format()

## Tokenizer

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-cased"
#checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(batch):
    """Tokenize the ``"text"`` entries of a batch with the notebook's ``tokenizer``.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

imdb_encoded = imdb.map(tokenize_function, batched=True, batch_size=None)
imdb_encoded

In [None]:
print(imdb_encoded['train'][0])

## Tiny IMDB

In [None]:
import transformers
import re

[x for x in dir(transformers) if re.search(r'^AutoModel', x)]

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = 2
model = (AutoModelForSequenceClassification
         .from_pretrained(checkpoint, num_labels=num_labels)
         .to(device))

In [None]:
from datasets import DatasetDict

tiny_imdb = DatasetDict()
tiny_imdb['train'] = imdb['train'].shuffle(seed=1).select(range(50))
tiny_imdb['validation'] = imdb['validation'].shuffle(seed=1).select(range(10))
tiny_imdb['test'] = imdb['test'].shuffle(seed=1).select(range(10))

tiny_imdb_encoded = tiny_imdb.map(tokenize_function, batched=True, batch_size=None)
tiny_imdb_encoded

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(tiny_imdb_encoded["train"]) // batch_size
model_name = f"{checkpoint}-finetuned-tiny-imdb"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  optim='adamw_torch'
                                  )
training_args

In [None]:
from transformers import Trainer

torch.cuda.empty_cache()

trainer = Trainer(model=model, 
                  args=training_args, 
                  train_dataset=tiny_imdb_encoded["train"],
                  eval_dataset=tiny_imdb_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train();

In [None]:
preds = trainer.predict(tiny_imdb_encoded['test'])
preds

In [None]:
preds.predictions.shape

In [None]:
preds.predictions.argmax(axis=-1)

In [None]:
preds.label_ids

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(preds.label_ids, preds.predictions.argmax(axis=-1))

In [None]:
def get_accuracy(preds):
  """Compute classification accuracy from a prediction output.

  Args:
    preds: Object exposing ``predictions`` (per-class scores with the classes
      on the last axis) and ``label_ids`` (the true labels).

  Returns:
    dict: ``{'accuracy': <score>}``, in the shape expected for
    ``compute_metrics`` where this function is passed to ``Trainer``.
  """
  # The predicted class is the index of the highest score on the last axis.
  predictions = preds.predictions.argmax(axis=-1)
  labels = preds.label_ids
  return {'accuracy': accuracy_score(labels, predictions)}


In [None]:
from transformers import Trainer

torch.cuda.empty_cache()

trainer = Trainer(model=model, 
                  compute_metrics=get_accuracy,
                  args=training_args, 
                  train_dataset=tiny_imdb_encoded["train"],
                  eval_dataset=tiny_imdb_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train();

## Training run

In [None]:
batch_size = 16
logging_steps = len(imdb_encoded["train"]) // batch_size
model_name = f"{checkpoint}-finetuned-imdb"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  optim='adamw_torch'
                                  )

In [None]:
from transformers import Trainer

# Start the full run from the pretrained checkpoint: `model` was already
# fine-tuned by the Tiny IMDB trainers above, and reusing it would make this
# run's metrics depend on that earlier training.
model = (AutoModelForSequenceClassification
         .from_pretrained(checkpoint, num_labels=num_labels)
         .to(device))

torch.cuda.empty_cache()

trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=get_accuracy,
                  train_dataset=imdb_encoded["train"],
                  eval_dataset=imdb_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train();

In [None]:


trainer.evaluate()

In [None]:
trainer.save_model()

In [None]:
model_name

In [None]:
from transformers import pipeline
classifier = pipeline('text-classification', model=model_name)
classifier('This is not my idea of fun')

In [None]:
classifier('This was beyond incredible')

# Vision Transformers

## Exploratory Data Analysis

### Datasets
The CIFAR-10 dataset is a well-known image dataset

(If you are not familiar with the datasets library, use this tutorial to help you: https://huggingface.co/docs/datasets/tutorial)

In [None]:
# Load the dataset
# HINT: Try help(datasets.load_dataset)
import datasets

cifar = ... # complete this

#### Hint


Try help(datasets.load_dataset)

### View sample images

Datasets are very similar in structure to python dictionaries.


In [None]:
#Display the first 5 images

#### Hint


Try help(display)

### Questions on dataset

- How many images are there in the dataset?
- What information is available with each sample of the dataset?
- How many train samples do we have?
- How many validation and test samples do we have?
- How many labels are there for this dataset? What are they?

In [None]:
# Create a list, labels, that contains all the different label names
labels = ... # Complete this

#### Hint:


In [None]:
cifar['train'].features

### Reducing the size of the dataset

Reduce the size of the dataset so that we have:
- train - 5000
- test - 500

In [None]:
#cifar['train'] = ...
#cifar['test'] = ...

Now create a validation dataset, using 20% from the train dataset

### Problem: Create a mapping between labels and IDs

Each of the numeric labels has an associated text description.

In [None]:
# Create a mapping between the numeric label and the text description
id2label = ... # Complete this

In [None]:
# Now do the reverse, create a mapping between the text description and the numeric label
label2id = ... # Complete this

#### Hint:

Consider using a dictionary comprehension

### Pre-processing images

When working with text data, we need to tokenize the text so that it is in a numerical form that a model can work with. A feature extractor plays a similar role to a tokenizer, but it prepares images rather than text.

- What are some of the pre-processing steps you might be interested in, when working with images?

In [None]:
#Select a relevant feature extractor
checkpoint = 'google/vit-base-patch16-224'

feature_extractor = ... # complete this

In [None]:
import torchvision

from torchvision.transforms import (
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    ToTensor,
    Resize,
    CenterCrop
)


In the next section we apply a few transformations to the images, such as a random resized crop and a random horizontal flip.
- Why do we do these?

In [None]:
# Perform image normalization
normalize = ... # Complete this

In [None]:
train_transform = Compose(
    [
     RandomResizedCrop(feature_extractor.size),
     RandomHorizontalFlip(),
     ToTensor(),
     normalize
    ]
)

validation_transform = Compose(
        [
            Resize(feature_extractor.size),
            CenterCrop(feature_extractor.size),
            ToTensor(),
            normalize,
        ]
    )

def train_transform_images(images):
  """Add a ``pixel_values`` entry with the training-transformed images of a batch.

  Every item in ``images["img"]`` is converted to RGB and passed through
  ``train_transform``. The batch is updated in place and returned.
  """
  pixel_values = []
  for picture in images["img"]:
    rgb_picture = picture.convert("RGB")
    pixel_values.append(train_transform(rgb_picture))
  images["pixel_values"] = pixel_values
  return images

def validation_transform_images(images):
  """Add a ``pixel_values`` entry with the validation-transformed images of a batch.

  Every item in ``images["img"]`` is converted to RGB and passed through
  ``validation_transform``. The batch is updated in place and returned.
  """
  rgb_images = (picture.convert("RGB") for picture in images["img"])
  images["pixel_values"] = list(map(validation_transform, rgb_images))
  return images

In [None]:
#transformed_cifar['train'].set_transform(train_transform_images)
#transformed_cifar['validation'].set_transform(validation_transform_images)
#transformed_cifar['test'].set_transform(validation_transform_images)

In [None]:
# Build `transformed_cifar` from `cifar` with the training transform attached,
# then assign each split explicitly so that 'validation' and 'test' use the
# validation transform instead.
# NOTE(review): assumes `cifar` already has a 'validation' split (created in
# the exercise above) — confirm before running.
transformed_cifar = cifar.with_transform(train_transform_images)
transformed_cifar['train'] = cifar['train'].with_transform(train_transform_images)
transformed_cifar['validation'] = cifar['validation'].with_transform(validation_transform_images)
transformed_cifar['test'] = cifar['test'].with_transform(validation_transform_images)

In [None]:
transformed_cifar['train'][:2]

### A transformed image

In [None]:
# Display a sample image from your training dataset
sample_image = ...

In [None]:
# Now apply the transformations and show what this transformed image looks like
transformed_sample_image = ...

In [None]:
transformed_sample_image.shape

In [None]:
sample_image = cifar['train'][0]['img']
sample_image

### Getting images in the right format

**4-images**

In [None]:
four_images = [transformed_cifar['train'][i] for i in range(4)]
four_images

In [None]:
print(four_images[0]['pixel_values'].shape, four_images[1]['pixel_values'].shape, four_images[2]['pixel_values'].shape, four_images[3]['pixel_values'].shape)

In [None]:
four_images_labels = [image['label'] for image in four_images]
four_images_labels

- Now we know these need to be converted to tensors

In [None]:
# Convert the list of labels (`four_images_labels`) to a tensor
#four_images_labels = ...

- Now we try and do the same for the pixel_values

In [None]:
# Demonstration: building a batch with torch.tensor() from a list of image
# tensors is expected to fail. Catch and show the error so the notebook can
# still be run top to bottom.
try:
  four_images_pixel_values = torch.tensor([image['pixel_values'] for image in four_images])
except (ValueError, TypeError, RuntimeError) as error:
  four_images_pixel_values = None
  print(f'torch.tensor() could not build the batch: {error}')
four_images_pixel_values

In [None]:
four_images_pixel_values = torch.cat([image['pixel_values'] for image in four_images])
four_images_pixel_values

In [None]:
four_images_pixel_values.shape

In [None]:
# Now get the four_images_pixel_values in the right format
four_images_pixel_values = ...

Let's put this all together in a collate function
- Why do we have 'labels' and not 'label' ?

In [None]:
from torch.utils.data import DataLoader

def collate_fn(images):
  """Collate a list of dataset examples into a single batch dict.

  Args:
    images: Sequence of examples, each providing a ``'label'`` and a
      ``'pixel_values'`` tensor.

  Returns:
    dict with ``'pixel_values'`` (the example tensors stacked along a new
    leading axis) and ``'labels'`` (a tensor built from the example labels).
  """
  label_list = [example['label'] for example in images]
  pixel_list = [example['pixel_values'] for example in images]
  batch_labels = torch.tensor(label_list)
  batch_pixels = torch.stack(pixel_list)
  return {
      'pixel_values': batch_pixels,
      'labels': batch_labels,
  }

train_dataloader = DataLoader(transformed_cifar['train'], batch_size=4, collate_fn=collate_fn, shuffle=True)
validation_dataloader = DataLoader(transformed_cifar['validation'], batch_size=4, collate_fn=collate_fn, shuffle=False)
test_dataloader = DataLoader(transformed_cifar['test'], batch_size=4, collate_fn=collate_fn, shuffle=False)

In [None]:
batch = next(iter(train_dataloader))

for key, value in batch.items():
  print(key, value.shape)

### Using a pre-trained model

When looking at text classification, we used a BERT model which is very similar to a Vision Transformer model. 

In [None]:
#Select a relevant model
checkpoint = 'google/vit-base-patch16-224'

model = ... # complete this

## Training arguments

In [None]:
from transformers import TrainingArguments, Trainer

batch_size=32
metric_name = "accuracy"

args = TrainingArguments(
    f"cifar-10",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='logs',
    remove_unused_columns=False,
)

## Compute metrics - accuracy

In [None]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the notebook's ``metric``.

    The predicted class is the argmax of the predictions along axis 1; the
    result of ``metric.compute`` is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    metric_inputs = {"predictions": predicted_classes, "references": references}
    return metric.compute(**metric_inputs)

## Model training

In [None]:
import torch

model_name = f"{checkpoint}-finetuned-cifar10"
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=transformed_cifar['train'],
    eval_dataset=transformed_cifar['validation'],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor
)

trainer.evaluate()

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/

In [None]:
trainer.train()

In [None]:
trainer.save_model()

## Inference

In [None]:
c = load_dataset('cifar10')
c

In [None]:
test_image = c['test'][-1]['img']
test_image