# Tokenizers, LLMs and heads

In [21]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer, AutoModel, AutoModelForCausalLM)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
import pandas as pd

## Tokenizers

Let's use the gpt2 model as an example

In [23]:
# Load the gpt2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load the gpt2 model with the text generation head
model_with_task_head = AutoModelForCausalLM.from_pretrained("gpt2")

### Try out the loaded tokenizer

In [31]:
# Encoding can be done with encode method or via calling the tokenizer callable
input_text = "Hello, this is my test sentence."
encoded_input = tokenizer.encode(input_text)

print(encoded_input)
print(tokenizer(input_text))

# Decoding can be done with the decode method
print(tokenizer.decode(encoded_input))

[15496, 11, 428, 318, 616, 1332, 6827, 13]
{'input_ids': [15496, 11, 428, 318, 616, 1332, 6827, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
Hello, this is my test sentence.


### Try out the loaded model

In [34]:
input_text = "Hello, this is my test sentence and I want to continue it with"

# Inference can be done by calling .generate method of the model
model_output = model_with_task_head.generate(**tokenizer(input_text, return_tensors="pt"), max_new_tokens=20)
print("Model output: ", tokenizer.decode(model_output[0]))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Model output:  Hello, this is my test sentence and I want to continue it with a little bit of explanation.

The first thing you need to do is to create a new


### TODO
Try out the gpt2 model. What do you think about the quality of its output?

### Try out the different gpt2 based models

In [39]:
# Load the gpt2 base model without task specific head
base_model = AutoModel.from_pretrained("gpt2")

# Try calling the model. Note: We call the model directly with the input ids instead of the .generate method now.
input_text = "Hello, this is my test sentence and I want to continue it with"
base_model_output = base_model(**tokenizer(input_text, return_tensors="pt"))

base_model_output

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-9.3390e-06, -1.4021e-01, -2.0845e-01,  ..., -1.5329e-01,
          -6.7827e-02, -1.9630e-01],
         [ 4.1949e-01,  2.3525e-01,  3.4816e-01,  ...,  4.5321e-02,
           1.5447e-01,  1.9547e-02],
         [ 4.8471e-01,  2.5748e-01,  1.2024e-01,  ...,  9.0498e-02,
           2.3781e-01,  2.8205e-01],
         ...,
         [ 2.7855e-02, -4.1928e-01, -1.3139e+00,  ..., -1.3534e-01,
           3.8286e-01,  1.1665e-01],
         [ 4.2986e-02, -1.5889e-01, -2.0881e+00,  ...,  1.5083e-01,
           1.3882e-01,  1.7159e-01],
         [ 2.0086e-01, -7.9151e-02, -1.4034e+00,  ...,  1.0712e-01,
           2.7828e-01, -3.2375e-01]]], grad_fn=<ViewBackward0>), past_key_values=((tensor([[[[-1.2526,  2.3200,  0.1722,  ..., -1.0076, -0.1897,  1.3219],
          [-1.6482,  3.0222,  1.2789,  ..., -0.9078, -1.7395,  2.4237],
          [-2.0344,  2.3477,  1.8740,  ..., -1.2935, -1.8629,  2.1281],
          ...,
          [-2.2450,

### TODO
Explain to your pair the difference between the above `model_with_task_head` and `base_model`.

### TODO exploration
* Try out different tokenizer. Do you notice some differences? You can try for instance "gpt2", "distilbert-base-uncased" or "facebook/galactica-1.3b" and convert text to tokens and tokens back to text.
* How do you find gpt2 based models that are fine-tuned to different tasks? Try some of those out.