# Installation
```bash
# create an environment
conda create --name lec4 python=3.9
conda activate lec4
# install pytorch. This one can use GPU acceleration on mac
conda install pytorch -c pytorch-nightly 
# install jupyter
conda install -n lec4 ipykernel --update-deps --force-reinstall
conda install -c anaconda jupyter
# installing huggingface libraries
conda install transformers
conda install datasets
```

# Walkthrough

In [1]:
from datasets import load_dataset
dataset = load_dataset("squad")
dataset

Reusing dataset squad (C:\Users\cjj90\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [2]:
def add_end_of_text(example):
    example['question'] =  example['question'] + '<|endoftext|>'
    return example

dataset = dataset.remove_columns(['id', 'title', 'context', 'answers'])
dataset = dataset.map(add_end_of_text)



  0%|          | 0/87599 [00:00<?, ?ex/s]

  0%|          | 0/10570 [00:00<?, ?ex/s]

In [3]:
dataset['train']['question'][:10]

['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?<|endoftext|>',
 'What is in front of the Notre Dame Main Building?<|endoftext|>',
 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?<|endoftext|>',
 'What is the Grotto at Notre Dame?<|endoftext|>',
 'What sits on top of the Main Building at Notre Dame?<|endoftext|>',
 'When did the Scholastic Magazine of Notre dame begin publishing?<|endoftext|>',
 "How often is Notre Dame's the Juggler published?<|endoftext|>",
 'What is the daily student paper at Notre Dame called?<|endoftext|>',
 'How many student news papers are found at Notre Dame?<|endoftext|>',
 'In what year did the student paper Common Sense begin publication at Notre Dame?<|endoftext|>']

In [4]:
from transformers import AutoTokenizer
model_checkpoint = "distilgpt2" 

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [5]:
sequence = ("This tokenizer is being applied in CS197 at"
            "Harvard.<|endoftext|>")
tokens = tokenizer.tokenize(sequence)
print(tokens)

['This', 'Ġtoken', 'izer', 'Ġis', 'Ġbeing', 'Ġapplied', 'Ġin', 'ĠCS', '197', 'Ġat', 'Har', 'vard', '.', '<|endoftext|>']


In [6]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[1212, 11241, 7509, 318, 852, 5625, 287, 9429, 24991, 379, 13587, 10187, 13, 50256]


In [7]:
sequence = ("This tokenizer is being applied in CS197 at"
            "Harvard.<|endoftext|>")
tokenizer(sequence)

{'input_ids': [1212, 11241, 7509, 318, 852, 5625, 287, 9429, 24991, 379, 13587, 10187, 13, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["question"], truncation=True)

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["question"]
)

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [11]:
block_size = 128

def group_texts(examples):
    # repeat concatenation for input_ids and other keys
    concatenated_examples = {k: sum(examples[k], []) for k in
                            examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    # populate each of input_ids and other keys 
    result = {
        k: [t[i : i + block_size] for i in range(0,
            total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # add labels because we'll need it as the output
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
)

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [None]:
print(lm_datasets['train']['input_ids'][0])

In [20]:
tokenizer.decode(lm_datasets['train']['input_ids'][0])

"To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?<|endoftext|>What is in front of the Notre Dame Main Building?<|endoftext|>The Basilica of the Sacred heart at Notre Dame is beside to which structure?<|endoftext|>What is the Grotto at Notre Dame?<|endoftext|>What sits on top of the Main Building at Notre Dame?<|endoftext|>When did the Scholastic Magazine of Notre dame begin publishing?<|endoftext|>How often is Notre Dame's the Juggler published?<|endoftext|>What is the daily student paper at Notre Dame called?<|endoftext|>How many student news papers are found at Notre Dame?<|endoftext|>In what year did the student paper"

In [29]:
tokenizer.decode(lm_datasets['train']['labels'][0])

"To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?<|endoftext|>What is in front of the Notre Dame Main Building?<|endoftext|>The Basilica of the Sacred heart at Notre Dame is beside to which structure?<|endoftext|>What is the Grotto at Notre Dame?<|endoftext|>What sits on top of the Main Building at Notre Dame?<|endoftext|>When did the Scholastic Magazine of Notre dame begin publishing?<|endoftext|>How often is Notre Dame's the Juggler published?<|endoftext|>What is the daily student paper at Notre Dame called?<|endoftext|>How many student news papers are found at Notre Dame?<|endoftext|>In what year did the student paper"

In [None]:
small_train_dataset = \
    lm_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = \
    lm_datasets["validation"].shuffle(seed=42).select(range(100))

In [27]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

In [None]:
training_args = TrainingArguments(
    f"{model_checkpoint}-squad",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

trainer.train()

In [None]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
tokenizer.save_pretrained(f"{model_checkpoint}-squad")
model.push_to_hub(f"{model_checkpoint}-squad")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(f"rajpurkar/{model_checkpoint}-squad")
tokenizer = AutoTokenizer.from_pretrained(f"rajpurkar/{model_checkpoint}-squad")


In [28]:
start_text = ("A speedrun is a playthrough of a video game, \
or section of a video game, with the goal of \
completing it as fast as possible. Speedruns \
often follow planned routes, which may incorporate sequence \
breaking, and might exploit glitches that allow sections to \
be skipped or completed more quickly than intended. ")

prompt = "What is the"
inputs = tokenizer(
     start_text + prompt,
     add_special_tokens=False,
     return_tensors="pt")["input_ids"]

prompt_length = len(tokenizer.decode(inputs[0]))
outputs = model.generate(
     inputs,
     max_length=100,
     do_sample=True,
     top_k=50,
     top_p=0.95,
     temperature=0.9,
     num_return_sequences=3)

generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1:]

print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A speedrun is a playthrough of a video game, or section of a video game, with the goal of completing it as fast as possible. Speedruns often follow planned routes, which may incorporate sequence breaking, and might exploit glitches that allow sections to be skipped or completed more quickly than intended. What is the quickest way to do speedruns?





There are two main ways to do speedruns:
Start with a set amount of speedrun and run to the fastest possible
