In [1]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1b")
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-1b")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py

6288.0

In [2]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m")
model.cuda()
None

(…)a-70m/resolve/main/tokenizer_config.json: 100%|██████████| 396/396 [00:00<00:00, 2.62MB/s]


(…)I/pythia-70m/resolve/main/tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 3.75MB/s]
(…)70m/resolve/main/special_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<00:00, 738kB/s]
(…)erAI/pythia-70m/resolve/main/config.json: 100%|██████████| 567/567 [00:00<00:00, 2.04MB/s]
model.safetensors: 100%|██████████| 166M/166M [00:05<00:00, 30.3MB/s] 


In [3]:
sample_text = "Hello world"

sample_text = tokenizer(sample_text, return_tensors="pt").to("cuda")
result = model(**sample_text)

In [4]:
result.keys()

odict_keys(['logits', 'past_key_values'])

In [5]:
model.config

GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-70m",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [5]:
len(result["past_key_values"])

24

In [12]:
import torch
from tqdm import tqdm
from datasets import load_dataset


def measure_ppl(model, tokenizer):
    test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    max_length = model.config.max_position_embeddings
    stride = 512
    encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
    seq_len = encodings.input_ids.size(1)
    device = "cuda"

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    print(ppl)


# measure_ppl(
#     model=model,
#     tokenizer=tokenizer,
# )

### Copy Model Weights during training

In [23]:
measure_ppl(
    model=model,
    tokenizer=tokenizer,
)

 99%|█████████▉| 560/564 [00:13<00:00, 40.73it/s]

tensor(39.4273, device='cuda:0')





In [1]:
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", )
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [26]:
from transformers import AutoConfig

model = GPTNeoXForCausalLM._from_config(
    config=AutoConfig.from_pretrained("EleutherAI/pythia-70m")
)
model.cuda()
measure_ppl(
    model=model,
    tokenizer=tokenizer,
)

 99%|█████████▉| 560/564 [00:13<00:00, 40.87it/s]


tensor(56522.4023, device='cuda:0')


In [39]:
import torch

optimizer = torch.optim.Adam(params=model.parameters())

max_length = model.config.max_position_embeddings
stride = 512
encodings = tokenizer("\n\n".join(dataset["text"]), return_tensors="pt")
seq_len = encodings.input_ids.size(1)
device = "cuda"

nlls = []
prev_end_loc = 0
step = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    step += 1
    optimizer.zero_grad()
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    # with torch.no_grad():
    outputs = model(input_ids, labels=target_ids)

    # loss is calculated using CrossEntropyLoss which averages over valid labels
    # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
    # to the left by 1.
    neg_log_likelihood = outputs.loss
    neg_log_likelihood.backward()
    optimizer.step()
    nlls.append(neg_log_likelihood)
    if step % 100 == 0:
        mean_ppl = torch.exp(torch.stack(nlls).mean())
        print(mean_ppl)
    

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
print(ppl)

  2%|▏         | 101/4750 [00:08<10:11,  7.61it/s]

tensor(490.4781, device='cuda:0', grad_fn=<ExpBackward0>)


  4%|▍         | 203/4750 [00:15<05:47, 13.08it/s]

tensor(479.2771, device='cuda:0', grad_fn=<ExpBackward0>)


  6%|▋         | 303/4750 [00:23<05:39, 13.10it/s]

tensor(472.1385, device='cuda:0', grad_fn=<ExpBackward0>)


  8%|▊         | 403/4750 [00:31<05:32, 13.07it/s]

tensor(489.5456, device='cuda:0', grad_fn=<ExpBackward0>)


 11%|█         | 503/4750 [00:38<05:23, 13.13it/s]

tensor(480.5203, device='cuda:0', grad_fn=<ExpBackward0>)


 13%|█▎        | 603/4750 [00:46<05:16, 13.11it/s]

tensor(482.3401, device='cuda:0', grad_fn=<ExpBackward0>)


 15%|█▍        | 703/4750 [00:54<05:08, 13.11it/s]

tensor(491.6689, device='cuda:0', grad_fn=<ExpBackward0>)


 17%|█▋        | 803/4750 [01:01<05:00, 13.11it/s]

tensor(497.9970, device='cuda:0', grad_fn=<ExpBackward0>)


 19%|█▉        | 903/4750 [01:09<04:53, 13.12it/s]

tensor(503.1134, device='cuda:0', grad_fn=<ExpBackward0>)


 21%|██        | 1003/4750 [01:16<04:45, 13.12it/s]

tensor(502.0360, device='cuda:0', grad_fn=<ExpBackward0>)


 23%|██▎       | 1103/4750 [01:24<04:37, 13.12it/s]

tensor(509.8473, device='cuda:0', grad_fn=<ExpBackward0>)


 25%|██▌       | 1203/4750 [01:32<04:30, 13.12it/s]

tensor(515.2341, device='cuda:0', grad_fn=<ExpBackward0>)


 27%|██▋       | 1303/4750 [01:39<04:22, 13.12it/s]

tensor(527.5438, device='cuda:0', grad_fn=<ExpBackward0>)


 30%|██▉       | 1403/4750 [01:47<04:15, 13.13it/s]

tensor(528.8744, device='cuda:0', grad_fn=<ExpBackward0>)


 32%|███▏      | 1503/4750 [01:54<04:07, 13.13it/s]

tensor(529.2859, device='cuda:0', grad_fn=<ExpBackward0>)


 34%|███▎      | 1603/4750 [02:02<04:00, 13.07it/s]

tensor(534.5078, device='cuda:0', grad_fn=<ExpBackward0>)


 36%|███▌      | 1703/4750 [02:10<03:52, 13.12it/s]

tensor(535.6690, device='cuda:0', grad_fn=<ExpBackward0>)


 38%|███▊      | 1803/4750 [02:17<03:45, 13.09it/s]

tensor(535.1155, device='cuda:0', grad_fn=<ExpBackward0>)


 38%|███▊      | 1813/4750 [02:18<03:44, 13.08it/s]


KeyboardInterrupt: 

In [35]:
[1, 2, 3, 4, 5, 6][-3:]

[4, 5, 6]