## データのダウンロード

In [None]:
import requests


def download_file(url, save_path, timeout=60, chunk_size=1 << 20):
    """Download ``url`` and write the response body to ``save_path``.

    The body is streamed to disk in chunks so large files are never held
    in memory as a whole.

    Args:
        url: Address of the file to fetch.
        save_path: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server (passed to ``requests.get``).
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before touching the disk so an error page is never saved as data.
        response.raise_for_status()
        with open(save_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)


# Remote arXiv shard to fetch, and the local file it is stored in.
download_url = "https://data.together.xyz/redpajama-data-1T/v1.0.0/arxiv/arxiv_023827cd-7ee8-42e6-aa7b-661731f4c70f.jsonl"
path = "arxiv.jsonl"
download_file(url=download_url, save_path=path)

In [None]:
# Inspect the data: parse the first record and list its top-level keys.
import json

path = "arxiv.jsonl"
# Pin the encoding so decoding does not depend on the platform default.
with open(path, "r", encoding="utf-8") as f:
    d = json.loads(f.readline())

print(d.keys())

In [None]:
# Show the "meta" field of the first record parsed above.
print(d["meta"])

In [None]:
# Load the data: collect the "text" field of every record in the file.
import json


# Explicit UTF-8 keeps decoding independent of the platform default; blank
# lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
with open(path, "r", encoding="utf-8") as f:
    texts = [json.loads(line)["text"] for line in f if line.strip()]

## データの前処理

# トークナイザの学習

In [None]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer; it is used further down as a baseline
# to compare against the tokenizer trained in this notebook.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

In [None]:
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

# Train a byte-level BPE tokenizer on the documents held in `texts`.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    iterator=texts,
    vocab_size=30_000,
)

# The same marker serves as both BOS and EOS; define it once and reuse it so
# the two settings cannot drift apart.
eos_token = "<|endoftext|>"

# Wrap the trained tokenizer so it exposes the transformers tokenizer API.
# NOTE(review): `tokenizer_object` is documented as taking a
# `tokenizers.Tokenizer`; a ByteLevelBPETokenizer wrapper is passed here —
# confirm this works with the installed transformers version.
my_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token=eos_token,
    eos_token=eos_token,
)

In [None]:
example = "abstract: This is an example of text. introduction: This is an introduction. conclusion: This is a conclusion."
# Tokenize the same sentence with the GPT-2 tokenizer first, then with ours,
# so the two splits can be compared side by side.
for candidate in (gpt2_tokenizer, my_tokenizer):
    print(candidate.tokenize(example))

In [None]:
# Persist the tokenizer under the local "tokenizer" path.
my_tokenizer.save_pretrained("tokenizer")

In [None]:
# Reload the tokenizer from the "tokenizer" path, replacing the in-memory one.
my_tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [None]:
# Sanity check: encode the first document and display the result.
my_tokenizer(texts[0])

In [None]:
from tqdm import tqdm

# Flatten the corpus into one token stream; each document's ids are preceded
# by the EOS token id so document boundaries stay visible.
all_tokens = []
for text in tqdm(texts):
    tokens = my_tokenizer(text)
    all_tokens.append(my_tokenizer.eos_token_id)
    all_tokens.extend(tokens["input_ids"])

In [None]:
import pickle

# Store the flat token list on disk as a pickle.
with open("tokens.pkl", mode="wb") as pickle_file:
    pickle.dump(all_tokens, pickle_file)

In [None]:
token_length = 512

# Cut the token stream into consecutive windows of `token_length` ids (the
# final window may be shorter) and write one JSON object per line.
chunk_starts = range(0, len(all_tokens), token_length)
with open("chunked_tokens.jsonl", "w") as f:
    f.writelines(
        json.dumps({"input_ids": all_tokens[start : start + token_length]}) + "\n"
        for start in chunk_starts
    )

In [None]:
# Read the chunk file back: one parsed JSON object per line.
with open("chunked_tokens.jsonl", "r") as chunk_file:
    tokens = list(map(json.loads, chunk_file))

In [None]:
# Display the number of ids in the first chunk.
len(tokens[0]["input_ids"])

In [None]:
from transformers import GPT2LMHeadModel, AutoConfig

# Start from the stock "gpt2" configuration and override the vocabulary size
# and special-token ids so they match the tokenizer trained in this notebook.
# `my_tokenizer` is used because no `transformer_tokenizer` is defined anywhere
# in the notebook (the original name raised NameError).
# NOTE(review): recent transformers releases appear to size the position
# embeddings from `n_positions` (1024 in the printed config), not `n_ctx` —
# confirm whether `n_positions=512` was intended.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(my_tokenizer),
    n_ctx=512,
    bos_token_id=my_tokenizer.bos_token_id,
    eos_token_id=my_tokenizer.eos_token_id,
)
# Build the model from the configuration only (no from_pretrained call here).
model = GPT2LMHeadModel(config)

In [None]:
# Print the resolved configuration. The commented block that follows is a
# captured sample of this output.
# NOTE(review): the token ids and vocab_size in that sample presumably come
# from an earlier tokenizer and may not match the one used here — confirm.
print(config)
# GPT2Config {
#   "_name_or_path": "gpt2",
#   "activation_function": "gelu_new",
#   "architectures": [
#     "GPT2LMHeadModel"
#   ],
#   "attn_pdrop": 0.1,
#   "bos_token_id": 30000,
#   "embd_pdrop": 0.1,
#   "eos_token_id": 30001,
#   "initializer_range": 0.02,
#   "layer_norm_epsilon": 1e-05,
#   "model_type": "gpt2",
#   "n_ctx": 512,
#   "n_embd": 768,
#   "n_head": 12,
#   "n_inner": null,
#   "n_layer": 12,
#   "n_positions": 1024,
#   "reorder_and_upcast_attn": false,
#   "resid_pdrop": 0.1,
#   "scale_attn_by_inverse_layer_idx": false,
#   "scale_attn_weights": true,
#   "summary_activation": null,
#   "summary_first_dropout": 0.1,
# ...
#   "use_cache": true,
#   "vocab_size": 30004
# }

In [None]:
# Load the tokenizer from the local "tokenizer" path.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [None]:
def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, truncating to 512 tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=512)


dataset = dataset.map(_tokenize_batch, batched=True)

In [None]:
from tqdm import tqdm

# Scan for the first example longer than 512 ids; report it and stop there.
for data in tqdm(dataset):
    n_ids = len(data["input_ids"])
    if n_ids <= 512:
        continue
    print(n_ids)
    print("Too long text")
    break

In [None]:
# Total number of token ids across the whole dataset.
tokens = sum(len(data["input_ids"]) for data in tqdm(dataset))
print(tokens)  # 0.1B

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for language modeling with masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Display the dataset object.
dataset

In [None]:
# Run the collator on a one-element batch built from the first example's
# input_ids and display what it returns.
data_collator([dataset["input_ids"][0]])

# 推論

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Use the GPU when one is available; fall back to CPU instead of failing on
# a hard-coded .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained("pretrained_model").to(device)
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

# The tokenizer built in this notebook sets only BOS/EOS, so pad_token_id can
# be None; reuse the EOS id in that case so generate() gets a valid value.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

prompt = "Abstract: "
with torch.no_grad():
    # No special tokens are added: the prompt is encoded exactly as written.
    token_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    output_ids = model.generate(
        token_ids.to(model.device),
        max_new_tokens=512,
        pad_token_id=pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decode the first (only) generated sequence, dropping special tokens.
output = tokenizer.decode(output_ids.tolist()[0], skip_special_tokens=True)
print(output)

In [None]:
# Display the id of the tokenizer's EOS token.
tokenizer.eos_token_id