## データのダウンロード

In [None]:
import requests


def download_file(url, save_path, chunk_size=1024 * 1024, timeout=60):
    """Download ``url`` and write the response body to ``save_path``.

    The body is streamed to disk in pieces so the whole file never has to
    fit in memory.

    Args:
        url: Address of the file to download.
        save_path: Destination path; an existing file is overwritten.
        chunk_size: Number of bytes requested per streamed piece.
        timeout: Seconds to wait for the connection and for each read
            before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Send the request and check the status before opening the output file,
    # so a failed request neither creates an empty file nor stores an HTTP
    # error page as if it were data.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(save_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)


# Local file name under which the downloaded JSONL shard is stored.
path = "arxiv.jsonl"
# One arXiv JSONL shard of the RedPajama-Data-1T (v1.0.0) dataset.
download_url = "https://data.together.xyz/redpajama-data-1T/v1.0.0/arxiv/arxiv_023827cd-7ee8-42e6-aa7b-661731f4c70f.jsonl"
# Fetch the shard and write it to `path`.
download_file(download_url, path)

In [None]:
# Inspect the data
import json

path = "arxiv.jsonl"
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
# relying on the platform default encoding.
with open(path, "r", encoding="utf-8") as f:
    # Parse only the first record; that is enough to inspect the schema.
    d = json.loads(f.readline())

print(d.keys())

In [None]:
# Show the value stored under the "meta" key of the record held in `d`.
print(d["meta"])

In [None]:
# Load the data
import json


# Collect the "text" field of every record in the JSONL file.
# The file is decoded explicitly as UTF-8 instead of the platform default
# encoding, and blank lines are skipped so they do not crash json.loads.
with open(path, "r", encoding="utf-8") as f:
    texts = [json.loads(line)["text"] for line in f if line.strip()]

## データの前処理

In [None]:
from langchain.text_splitter import SpacyTextSplitter


def text_splitter(document: str, max_length: int = 512) -> list[str]:
    """Split ``document`` into chunks made of whole sentences.

    Newlines are removed, the text is segmented into sentences with
    ``SpacyTextSplitter``, and consecutive sentences are concatenated until
    adding the next one would push the chunk past ``max_length`` characters.

    Args:
        document: Raw text of one document.
        max_length: Target upper bound on chunk length, in characters. A
            single sentence longer than this is kept as its own oversized
            chunk rather than being cut.

    Returns:
        The chunks in document order; an empty list when the splitter
        produces no sentences.
    """
    # split_text() returns a *list* of merged pieces, each holding several
    # sentences joined by the separator. Every piece has to be consumed,
    # otherwise everything after the first piece is silently dropped.
    # chunk_overlap=0 keeps neighbouring pieces from repeating sentences.
    splitter = SpacyTextSplitter(separator="[SEP]", chunk_overlap=0)
    docs = splitter.split_text(document.replace("\n", ""))

    chunks = []
    chunk = ""
    for doc in docs:
        for sentence in doc.split("[SEP]"):
            # Flush only a non-empty chunk, so an over-long first sentence
            # does not emit an empty string as a chunk.
            if chunk and len(chunk) + len(sentence) > max_length:
                chunks.append(chunk)
                chunk = sentence
            else:
                chunk += sentence
    if chunk:
        chunks.append(chunk)
    return chunks

In [None]:
# Parallel version
from joblib import Parallel, delayed
from tqdm import tqdm

# Wrap the splitter once, then fan the documents out over all available
# workers (n_jobs=-1); tqdm reports progress as documents are dispatched.
split_job = delayed(text_splitter)
worker_pool = Parallel(n_jobs=-1)
dataset_texts = worker_pool(split_job(document) for document in tqdm(texts))

In [None]:
# save as jsonl with key "text"
import json

# Write one JSON object per line, each holding a single chunk under "text".
# The loop variables are deliberately not called `texts` / `text`: this is
# notebook-global scope, and reusing `texts` would overwrite the list of
# loaded documents, so re-running the splitting cell afterwards would
# silently process the wrong data.
# NOTE(review): the file is written with the platform default encoding, which
# matches how it is read back later in this notebook; consider passing
# encoding="utf-8" on both sides.
with open("chunked_dataset.jsonl", "w") as f:
    for chunks in dataset_texts:
        for chunk in chunks:
            json.dump({"text": chunk}, f, ensure_ascii=False)
            f.write("\n")

In [None]:
from datasets import Dataset

# Parse every line of the chunk file into a dict, then build an in-memory
# Dataset from the resulting list of records.
with open("chunked_dataset.jsonl", "r") as jsonl_file:
    dataset_list = list(map(json.loads, jsonl_file))
dataset = Dataset.from_list(dataset_list)

# トークナイザの学習

In [None]:
from transformers import AutoTokenizer

# Reference GPT-2 tokenizer; its output is compared further down with the
# tokenizer trained in this notebook.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast

# Train a byte-level BPE tokenizer on the "text" column of the chunked
# dataset, with a vocabulary of 30,000 entries.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(dataset["text"], vocab_size=30_000)

# Wrap the trained tokenizer in a PreTrainedTokenizerFast and declare the
# special tokens it should expose.
special_tokens = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
}
my_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, **special_tokens)

In [None]:
# Tokenize the same sentence with the GPT-2 tokenizer and then with the
# newly trained one, printing each token list for a side-by-side comparison.
example = "abstract: This is an example of text. introduction: This is an introduction. conclusion: This is a conclusion."
for candidate in (gpt2_tokenizer, my_tokenizer):
    print(candidate.tokenize(example))

In [None]:
# Save the trained tokenizer under "tokenizer"; the same name is passed to
# AutoTokenizer.from_pretrained later in this notebook to reload it.
my_tokenizer.save_pretrained("tokenizer")

In [None]:
from transformers import GPT2LMHeadModel, AutoConfig

# Start from the stock GPT-2 configuration and adapt it to the tokenizer
# trained above. `my_tokenizer` is that tokenizer; the previously referenced
# `transformer_tokenizer` is not defined anywhere in this notebook and raised
# a NameError.
# NOTE(review): the config dump recorded below still shows n_positions=1024
# even though n_ctx=512 is passed here; check whether n_positions should be
# set to 512 as well.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(my_tokenizer),
    n_ctx=512,
    bos_token_id=my_tokenizer.bos_token_id,
    eos_token_id=my_tokenizer.eos_token_id,
)
# Instantiate the model from the config alone (no checkpoint is loaded here).
model = GPT2LMHeadModel(config)

In [None]:
# Print the resulting configuration. The commented block below is output
# recorded from an earlier run (the "# ..." line marks where it was cut).
print(config)
# GPT2Config {
#   "_name_or_path": "gpt2",
#   "activation_function": "gelu_new",
#   "architectures": [
#     "GPT2LMHeadModel"
#   ],
#   "attn_pdrop": 0.1,
#   "bos_token_id": 30000,
#   "embd_pdrop": 0.1,
#   "eos_token_id": 30001,
#   "initializer_range": 0.02,
#   "layer_norm_epsilon": 1e-05,
#   "model_type": "gpt2",
#   "n_ctx": 512,
#   "n_embd": 768,
#   "n_head": 12,
#   "n_inner": null,
#   "n_layer": 12,
#   "n_positions": 1024,
#   "reorder_and_upcast_attn": false,
#   "resid_pdrop": 0.1,
#   "scale_attn_by_inverse_layer_idx": false,
#   "scale_attn_weights": true,
#   "summary_activation": null,
#   "summary_first_dropout": 0.1,
# ...
#   "use_cache": true,
#   "vocab_size": 30004
# }

In [None]:
# Load the tokenizer back from the "tokenizer" directory.
# Note: this rebinds the notebook-global name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")

In [None]:
def _tokenize_batch(batch):
    """Tokenize the "text" entries of a batch, truncating to 512 tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=512)


# Apply the tokenizer to the whole dataset in batches.
dataset = dataset.map(_tokenize_batch, batched=True)

In [None]:
from tqdm import tqdm

# Sanity check: scan the tokenized rows and stop at the first one whose
# token sequence is longer than 512, reporting its length.
for row in tqdm(dataset):
    token_count = len(row["input_ids"])
    if token_count <= 512:
        continue
    print(token_count)
    print("Too long text")
    break