# GPT2 Language Model Training

In this notebook, we will train our model on sample data to estimate its running time.


In [1]:
import os
import wandb
import torch
import random
import numpy as np
import pandas as pd
from rich import print
from pathlib import Path
from tqdm.auto import tqdm
from dotenv import load_dotenv
from collections import defaultdict

In [2]:
from datasets import load_dataset
import transformers
import huggingface_hub
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Apply seaborn's "notebook" context with the "dark" style to subsequent plots.
sns.set_theme(context="notebook", style="dark")
# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [5]:
# Pin every random number generator used in this notebook to a single seed.
seed = 42

# cuDNN: request deterministic kernels and switch off the benchmark auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Python's and NumPy's global generators
random.seed(seed)
np.random.seed(seed)

# PyTorch: default generator, current CUDA device, then all CUDA devices
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Hugging Face seeding helper
transformers.set_seed(seed)

In [6]:
# CSV file containing the text corpus.
dataset_path = Path("../data/wikipeda.csv")
# Directory of the previously saved tokenizer (loaded later with AutoTokenizer).
tokenizer_pah = Path("../models/nano-gpt-tokenizer/")
# Hub checkpoint whose configuration serves as the template for the model config.
model_ckpt = "openai-community/gpt2"
# Name of the model being trained; also the leaf folder of `model_path`.
model_name = "arabic-nano-gpt"
# Local directory used as the trainer output / save location.
model_path = Path(f"../models/{model_name}")

In [7]:
# Show the output directory the model will be written to.
print(model_path)

In [8]:
# wandb configs: metadata attached to the Weights & Biases run
PROJECT_NAME = "Arabic-Nano-GPT"
JOB_TYPE = "LM-Modeling"
# Free-form labels attached to the run ("Modeling" was previously misspelled "MOdeling").
TAGS = ["Modeling", "Transformers", "GPT2", "Language-Modeling", "Arabic"]
NOTES = "LM Training on Arabic Data using Nano GPT2 Model Architecture"
RUN_NAME = "Arabic-NanoGPT-LM-on-200K-Docs"
# Run configuration handed to `wandb.init`; it starts out empty.
config = defaultdict(dict)

In [9]:
# Read variables from a local .env file into the process environment.
load_dotenv()
# Authenticate with Weights & Biases.
wandb.login()
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable;
# os.getenv returns None when the variable is unset.
# NOTE(review): `write_permission` may be deprecated in recent huggingface_hub
# releases — confirm against the installed version.
huggingface_hub.login(
    token=os.getenv("HF_TOKEN"), add_to_git_credential=True, write_permission=True
)  # HF

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [10]:
# Load the first 200,000 rows of the CSV corpus as a single "train" split.
# The location comes from `dataset_path` so the file path is defined in one place only.
dataset = load_dataset("csv", data_files=str(dataset_path), split="train[:200000]")
print(dataset)

In [11]:
# Keep 90% of the rows for training and hold out the rest; the fixed seed makes
# the split repeatable. `dataset` is rebound from a single split to the split result.
dataset = dataset.train_test_split(train_size=0.9, seed=seed)
print(dataset)

In [12]:
# Split the held-out rows evenly into a validation set and a test set.
# The guard keeps this cell idempotent: `dataset["test"]` is overwritten below,
# so re-running the cell without it would halve the test split again.
if "valid" not in dataset:
    test_data = dataset["test"].train_test_split(test_size=0.5, seed=seed)
    dataset["valid"] = test_data["train"]
    dataset["test"] = test_data["test"]
    del test_data  # the temporary split container is no longer needed
print(dataset)

In [13]:
# Load the tokenizer saved under `tokenizer_pah` and print its summary.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_pah)
print(tokenizer)

In [14]:
def tokenize(batch):
    """Tokenize the ``text`` column of ``batch`` and return only the token ids.

    Truncation is enabled together with ``return_overflowing_tokens``, so the
    tokenizer is asked to return the overflow as well; the result may therefore
    hold more sequences than ``batch`` has rows. No explicit ``max_length`` is
    passed here.

    Returns a dict with the single key ``"input_ids"``.
    """
    encoded = tokenizer(
        batch["text"],
        return_overflowing_tokens=True,
        truncation=True,
    )
    input_ids = encoded["input_ids"]
    return {"input_ids": input_ids}


# Run `tokenize` over every split in batches. The original columns are removed,
# so only the `input_ids` column returned by `tokenize` remains.
tokenized_dataset = dataset.map(
    tokenize, batched=True, remove_columns=dataset["train"].column_names
)
print(tokenized_dataset)



Map:   0%|          | 0/180000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [15]:
# Sanity check: decode the first tokenized training sample back into text.
tokenizer.decode(tokenized_dataset["train"][0]["input_ids"])

'تهدف العولمة في المجال الاجتماعي إلى تنميط البشر في جميع البلاد وفقا لمعايير الغرب ، وقد تبدى هذا بوضوح في مؤتمرات الأمم المتحدة للسكان والمرأة وحقوق الإنسان وكشفت حرارة المناقشات عن مدى شراسة الهجمة الغربية الهادفة إلى تغليب معايير الثقافة الغربية والحضارة الغربية وتعميمها في العالم كله ، كما كان لتلاقي وجهة نظر الفاتيكان مع وجهات النظر الإسلامية دلالتها من حيث أن المستهدف هو القيم الدينية الإسلامية ، ومثل هذا الموقف منعطفا جديدا في الأوساط الدولية مغايرا للاتجاه الذي تبنته منظمة اليونسكو في عهد مديرها السابق والهادف إلى المحافظة على تباين الثقافات والحضارات ، كما كشف الموقف عن أن ممثلي الدول الإسلامية في المنظمات الدولية والذين شاركوا في اعداد أوراق عمل هذه المؤتمرات أو كان مفروضا أن يشاركوا لم يكونوا في الواقع يمثلون ثقافة وقيم بلدانهم بقدر تعبيرهم عن الثقافة والقيم الغربية التي تربوا عليها في المعاهد والجامعات الغربية . '

In [16]:
# Build a small GPT-2 style configuration: start from the `model_ckpt` config and
# override the vocabulary size, special-token ids, context length and model size.
model_config = AutoConfig.from_pretrained(
    model_ckpt,
    # len(tokenizer) also counts tokens added on top of the base vocabulary,
    # whereas tokenizer.vocab_size does not; sizing the embedding with it keeps
    # every id the tokenizer can emit inside the embedding table.
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # context length follows the tokenizer's maximum sequence length
    n_positions=tokenizer.model_max_length,
    n_ctx=tokenizer.model_max_length,
    # reduced width / heads / depth compared with the template checkpoint
    n_embd=256,
    n_head=4,
    n_layer=4,
)
print(model_config)

In [17]:
# Instantiate the causal LM from the configuration alone (no checkpoint weights
# are passed here) and print its module structure.
model = AutoModelForCausalLM.from_config(model_config)
print(model)

In [18]:
# Report the model's memory footprint, scaled down by one million for display in MB.
print("Size in MBs:", model.get_memory_footprint() / 1_000_000)

In [19]:
# Report the parameter count in millions.
print("Num Params:", model.num_parameters() / 1_000_000, "M")

In [21]:
# Training schedule. One optimizer step consumes batch_size * accum_steps samples
# (per device); total_steps is a floor-divided estimate of the optimizer steps
# over all epochs.
num_epochs = 2
accum_steps = 8
batch_size = 32
total_steps = len(tokenized_dataset["train"]) * num_epochs // (batch_size * accum_steps)
# Display (estimated optimizer steps, effective batch size).
total_steps, batch_size * accum_steps

(1406, 256)

In [22]:
training_args = TrainingArguments(
    # where results go and how the run is reported
    output_dir=model_path,
    overwrite_output_dir=True,
    run_name=RUN_NAME,
    report_to="wandb",
    log_level="error",
    # reproducibility
    seed=seed,
    data_seed=seed,
    # optimisation schedule
    num_train_epochs=num_epochs,
    learning_rate=1e-3,
    weight_decay=0.0,
    warmup_ratio=0.0,
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=accum_steps,
    # evaluate and log every 100 steps; checkpoint saving is turned off
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_strategy="no",
)

In [23]:
# Reuse the end-of-sequence token for padding, then build the collator with
# mlm=False (masked-language-modeling mode switched off).
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [24]:
# Wire the model, training arguments, datasets and collator into a Trainer.
# Training uses the "train" split; evaluation during training uses "valid".
# NOTE(review): newer transformers releases may expect `processing_class`
# instead of `tokenizer` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [25]:
# Start the W&B run with the project, job type, name, notes and tags defined
# in the wandb configuration cell. `config` is the defaultdict created there;
# none of the visible cells adds entries to it before this call.
run = wandb.init(
    project=PROJECT_NAME,
    job_type=JOB_TYPE,
    name=RUN_NAME,
    notes=NOTES,
    tags=TAGS,
    config=config,
)

In [26]:
# Run training; the return value is bound to `_` so it is not echoed as cell output.
_ = trainer.train()

Step,Training Loss,Validation Loss
100,7.3768,6.746365
200,6.5028,6.250431
300,6.0865,5.853433
400,5.7563,5.549063
500,5.5138,5.347585
600,5.3615,5.203536
700,5.2481,5.096543
800,5.1406,5.01175
900,5.0665,4.946748
1000,5.015,4.889956


In [27]:
# Evaluate on the held-out "test" split; metric keys are prefixed with "test".
print(trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test"))

In [28]:
# Close the active W&B run.
wandb.finish()

VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▆▅▄▃▃▂▂▂▁▁▁▁▁
eval/runtime,▄▄▃▁▅▅▇▆▇▆▇███
eval/samples_per_second,▅▅▅█▄▃▂▃▂▃▂▁▁▁
eval/steps_per_second,▅▅▆█▄▄▂▃▂▃▂▁▁▁
test/loss,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████

0,1
eval/loss,4.77426
eval/runtime,128.7239
eval/samples_per_second,77.686
eval/steps_per_second,2.432
test/loss,4.78784
test/runtime,135.3784
test/samples_per_second,73.867
test/steps_per_second,2.312
total_flos,2170906176079872.0
train/epoch,1.99929


In [29]:
# Save the trained model to the local `model_path` directory.
trainer.save_model(model_path)

In [30]:
# Upload the model to the Hugging Face Hub; the return value is bound to `_`
# so it is not echoed as cell output.
_ = trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/22.1M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]