# GPT2 Language Model Training

In this notebook, we will train our model on sample data to estimate its runnig time.


In [1]:
import os
import wandb
import torch
import random
import numpy as np
import pandas as pd
from rich import print
from pathlib import Path
from tqdm.auto import tqdm
from dotenv import load_dotenv
from collections import defaultdict

In [2]:
from datasets import load_dataset
import transformers
import huggingface_hub
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
sns.set_theme(context='notebook', style='dark')
tqdm.pandas()

In [5]:
# fix random seed
seed = 42

random.seed(seed)

np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

transformers.set_seed(seed)

In [6]:
dataset_path = Path('../data/wikipeda.csv')
tokenizer_pah = Path('../models/nano-gpt-tokenizer/')
model_ckpt = 'openai-community/gpt2'
model_name = 'arabic-nano-gpt'
model_path = Path(f'../models/{model_name}')

In [7]:
print(model_path)

In [8]:
# wandb configs
PROJECT_NAME = 'Arabic-Nano-GPT'
JOB_TYPE = 'LM-Modeling'
TAGS = ['MOdeling', 'Transformers', 'GPT2', 'Language-Modeling', 'Arabic']
NOTES = 'LM Training on Arabic Data using Nano GPT2 Model Architecture'
RUN_NAME = 'Overfitting-Small-Batch'
config = defaultdict(dict)

In [9]:
load_dotenv()
wandb.login()
huggingface_hub.login(token=os.getenv('HF_TOKEN'),
      add_to_git_credential=True, write_permission=True)  # HF

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [10]:
# load the dataset
dataset = load_dataset(
    'csv', data_files='../data/wikipeda.csv', split='train[:1000]')
print(dataset)

In [11]:
dataset = dataset.train_test_split(train_size=0.9, seed=seed)
print(dataset)

In [12]:
test_data = dataset['test'].train_test_split(test_size=0.5, seed=seed)
dataset['valid'] = test_data['train']
dataset['test'] = test_data['test']
del test_data
print(dataset)

In [13]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_pah)
print(tokenizer)

In [14]:
def tokenize(batch):
    outputs = tokenizer(
        batch['text'],
        truncation=True,
        return_overflowing_tokens=True,
    )
    return {'input_ids': outputs['input_ids']}


tokenized_dataset = dataset.map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)
print(tokenized_dataset)



Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [15]:
tokenizer.decode(tokenized_dataset['train'][0]['input_ids'])

'المجموعات العرقية الأخرى تشكل ما نسبته 2% من الأردنيين. من هؤلاء الشركس الذي الذين نزحوا إلى الأردن بتوجيه من السلطات العثمانية على إثر الحرب الروسية العثمانية. ومع اكتمال هجرتهم أوائل القرن العشرين، بلغ مجموع جميع المهاجرين 3500. تتراوح تقديرات السكان الشركس اليوم بين 20,000 إلى 80,000.'

In [16]:
model_config = AutoConfig.from_pretrained(
    model_ckpt, vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_positions=tokenizer.model_max_length,
    n_ctx=tokenizer.model_max_length,
    n_embd=256, n_head=4, n_layer=4,
)
print(model_config)

In [17]:
model = AutoModelForCausalLM.from_config(model_config)
print(model)

In [18]:
print('Size in MBs:', model.get_memory_footprint() / 1_000_000)

In [19]:
print('Num Params:', model.num_parameters() / 1_000_000, 'M')

In [20]:
num_epochs = 100
accum_steps = 1
batch_size = 32
total_steps = len(tokenized_dataset['train']) * \
    num_epochs // (batch_size * accum_steps)
total_steps, batch_size * accum_steps

(2812, 32)

In [21]:
training_args = TrainingArguments(
    output_dir=model_path,
    run_name=RUN_NAME,
    report_to='wandb',
    save_strategy='no',
    eval_strategy='steps',
    gradient_accumulation_steps=accum_steps,
    overwrite_output_dir=True,
    data_seed=seed, seed=seed,
    learning_rate=5e-4,
    weight_decay=0.0,
    warmup_ratio=0.0,
    eval_steps=100,
    logging_steps=10,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [22]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [24]:
run = wandb.init(project=PROJECT_NAME, job_type=JOB_TYPE,
                 name=RUN_NAME, notes=NOTES, tags=TAGS, config=config)

In [25]:
_ = trainer.train()

Step,Training Loss,Validation Loss
100,7.203,7.307133
200,6.1398,6.548541
300,5.2755,6.30374
400,4.7782,6.254841
500,4.1729,6.304488
600,3.748,6.420715
700,3.3521,6.549696
800,2.9579,6.689791
900,2.7095,6.802957
1000,2.323,6.93329


In [26]:
print(trainer.evaluate(tokenized_dataset['test'], metric_key_prefix='test'))

In [27]:
wandb.finish()

0,1
eval/loss,▅▂▁▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇▇███████
eval/runtime,▃▄█▁▇▆█▄▅▆▅▄▅▄▄▅▃▄▄▅▅▆▇▄▄▃▄▃▃
eval/samples_per_second,▆▅▁█▂▃▁▅▃▃▃▅▄▄▅▄▆▄▅▄▄▃▂▄▅▆▄▆▆
eval/steps_per_second,▆▅▁█▂▃▁▅▃▃▃▅▄▄▅▄▆▄▅▄▄▃▂▄▅▆▄▆▆
test/loss,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇█
train/global_step,▁▁▁▁▁▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇██

0,1
eval/loss,8.17001
eval/runtime,0.1029
eval/samples_per_second,486.13
eval/steps_per_second,19.445
test/loss,8.46069
test/runtime,0.1019
test/samples_per_second,490.873
test/steps_per_second,19.635
total_flos,591311319515136.0
train/epoch,100.0


In [28]:
trainer.save_model(model_path)

In [29]:
_ = trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/22.1M [00:00<?, ?B/s]