# GPT2 Language Model Training

In this notebook, we will train our model on sample data to estimate its runnig time.


In [1]:
import os
import wandb
import torch
import random
import numpy as np
import pandas as pd
from rich import print
from pathlib import Path
from tqdm.auto import tqdm
from dotenv import load_dotenv
from collections import defaultdict
from huggingface_hub import login

In [2]:
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
import transformers
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
sns.set_theme(context='notebook', style='dark')
tqdm.pandas()

In [5]:
# fix random seed
seed = 42

random.seed(seed)

np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

transformers.set_seed(seed)

In [6]:
dataset_path = Path('../data/sentences.txt')
tokenizer_pah = Path('../models/nano-gpt-tokenizer/')
model_ckpt = 'openai-community/gpt2'
model_name = 'arabic-nano-gpt'
model_path = Path(f'../models/{model_name}')

In [7]:
print(model_path)

In [8]:
# wandb configs
PROJECT_NAME = 'Arabic-Nano-GPT'
JOB_TYPE = 'LM-Modeling'
TAGS = ['MOdeling', 'Transformers', 'GPT2', 'Language-Modeling', 'Arabic']
NOTES = 'LM Training on Arabic Data using GPT2 Model Architecture'
RUN_NAME = 'Dummy-data-and-model-LM-Moldeing'
config = defaultdict(dict)

In [9]:
load_dotenv()
wandb.login()
login(token=os.getenv('HF_TOKEN'),
      add_to_git_credential=True, write_permission=True)  # HF

[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/hossam/.cache/huggingface/token
Login successful


In [10]:
# load the dataset
with open(dataset_path, 'r') as f:
    dataset = f.readlines()[:100_000]

In [11]:
train, valid = train_test_split(dataset, train_size=0.8, random_state=seed)
valid, test = train_test_split(valid, train_size=0.5, random_state=seed)

In [12]:
dataset = DatasetDict()
dataset['train'] = Dataset.from_dict({'text': train})
dataset['valid'] = Dataset.from_dict({'text': valid})
dataset['test'] = Dataset.from_dict({'text': test})
print(dataset)

In [13]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_pah)
print(tokenizer)

In [14]:
def tokenize(batch):
    outputs = tokenizer(
        batch['text'],
        truncation=True,
        return_overflowing_tokens=True,
    )
    return {'input_ids': outputs['input_ids']}


tokenized_dataset = dataset.map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)
print(tokenized_dataset)



Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [15]:
tokenizer.decode(tokenized_dataset['train'][0]['input_ids'])

'الحضارات القديمة في أمريكا الشمالية\n'

In [16]:
model_config = AutoConfig.from_pretrained(
    model_ckpt, vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_positions=tokenizer.model_max_length,
    n_ctx=tokenizer.model_max_length,
    n_embd=128, n_head=2, n_layer=2,
)
print(model_config)

In [17]:
model = AutoModelForCausalLM.from_config(model_config)
print(model)

In [18]:
print('Size in MBs:', model.get_memory_footprint() / 1_000_000)

In [19]:
print('Num Params:', model.num_parameters() / 1_000_000, 'M')

In [20]:
num_epochs = 5
batch_size = 16 * 32
total_steps = len(tokenized_dataset['train']) * num_epochs // batch_size
total_steps

781

In [21]:
training_args = TrainingArguments(
    output_dir=model_path,
    run_name=RUN_NAME,
    report_to='wandb',
    save_strategy='no',
    eval_strategy='steps',
    gradient_accumulation_steps=32,
    overwrite_output_dir=True,
    data_seed=seed, seed=seed,
    learning_rate=1e-3,
    weight_decay=0.001,
    warmup_ratio=0.0,
    eval_steps=500,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [22]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [24]:
run = wandb.init(project=PROJECT_NAME, job_type=JOB_TYPE,
                 name=RUN_NAME, notes=NOTES, tags=TAGS, config=config)

In [25]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=20, training_loss=7.696140289306641, metrics={'train_runtime': 1990.4362, 'train_samples_per_second': 201.102, 'train_steps_per_second': 0.01, 'total_flos': 86382433075200.0, 'train_loss': 7.696140289306641, 'epoch': 4.076433121019108})

In [26]:
trainer.evaluate(tokenized_dataset['test'], metric_key_prefix='test')

{'test_loss': 7.391066551208496,
 'test_runtime': 7.5792,
 'test_samples_per_second': 1319.661,
 'test_steps_per_second': 2.639,
 'epoch': 4.076433121019108}

In [27]:
wandb.finish()

VBox(children=(Label(value='0.003 MB of 0.030 MB uploaded\r'), FloatProgress(value=0.09298975672215108, max=1.…

0,1
test/loss,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
test/loss,7.39107
test/runtime,7.5792
test/samples_per_second,1319.661
test/steps_per_second,2.639
total_flos,86382433075200.0
train/epoch,4.07643
train/global_step,20.0
train_loss,7.69614
train_runtime,1990.4362
train_samples_per_second,201.102


In [28]:
trainer.save_model(model_path)

In [29]:
_ = trainer.push_to_hub()

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.75M [00:00<?, ?B/s]