# Imports and setup

In [1]:
import transformers
from datasets import load_from_disk
from transformers import GPT2TokenizerFast
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from configuration_sparse_gpt_neo import SparseGPTNeoConfig, SparsityType
from sparse_gpt_neo import SparseGPTNeoForCausalLM
import wandb

In [2]:
# Fix the global RNG seed so that runs of this notebook are repeatable.
transformers.set_seed(123)

# The W&B API key lives in a local file so it never appears in the notebook itself
# (keep wandb_key.txt out of version control).
# Text editors usually terminate the file with a newline; strip surrounding whitespace
# so that only the key itself is handed to wandb.
with open('wandb_key.txt', encoding='utf-8') as key_file:
    wandb.login(key=key_file.read().strip())

wandb: Currently logged in as: y-wu-55 (tiny-transformers). Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\wue09\.netrc


# Data

In [3]:
# TinyStories as stored on disk (presumably the untokenized text; not referenced again in the cells shown here).
dataset = load_from_disk(dataset_path="data/TinyStories")
# Tokenized counterpart; its 'train' and 'validation' splits feed the Trainer further down.
tokenized_dataset = load_from_disk(dataset_path="data/TokenizedTinyStories")

# Model

In [4]:
# Sparse feed-forward (FFN) setup: a mixture of experts with top-k routing.
sparsity_type = SparsityType.MOE
num_experts = 4
topk = 1

# Model width, and the width multiplier a dense baseline FFN would use:
# baseline intermediate size = intermediate_factor * dim_in.
dim_in = 256
intermediate_factor = 16

# Split the baseline width across the experts, after first removing `num_experts` units
# (presumably to offset the parameters added by the router -- TODO confirm).
baseline_intermediate_size = intermediate_factor * dim_in
intermediate_size = round((baseline_intermediate_size - num_experts) / num_experts)
print("intermediate layer size is", intermediate_size)

intermediate layer size is 1023


In [5]:
config = SparseGPTNeoConfig(
    # --- vocabulary and embeddings ---
    vocab_size=10_000,    # number of tokens in the vocabulary
    hidden_size=dim_in,   # embedding size (vector length) of each token
    # the token-embedding table therefore holds vocab_size x hidden_size
    # entries (10,000 x 256 with dim_in = 256)

    # maximum sequence length in tokens (positions the model has embeddings for)
    max_position_embeddings=512,

    # --- transformer stack ---
    # number of transformer blocks; in GPT-Neo the attention_types pattern below is
    # expected to expand to this many layers (2 attention kinds x 1 repetition)
    # -- TODO confirm the sparse variant keeps that rule
    num_layers=2,
    attention_types=[[["global", "local"], 1]],  # alternate global and local attention (GPT-Neo-specific)

    num_heads=4,      # attention heads
    window_size=384,  # window for local attention (GPT-Neo-specific)

    # --- sparse feed-forward layers ---
    sparsity_type=sparsity_type,
    num_experts=num_experts,
    topk=topk,
    intermediate_size=intermediate_size,  # size of the 'up-projection' layer in the FFN
)

In [6]:
# NOTE: despite its name, `tokenize_function` is the tokenizer object itself.
# Its maximum length is pinned to the model's context size.
tokenize_function = GPT2TokenizerFast.from_pretrained(
    '10k-tok',
    model_max_length=config.max_position_embeddings,
)

# The tokenizer and the model config must agree on context length and vocabulary size.
assert config.max_position_embeddings == tokenize_function.model_max_length
assert config.vocab_size == tokenize_function.vocab_size

# Printed because of a bug in tokenizers (should be fixed now):
# https://github.com/huggingface/transformers/issues/26500
print(f'padding token is {tokenize_function.pad_token}')
# HF was saving neither this id nor the tokenizer's pad_token, so copy it onto the config by hand.
config.pad_token_id = tokenize_function.pad_token_id

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.


padding token is <pad>


In [7]:
# Instantiate the model directly from the config built above.
model = SparseGPTNeoForCausalLM(config=config)

# Report the parameter count with thousands separators.
param_count = model.num_parameters()
print(f'The model has {param_count:,} parameters.')

The model has 7,422,968 parameters.


# Trainer

In [8]:
# The first training example must already span the full context length.
first_input_ids = tokenized_dataset['train'][0]['input_ids']
assert len(first_input_ids) == config.max_position_embeddings
# Display the last ten token ids of that example.
first_input_ids[-10:]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [9]:
train_dataset, eval_dataset = tokenized_dataset['train'], tokenized_dataset['validation']

batch_size = 16 # TinyStories claims 80; reduced here to fit the local hardware
num_train_epochs = 2  # TinyStories doesn't mention
gradient_accumulation_steps = 16 # TinyStories claims 16

lr = 5e-4 # TinyStories claims 5e-4, higher values are preferable for smaller models

# Optimizer steps in ONE pass over the training set: each step consumes
# batch_size * gradient_accumulation_steps samples (single-device assumption).
# This is a per-epoch figure; it is not multiplied by num_train_epochs.
_train_steps = len(train_dataset) // (batch_size * gradient_accumulation_steps)
# Evaluate every 10% of an epoch; never let the interval collapse to 0 on tiny datasets.
eval_steps = max(1, _train_steps // 10)

# Run name: <parameters in millions>M-<layers>L-<heads>H-<hidden size>C-<intermediate size>I.
# True division is needed here: `// 1e6` floors to whole millions, so together with `:.1f`
# the name always ended in ".0" (7,422,968 parameters became "7.0M" instead of "7.4M").
model_name = f'{model.num_parameters() / 1e6:.1f}M-{config.num_layers}L-{config.num_heads}H-{config.hidden_size}C-{config.intermediate_size}I'

In [12]:
training_args = TrainingArguments(

    seed       = 123,
    use_cpu    = False, # use an accelerator if available (not necessarily faster on laptops, but Apple's MPS has good support)
    output_dir = f'./results/models/{model_name}',

    # NOTE: training params
    learning_rate    = lr,
    num_train_epochs = num_train_epochs,
    # Use a smaller batch size to fit into GPU RAM.
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size  = batch_size,
    # You should aim to have the same amount of samples per acc step, in all of your experiments!
    # so, if you increase batch_size, decrease gradient_accumulation_steps by the same factor.
    gradient_accumulation_steps = gradient_accumulation_steps,

    # NOTE: evaluation, checkpointing and logging all share the same step interval
    evaluation_strategy = 'steps',
    eval_steps = eval_steps,
    save_steps = eval_steps,

    logging_first_step=True,
    logging_steps=eval_steps,
    # wandb is great for tracking experiments, but reporting is switched off here
    report_to  = 'none',
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    # mlm=False selects causal (next-token) language modelling rather than masked LM
    data_collator = DataCollatorForLanguageModeling(tokenize_function, mlm=False),
)

# Print the amount of training steps and how often the model is evaluated.
# `_train_steps` counts the steps of a single pass over the data, so it is reported
# per epoch, with the total over all epochs next to it; the previous message
# presented the per-epoch count as if it covered every epoch.
print(f'''
    training for {num_train_epochs} epochs, {len(train_dataset)} samples
    {batch_size} batch size, {gradient_accumulation_steps} accumulation steps
    gives {_train_steps} training steps per epoch ({_train_steps * num_train_epochs} in total).

    evaluating every {eval_steps} steps, {len(eval_dataset)} samples
    ''')


    training for 2 epochs, 2119719 samples
    16 batch size, 16 accumulation steps
    gives 8280 training steps.

    evaluating every 828 steps, 21990 samples 
    


In [13]:
# Experiment tracking is currently disabled; the wandb run below is left commented out on purpose.
#wandb.init(project='moe-gpt-neo', name=model_name, config=training_args)
# Start training (the recorded run below was stopped manually, see the KeyboardInterrupt).
trainer.train()
# Explicit saving of the final model is disabled as well; uncomment to write it to the results folder.
#trainer.save_model(f'./results/models/{model_name}')

tensor(0.0006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0063, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0059, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0057, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0006, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0061, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0005, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0060, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0004, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0061, device='cuda:0', grad_fn=<AddBack

Step,Training Loss,Validation Loss


tensor(0.0008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0075, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0010, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0079, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0009, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0072, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0072, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0009, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0077, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0009, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0079, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0073, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0010, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0073, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0008, device='cuda:0', grad_fn=<AddBackward0>)
tensor(0.0080, device='cuda:0', grad_fn=<AddBack

KeyboardInterrupt: 