# This demo shows how to train a small GPT-NeoX model from scratch, with a causal language-modelling head and an emotion classification head

In [None]:
import sys

# Append the relative path ".." to the module search path so that imports in
# the following cells can also be resolved from the parent directory
# (presumably the repository root that contains `grouphug` — confirm).
sys.path.append("..")  # ensure we can run examples as-is in the package's poetry env

In [None]:
import pandas as pd
import torch
import transformers
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DebertaPreTrainedModel,
    GPTNeoXConfig,
    GPTNeoXPreTrainedModel,
    TrainingArguments,
)
from utils import compute_classification_metrics

from grouphug import AutoMultiTaskModel, ClassificationHeadConfig, DatasetFormatter, LMHeadConfig, MultiTaskTrainer

In [None]:
# Architecture of the GPT-NeoX model used in this demo. Only a configuration
# object is created here; no pretrained weights are referenced.
config = GPTNeoXConfig(
    hidden_size=768,  # width of the hidden representations
    intermediate_size=3072,  # feed-forward width, 4 x hidden_size
    num_attention_heads=12,
    num_hidden_layers=12,
    is_decoder=True,
)

In [None]:
# Emotion subset of tweet_eval; the "label" column is renamed to "emotion",
# which is the column name the classification head and trainer refer to.
tweet_emotion = load_dataset("tweet_eval", "emotion")
tweet_emotion = tweet_emotion.rename_column("label", "emotion")

# NOTE(review): wiki_data is loaded but not referenced anywhere else in the
# visible notebook — confirm whether it should be passed to the formatter too.
wiki_data = load_dataset("wikitext", "wikitext-2-v1")

# Only the tokenizer is taken from the pretrained GPT-NeoX-20B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

# Formatter with a tokenize step, applied to the tweet data with that tokenizer.
formatter = DatasetFormatter()
formatter = formatter.tokenize()
data = formatter.apply(tweet_emotion, tokenizer=tokenizer)

In [None]:
# Head 1: causal language modelling.
lm_head = LMHeadConfig(causal_language_modelling=True)
# Head 2: classifier for the "emotion" column, configured from the formatted data.
emotion_head = ClassificationHeadConfig.from_data(data, "emotion", classifier_hidden_size=32)
head_configs = [lm_head, emotion_head]

# Build the multi-task model from the config (not from a pretrained checkpoint).
model = AutoMultiTaskModel.from_config(
    config,
    head_configs=head_configs,
    tokenizer=tokenizer,
    formatter=formatter,
)

In [None]:
# Evaluate once per epoch for 10 epochs; save_strategy="no" disables checkpoint saving.
training_args = TrainingArguments(
    output_dir="../output", evaluation_strategy="epoch", num_train_epochs=10, save_strategy="no"
)

# Select the "train" and "test" splits from the formatted data.
train_split = data[:, "train"]
eval_split = data[:, "test"]

trainer = MultiTaskTrainer(
    model=model,
    tokenizer=tokenizer,
    train_data=train_split,
    eval_data=eval_split,
    eval_heads=["emotion"],  # only the emotion head is listed for evaluation
    compute_metrics=compute_classification_metrics,
    args=training_args,
)
# Kept as the last expression of the cell so the notebook still displays its result.
trainer.train()