In [1]:
import wandb
import torch
from datasets import Dataset
from transformers import GPTNeoXForCausalLM, AutoTokenizer, Trainer, TrainingArguments, PushToHubCallback, DataCollatorForLanguageModeling
from huggingface_hub import HfFolder
import pandas as pd

# Log in to Hugging Face
HfFolder.save_token('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')

# Initialize wandb
wandb.init(project='model_finetuning')

[34m[1mwandb[0m: Currently logged in as: [33mmarcomolinari4[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Load dataset

In [2]:
df = pd.read_csv('sae-transfer-learning/data/training/train.csv')

In [3]:
dataset = Dataset.from_pandas(df)

## Tokenize

In [4]:
model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step143000",
  cache_dir="./pythia-70m-deduped/step143000",
)

tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step143000",
  cache_dir="./pythia-70m-deduped/step143000",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def tokenize_function(examples):
    return tokenizer(examples['prompt'], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### Load Eval

In [7]:
eval = pd.read_csv('sae-transfer-learning/data/training/eval.csv')

In [8]:
eval_dataset = Dataset.from_pandas(df)

In [9]:
def tokenize_function(examples):
    return tokenizer(examples['prompt'], padding="max_length", truncation=True)

eval_tokenized_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

## Fine tune

In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_torch",
    #fp16=True,
    report_to="wandb",
    push_to_hub=True,
    hub_strategy="end",
    save_strategy="no",  # Prevents local saving of the model checkpoints.
    logging_dir="./logs",  # Optional: Specify where to log events locally.
    logging_steps=10,  # Log metrics every 10 steps. Adjust as needed to see logs more frequently.
    log_level='info'  # Set log level to 'info' to ensure detailed logs are visible.
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
    data_collator=data_collator
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPTNeoXForCausalLM.forward` and have been ignored: input, instruction, prompt, output, Unnamed: 0. If input, instruction, prompt, output, Unnamed: 0 are not expected by `GPTNeoXForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,000
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12,500
  Number of trainable parameters = 70,426,624
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,2.7972,2.517712
2,2.4668,2.257811
3,2.2967,2.037624
4,2.06,1.819342
5,1.8328,1.643001
6,1.6048,1.464897


The following columns in the evaluation set don't have a corresponding argument in `GPTNeoXForCausalLM.forward` and have been ignored: input, instruction, prompt, output, Unnamed: 0. If input, instruction, prompt, output, Unnamed: 0 are not expected by `GPTNeoXForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `GPTNeoXForCausalLM.forward` and have been ignored: input, instruction, prompt, output, Unnamed: 0. If input, instruction, prompt, output, Unnamed: 0 are not expected by `GPTNeoXForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `GPTNeoXForCausalLM.forward` and have been ignored: input, instruction, prompt, output, Unnamed: 0. If input, instruction, prompt, outp

In [13]:
model.push_to_hub("marco-molinari/results")

Configuration saved in results/config.json
Configuration saved in results/generation_config.json
Model weights saved in results/model.safetensors
Uploading the following files to marco-molinari/results: README.md,config.json,generation_config.json,model.safetensors


model.safetensors:   0%|          | 0.00/282M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/marco-molinari/results/commit/ef2eca8e7cf705d45be868ea1d8c21ec84699a4d', commit_message='Upload GPTNeoXForCausalLM', commit_description='', oid='ef2eca8e7cf705d45be868ea1d8c21ec84699a4d', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPTNeoXForCausalLM.forward` and have been ignored: input, instruction, prompt, output, Unnamed: 0. If input, instruction, prompt, output, Unnamed: 0 are not expected by `GPTNeoXForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8


{'eval_loss': 1.1920833587646484,
 'eval_runtime': 77.3347,
 'eval_samples_per_second': 129.308,
 'eval_steps_per_second': 16.164,
 'epoch': 10.0}

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
prompt = "User: Name 3 vegetables. Assistant: "

inputs_prompt = tokenizer(prompt, return_tensors='pt')

input_ids = inputs_prompt.input_ids.to(device)
attention_mask = inputs_prompt.attention_mask.to(device)

output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=20, num_return_sequences=1)
tokenizer.decode(output[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


'User: Name 3 vegetables. Assistant: \n1. Mushrooms\n2. Broccoli\n3. Asparagus\n4. B'