## Preparation

In [2]:
# Code reference: https://huggingface.co/docs/transformers/tasks/language_modeling

# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

! pip install transformers[torch]
! pip install accelerate -U
! pip install trl

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [3]:
import os
import json

from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
file_path = 'drive/MyDrive/SNLP/emotion_alpaca_v4_cleaned.json'

with open(file_path, 'r') as file:
    data = json.load(file)

In [6]:
dataset = Dataset.from_dict({
    'instruction': [item['instruction'] for item in data],
    'input': [item['input'] for item in data],
    'output': [item['rewritten_output'] for item in data]
})

In [34]:
dataset = dataset.train_test_split(test_size=0.1)

In [37]:
dataset['train'][0]

{'instruction': 'Create a task list for an imaginary company.',
 'input': '',
 'output': 'Relieved Task List for Imaginary Company:\n\n1. Hold regular staff meeting reminders for each team member.\n\n2. Automatically update our website with fresh content using predefined templates and schedules.\n\n3. Allocate resources for monthly market research sessions to keep tabs on competitors and changing customer demands.\n\n4. Implement marketing strategies like social media campaigns, email blasts, and targeted ads to boost visibility and attract prospects.\n\n5. Follow up with interested clients via phone calls and emails.\n\n6. Evaluate quarterly sales reports to determine growth opportunities and adjustments needed.\n\n7. Establish connections with reliable supplier networks for smooth operations.\n\n8. Keep track of accounting tasks regularly and make sure all expenses are documented properly.\n\n9. Prepare comprehensive welcome packs and provide ongoing support for newly hired staff mem

In [38]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [10]:
def formatting_func(batch):
    # Process each example in the batch and return a list of formatted strings
    return [
        f"{tokenizer.bos_token} Instruction: {instr} Input: {inp} Output: {out} {tokenizer.eos_token}"
        for instr, inp, out in zip(batch['instruction'], batch['input'], batch['output'])
    ]

In [11]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
model = AutoModelForCausalLM.from_pretrained("gpt2")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [28]:
LR = 5e-5            # Learning rate
PATIENCE = 10        # Patience for early stopping
BSZ = 4              # Batch size
EVAL_EVERY = 200     # Evaluate every X steps
SAVE_EVERY = 200     # Save model checkpoint every X steps
MAX_EPOCHS = 10      # Maximum number of epochs

In [39]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="gpt2_finetuned",
    evaluation_strategy="steps",
    learning_rate=LR,
    per_device_train_batch_size=BSZ,
    per_device_eval_batch_size=BSZ,
    num_train_epochs=MAX_EPOCHS,
    eval_steps=EVAL_EVERY,
    save_steps=SAVE_EVERY,
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,
    fp16=True
)

# Initialize SFTTrainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
    max_seq_length=1024,
    formatting_func=formatting_func
)

Map:   0%|          | 0/18798 [00:00<?, ? examples/s]

Map:   0%|          | 0/2089 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss
200,No log,2.76101
400,No log,2.73323
600,2.897800,2.702891
800,2.897800,2.682132
1000,2.806000,2.663967
1200,2.806000,2.649289


In [None]:
model.save_pretrained("./model_folder")
tokenizer.save_pretrained("./model_folder")

In [None]:
eval_results = trainer.evaluate(eval_dataset=dataset['test'])
print(eval_results)

In [None]:
from transformers import pipeline

# Load the trained model into a pipeline for easy inference
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example prompt
prompt = "The capital of France is"
generated = pipe(prompt, max_length=50, num_return_sequences=1)

print("Input Prompt:", prompt)
print("Generated Text:", generated[0]["generated_text"])