In [None]:
!pip install transformers[sentencepiece]
!pip install datasets
!pip install evaluate



# Fine-Tuning a Pre-Trained Model

In [None]:
import transformers
import math
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM

We are going to fine-tune DistilGPT-2 (a distilled version of GPT-2) on the Hugging Face OpenWebText dataset.

Select the pre-trained model to use.

In [None]:
# Name of the pre-trained checkpoint; both the model weights and the tokenizer are loaded from it.
checkpoint = "distilgpt2"

# Causal language-modelling model initialised from the checkpoint.
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Matching tokenizer; the end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token


# Preparing the Data

Load the OpenWebText dataset for fine-tuning. Because the full dataset is very large, we load only the first 1,000 training examples.

In [None]:
from datasets import load_dataset

In [None]:
# Load only the first 1,000 rows of the "train" split (slice syntax "train[:1000]").
raw_datasets = load_dataset("openwebtext", split="train[:1000]")
# Bare last expression: the notebook displays the dataset object.
raw_datasets

In [None]:
# Display a single record (index 8) to see what a raw example looks like.
raw_datasets[8]

In [None]:
# Display the dataset's column names; the tokenization step passes them to `remove_columns`.
raw_datasets.column_names

Tokenize the whole dataset.

In [None]:
def tokenize_function(example):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Args:
        example: Mapping that contains a "text" entry to be tokenized.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    texts = example["text"]
    encoded = tokenizer(texts)
    return encoded


In [None]:
# Tokenize the dataset in batches using 4 worker processes.
# The original columns are removed, so only the tokenizer's outputs remain.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=raw_datasets.column_names,
)
tokenized_datasets

# Prepare the examples for input into the model

In [None]:
# Length, in tokens, of each training block: the maximum length reported by the tokenizer.
# NOTE(review): some tokenizers report a very large placeholder value here — confirm it is sensible for the chosen checkpoint.
block_size = tokenizer.model_max_length

def group_texts(examples, chunk_len=None):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. "input_ids") to a list of
            per-example token lists. Entries that are not lists are skipped.
        chunk_len: Block length in tokens. When omitted, the notebook-level
            ``block_size`` is used.

    Returns:
        A dict with the same keys as ``examples`` whose values are lists of
        blocks of exactly ``chunk_len`` items, plus a "labels" key holding a
        copy of the "input_ids" blocks. Trailing items that do not fill a
        whole block are dropped.

    Raises:
        ValueError: If the block length is not positive.
    """
    size = block_size if chunk_len is None else chunk_len
    if size <= 0:
        raise ValueError(f"block length must be positive, got {size}")

    # Flatten each column in linear time; sum(list_of_lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: [token for seq in seqs if isinstance(seq, list) for token in seq]
        for k, seqs in examples.items()
    }

    # The first column determines the total length
    # (assumes all columns flatten to the same length — TODO confirm for custom columns).
    total_length = len(concatenated_examples[list(examples)[0]])

    # Round down to a whole number of blocks; the remainder is discarded.
    total_length = (total_length // size) * size

    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

The tokenized texts are concatenated and split into fixed-size blocks, which are now ready to be fed into the model.

In [None]:
# Apply group_texts to batches of 1000 tokenized examples, using 4 worker processes.
grouping_options = dict(batched=True, batch_size=1000, num_proc=4)
lm_datasets = tokenized_datasets.map(group_texts, **grouping_options)

In [None]:
# Display the grouped dataset object.
lm_datasets

Convert the input token IDs back into text to check that the data was preprocessed correctly.

In [None]:
# Decode the token IDs of the block at index 1 back into text for a visual sanity check.
tokenizer.decode(lm_datasets[1]["input_ids"])

# Fine-Tune the Model

Training the pre-trained model on the new dataset.

In [None]:
from transformers import TrainingArguments, Trainer

Define the hyperparameters and other settings for fine-tuning the pre-trained model on the preprocessed dataset using the `TrainingArguments` class.

In [None]:
# Name the output directory after the last path component of the checkpoint id.
model_name = checkpoint.rsplit("/", 1)[-1]
output_dir = f"{model_name}-finetuned-openwebtext"

# Training configuration: evaluation is requested per epoch, and nothing is pushed to the Hub.
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

Set up the trainer.

In [None]:
# Hold out 10% of the blocks for evaluation, so the evaluation loss (and the perplexity
# derived from it) is measured on data the model was not trained on.
# A fixed seed keeps the split reproducible across runs.
lm_splits = lm_datasets.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_splits["train"],
    eval_dataset=lm_splits["test"],
)

Train.

In [None]:
# Run fine-tuning with the settings in training_args (which request evaluation via eval_strategy="epoch").
trainer.train()

# Evaluation

Evaluate the fine-tuned model and report its perplexity.

In [None]:
# Perplexity is computed as the exponential of the evaluation loss.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

Inspecting examples from the dataset.

In [None]:
# Print the raw text of the record at index 300.
print(raw_datasets[300]['text'])

Testing the model on our own samples by generating text from a user-supplied prompt.

In [None]:
user_input_text = input("Enter text prompt: ")

# An empty prompt tokenizes to zero tokens, leaving generate() nothing to condition on;
# fall back to the end-of-sequence token so generation starts from a blank context.
prompt_text = user_input_text if user_input_text else tokenizer.eos_token

# Use whichever device the model is on, rather than assuming a GPU is available.
device = model.device

# Switch to inference mode so dropout does not perturb sampling.
model.eval()

# Tokenize the input with attention_mask
inputs = tokenizer(prompt_text, return_tensors='pt', padding=True)
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

# Generate text
output = model.generate(
    input_ids,
    attention_mask=attention_mask,  # tells the model which positions are real tokens
    max_length=100,  # total length cap, prompt tokens included
    num_return_sequences=1,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("\nGenerated text:")
print(generated_text)