In [1]:
!pip install transformers
!pip install datasets



In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling


In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import os

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"


# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # You can also use "gpt2-medium", "gpt2-large", etc.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token # Use the end-of-sequence token as the padding token


model = GPT2LMHeadModel.from_pretrained(model_name)

# Load the dataset using the datasets library
def load_dataset_from_file(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    return {"text": lines}

# Create the dataset
train_file = "/content/sample_data/traintwo.txt"
dataset_dict = load_dataset_from_file(train_file)
train_dataset = load_dataset('text', data_files={'train': train_file})['train']


# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)


train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for dynamic padding
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    report_to="none" # Disable wandb logging
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Step,Training Loss


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load the fine-tuned model and tokenizer
model_path = "./gpt2-finetuned"  # Path where the fine-tuned model is saved
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Set up the text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Provide a prompt for the model to generate text
prompt = "Red Riding Hood set off through the forest"

# Generate text based on the prompt
output = generator(prompt, max_length=50, num_return_sequences=1)

# Print the generated text
print("Generated Text:")
print(output[0]['generated_text'])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Text:
Red Riding Hood set off through the forest.

She was walking with a blacksmith in the street, carrying a handful of old books.

"I found that the woods were dark; there was a tree with some vines in it.
