In [1]:
# pip install transformers datasets torch

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import torch




In [3]:
# 1. Load the text document
def load_text_file(file_path):  #encoding='utf-8'
    with open(file_path, 'r') as file:
        text = file.read()
    return text

# 2. Prepare the dataset


In [4]:
def prepare_dataset(text, tokenizer, block_size=128):
    """
    Prepare a tokenized dataset from the input text.
    Splits the text into chunks of `block_size` and tokenizes each chunk.
    """
    # Split the text into chunks of block_size
    tokenized = tokenizer(text, return_tensors="pt", truncation=False)
    input_ids = tokenized["input_ids"].squeeze()

    # Create chunks
    chunked_input_ids = [
        input_ids[i:i + block_size]
        for i in range(0, len(input_ids), block_size)
        if len(input_ids[i:i + block_size]) == block_size
    ]

    # Convert chunks to Dataset format
    dataset = Dataset.from_dict({"input_ids": chunked_input_ids})
    return dataset


In [5]:
def fine_tune_gpt2(dataset, model, tokenizer, output_dir="./model"):
    """
    Fine-tune the GPT-2 model on a given dataset.
    """
    # Check the dataset
    if len(dataset) == 0:
        raise ValueError("The dataset is empty. Ensure the input text is correctly processed.")

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        save_steps=1000,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="no",
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_first_step=True,
        report_to="none"  # Disable reporting to external services
    )

    # Define data collator for padding
    def data_collator(features):
        batch = tokenizer.pad(features, padding=True, return_tensors="pt")
        return {"input_ids": batch["input_ids"], "labels": batch["input_ids"]}

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator
    )

    # Fine-tune the model
    trainer.train()

    # Save the fine-tuned model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


# 3. Fine-tune GPT-2

In [6]:
def interactive_qa(model, tokenizer):
    """
    Interact with the fine-tuned model by asking questions.
    """
    print("\n--- Start Asking Questions (type 'quit' to exit) ---")
    while True:
        question = input("Q: ")
        if question.lower() == 'quit':
            break
        inputs = tokenizer.encode(question, return_tensors="pt")
        outputs = model.generate(inputs, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)
        answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"A: {answer}\n")

In [None]:
# Main script
if __name__ == "__main__":
    # Load the pretrained GPT-2 model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

    
    # Load your text file
    text = load_text_file("koedia_custom_dataset.txt")
    print(text)
    # Prepare dataset
    dataset = prepare_dataset(text, tokenizer)

    print(dataset)
    # Fine-tune the model
    fine_tune_gpt2(dataset, model, tokenizer)

    # Use the fine-tuned model for interactive Q&A
    interactive_qa(model, tokenizer)

Question: what is the the name of  the capital city of Pakistan?
Answer: Islamabad (City of Islam) is the capital city of Pakistan.[9] It is the country's tenth-most populous city
with a population of 1,108,872 people[5][10] and is federally administered by the Pakistani government
as part of the Islamabad Capital Territory. Built as a planned city in the 1960s and established in 1967,
it replaced Karachi as Pakistan's national capital.
Question: Who made the master plan of Islamabad?
Answer: The Greek architect Constantinos Apostolou Doxiadis developed Islamabad's master plan, in which he
divided it into eight zones; the city comprises administrative, diplomatic enclave, residential areas,
educational and industrial sectors, commercial areas, as well as rural and green areas administered
by the Islamabad Metropolitan Corporation with support from the Capital Development Authority.
Islamabad is known for its parks and forests, including the Margalla Hills National
Park and the Shakarpa



Step,Training Loss
1,78.0424
