In [1]:
!pip install transformers sentence-transformers datasets



In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import re
from datasets import Dataset
import pandas as pd

###Data Extraction

In [4]:
def extract_qa_pairs_and_create_dataset():
    df = pd.read_csv('/kaggle/input/stackoverflow-python/qa_stackoverflow_python_high_score_1.csv')
    data = df.apply(lambda row: {"question": row['Question'], "context": row['Answer']}, axis=1).tolist()

    return Dataset.from_list(data)

qa_dataset = extract_qa_pairs_and_create_dataset()
print(qa_dataset)

Dataset({
    features: ['question', 'context'],
    num_rows: 18659
})


In [5]:
from sentence_transformers import SentenceTransformer

# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for the dataset
def generate_embeddings(dataset):
    # Encode questions and contexts in batches
    question_embeddings = embedding_model.encode(dataset["question"], convert_to_tensor=True, batch_size=16)
    context_embeddings = embedding_model.encode(dataset["context"], convert_to_tensor=True, batch_size=16)
    return question_embeddings, context_embeddings

# Example: Generate embeddings
question_embeddings, context_embeddings = generate_embeddings(qa_dataset)
print("Embeddings generated successfully!")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1167 [00:00<?, ?it/s]

Batches:   0%|          | 0/1167 [00:00<?, ?it/s]

Embeddings generated successfully!


In [6]:
# Format the dataset for GPT-2 fine-tuning
def format_qa_for_gpt2(dataset):
    formatted_data = []
    for question, context in zip(dataset["question"], dataset["context"]):
        formatted_data.append({
            "prompt": f"Question: {question}\nAnswer:",
            "completion": context
        })
    return Dataset.from_list(formatted_data)

# Example: Format dataset
formatted_qa_dataset = format_qa_for_gpt2(qa_dataset)
formatted_qa_dataset.save_to_disk("formatted_qa_dataset")
print("Formatted dataset saved to disk.")

Saving the dataset (0/1 shards):   0%|          | 0/18659 [00:00<?, ? examples/s]

Formatted dataset saved to disk.


###Training

In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_from_disk
import wandb
wandb.init(mode="disabled")
# Load formatted dataset
formatted_qa_dataset = load_from_disk("formatted_qa_dataset")

# Tokenizer and model setup
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["prompt"] + examples["completion"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_dataset = formatted_qa_dataset.map(tokenize_function, batched=True, remove_columns=formatted_qa_dataset.column_names)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5
)

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal language modeling
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/18659 [00:00<?, ? examples/s]



Step,Training Loss
100,2.6003
200,2.4697
300,2.4328
400,2.392
500,2.375
600,2.3758
700,2.3113
800,2.2937
900,2.3204
1000,2.2523


TrainOutput(global_step=13995, training_loss=2.1076025589060126, metrics={'train_runtime': 8998.88, 'train_samples_per_second': 12.441, 'train_steps_per_second': 1.555, 'total_flos': 2.9252688150528e+16, 'train_loss': 2.1076025589060126, 'epoch': 3.0})

In [None]:
trainer.save_model("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")