In [None]:
import csv
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from peft import get_peft_model, LoraConfig
from transformers import (
    GPT2LMHeadModel, GPT2Tokenizer, Trainer,
    TrainingArguments, DataCollatorForLanguageModeling
)

In [None]:
# Load asv.txt with the 'text' dataset builder and register it as the 'train' split.
bible_text_dataset = load_dataset(
    'text',
    data_files={'train': 'asv.txt'},
)
# Display the resulting dataset object as the cell output.
bible_text_dataset

In [None]:
# Spot-check the raw text of the record at index 3000 of the train split.
bible_text_train_split = bible_text_dataset['train']
bible_text_train_split[3000]['text']

In [None]:
# Collect the question and answer columns of the tab-separated BibleQA file
# into two parallel lists, then wrap them in a DataFrame.
bible_qa_pairs = {
    'questions': [],
    'answers': []
}
# newline='' is what the csv module expects for file objects given to csv.reader.
with open('./BibleQA/data/bible_qa/1001.csv.csv', newline='') as bible_qa_pairs_csv_file:
    bible_qa_csv_file_reader = csv.reader(bible_qa_pairs_csv_file, delimiter='\t')
    # The first row is consumed as the header; the None default avoids a
    # StopIteration if the file turns out to be empty.
    header = next(bible_qa_csv_file_reader, None)
    for row in bible_qa_csv_file_reader:
        # Skip blank lines and rows that do not carry both a question and an
        # answer column (indexing row[1] on those would raise IndexError).
        if len(row) < 2:
            continue
        # Keep everything after the FIRST '. ' only, so later '. ' separators
        # inside the text are preserved instead of being deleted.
        # NOTE(review): presumably the dropped part is an enumeration prefix
        # such as '12. ' -- confirm against the data file. Text without any
        # '. ' still yields an empty string, as before.
        bible_qa_pairs['questions'].append(row[0].partition('. ')[2])
        # Same prefix removal, then cut the answer at the first ' ('.
        bible_qa_pairs['answers'].append(
            row[1].partition('. ')[2].partition(' (')[0])
bible_qa_pairs_df = pd.DataFrame(bible_qa_pairs)
bible_qa_pairs_df.head()

In [None]:
model_name = 'gpt2'

# Three independent copies of the same checkpoint: `pretrained_model` is kept
# as the untouched baseline, `fine_tuned_model` is handed to the Trainer, and
# `peft_model` is later wrapped with a LoRA adapter.
pretrained_model, fine_tuned_model, peft_model = (
    GPT2LMHeadModel.from_pretrained(model_name) for _ in range(3)
)

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_bible_text(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every entry is truncated and padded to exactly 512 tokens using the
    notebook-level `tokenizer`.

    Args:
        examples: Batch mapping that provides a 'text' entry.

    Returns:
        The tokenizer output for the batch.
    """
    encoded_batch = tokenizer(
        examples['text'],
        max_length=512,
        padding='max_length',
        truncation=True,
    )
    return encoded_batch


# Apply the tokenizer batch-wise to every split of the dataset.
tokenized_bible_text = bible_text_dataset.map(tokenize_bible_text, batched=True)

In [None]:
# `tokenize_bible_text` and `tokenized_bible_text` are already defined by the
# previous cell; defining the same function and re-running the same `.map`
# call here was a verbatim duplicate. Inspect the tokenized dataset instead.
tokenized_bible_text

In [None]:
# Training configuration: a single epoch with a per-device batch size of 4.
# A checkpoint is written every 10,000 steps and at most two are kept.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_total_limit=2,
    save_steps=10_000,
)

# mlm=False turns off masked-language-model masking in the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Full fine-tuning of `fine_tuned_model` on the tokenized train split.
trainer = Trainer(
    model=fine_tuned_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_bible_text['train'],
)
trainer.train()

In [None]:
# LoRA adapter configuration applied to the modules named 'c_attn' and 'c_proj'.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['c_attn', 'c_proj'],
    lora_dropout=0.1,
    bias='none',
    # GPT-2 implements these layers as Conv1D, which stores weights as
    # (fan_in, fan_out); PEFT documents fan_in_fan_out=True for that case.
    fan_in_fan_out=True,
    # Yields the causal-LM PEFT wrapper, whose forward() exposes the
    # input_ids / attention_mask / labels arguments explicitly.
    task_type='CAUSAL_LM'
)
peft_model = get_peft_model(peft_model, lora_config)

# Build a fresh Trainer for the LoRA model. Assigning `trainer.model` on the
# Trainer that already trained `fine_tuned_model` does not rebuild its
# internal state (optimizer, wrapped model, device placement), so a second
# `train()` on that object would not reliably train the adapter.
# `trainer` is rebound so that `trainer.model` is `peft_model` afterwards,
# exactly as callers of the previous version of this cell would expect.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_bible_text['train'],
    data_collator=data_collator
)
trainer.train()

In [None]:
def generate_answer(model: GPT2LMHeadModel, question: str) -> str:
    """Generate a text continuation for `question` with `model`.

    Args:
        model: Causal language model exposing `eval()`, `device` and
            `generate()`. The notebook also passes the PEFT-wrapped model here.
        question: Prompt text. A blank or whitespace-only prompt is not sent
            to the model.

    Returns:
        The decoded first generated sequence with special tokens removed, or
        '' for a blank question. The whole sequence is decoded, so for GPT-2
        the returned text is expected to start with the question itself.

    Side effects:
        Switches `model` to evaluation mode.
    """
    if not question.strip():
        return ''
    # Models that went through Trainer.train() are left in training mode;
    # switch to eval mode so dropout does not perturb generation.
    model.eval()
    # Use the tokenizer's own attention mask. The pad token is the EOS token
    # in this notebook, so deriving the mask from `pad_token_id` would also
    # mask any genuine EOS token inside the prompt.
    encoded_question = tokenizer(
        question,
        return_tensors='pt'
    ).to(model.device)
    model_output = model.generate(
        input_ids=encoded_question['input_ids'],
        attention_mask=encoded_question['attention_mask'],
        # max_length counts the prompt tokens as well as the generated ones.
        max_length=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    answer = tokenizer.decode(
        model_output[0],
        skip_special_tokens=True
    )
    return answer

In [None]:
# One output list per model, filled in question order.
pretrained_model_answers, fine_tuned_model_answers, peft_model_answers = [], [], []

# Pair every model with the list that collects its answers.
answer_targets = (
    (pretrained_model, pretrained_model_answers),
    (fine_tuned_model, fine_tuned_model_answers),
    (peft_model, peft_model_answers),
)
for question in tqdm(bible_qa_pairs_df['questions']):
    # Same per-question order as before: pretrained, fine-tuned, then PEFT.
    for candidate_model, collected_answers in answer_targets:
        collected_answers.append(generate_answer(candidate_model, question))

# Attach the generated answers as new columns next to the reference answers.
generated_answer_columns = {
    'pretrained_model_answers': pretrained_model_answers,
    'fine_tuned_model_answers': fine_tuned_model_answers,
    'peft_model_answers': peft_model_answers,
}
for answer_column_name, answer_column_values in generated_answer_columns.items():
    bible_qa_pairs_df[answer_column_name] = answer_column_values

bible_qa_pairs_df.head()