In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [4]:
import pandas as pd
from transformers import GPT2Tokenizer

# Load the CSV data
data = pd.read_csv("buddhist_q_a.csv")

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Preprocess the data
def preprocess_qa(q, a):
    return f"Question: {q}\nAnswer: {a}\n"

# Combine the questions and answers using the preprocess function
text_data = [preprocess_qa(row["question"], row["answer"]) for _, row in data.iterrows()]

# Tokenize the text data
tokenized_data = [tokenizer.encode(text) for text in text_data]

In [9]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, AdamW

# Custom Dataset class
class TextDataset(Dataset):
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        return self.tokenized_texts[idx]

# Custom Collator class
class Collator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        inputs = [torch.tensor(item, dtype=torch.long) for item in batch]
        inputs = pad_sequence(inputs, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        return inputs

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Training parameters
batch_size = 4
epochs = 6
lr = 1e-5

# Load the pre-trained GPT-2 model
config = GPT2Config.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

# Create the Dataset and DataLoader
dataset = TextDataset(tokenized_data)
collator_fn = Collator(tokenizer)
train_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collator_fn)

# Move the model to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=lr)

# Training loop
model.train()
for epoch in range(epochs):
    for batch in train_dataloader:
        # Move the batch to the device
        batch = batch.to(device)

        # Forward pass
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()
        optimizer.zero_grad()

        # Print the loss
        print(f"Loss: {loss.item()}")


Loss: 4.492715835571289
Loss: 6.328117847442627
Loss: 5.293035984039307
Loss: 4.846797466278076
Loss: 4.42454719543457
Loss: 3.9342041015625
Loss: 3.9833872318267822
Loss: 3.821706771850586
Loss: 3.3678717613220215
Loss: 3.158482313156128
Loss: 2.8140087127685547
Loss: 2.785637378692627
Loss: 2.922175168991089
Loss: 2.276271104812622
Loss: 2.1758272647857666
Loss: 2.66987943649292
Loss: 2.355759620666504
Loss: 2.5381062030792236
Loss: 2.6413075923919678
Loss: 2.678985834121704
Loss: 2.123674154281616
Loss: 2.1510376930236816
Loss: 2.3437981605529785
Loss: 2.2679924964904785
Loss: 2.4316518306732178
Loss: 2.6425180435180664
Loss: 1.8036822080612183
Loss: 1.3015156984329224
Loss: 2.0854525566101074
Loss: 1.9218826293945312
Loss: 2.1241402626037598
Loss: 2.2575793266296387
Loss: 2.3994438648223877
Loss: 1.9485573768615723
Loss: 2.0097875595092773
Loss: 2.1713318824768066
Loss: 2.0759596824645996
Loss: 2.159410238265991
Loss: 2.4755184650421143
Loss: 1.5593750476837158
Loss: 1.240760684013

In [10]:
model.save_pretrained("fine_tuned_gpt2_buddhism_ai")