In [1]:
# Do all imports needed
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
import pandas as pd

In [2]:
# Use GPU if possible
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
# Mount google drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Go to proper directory
%cd /content/gdrive/MyDrive/Colab_Notebooks/Transformers

/content/gdrive/MyDrive/Colab_Notebooks/Transformers


In [6]:
# Load the dataset
file_path = '/content/gdrive/MyDrive/Colab_Notebooks/Transformers/Election-Related_Questions_with_Answers.csv'
data = pd.read_csv(file_path)

# Explore the dataset
print(data.head())


                                            Question  \
0  What percentage of support does Kamala Harris ...   
1  Is the support for Kamala Harris in the primar...   
2  How does Kamala Harris's current support compa...   
3  What percentage of support does Gretchen Whitm...   
4  Is the support for Gretchen Whitmer in the pri...   

                                              Answer  
0  Kamala Harris has 77.0% support among DEM vote...  
1  Yes, the support for Kamala Harris in the prim...  
2  This requires comparing Kamala Harris's 77.0% ...  
3  Gretchen Whitmer has 3.0% support among DEM vo...  
4  Yes, the support for Gretchen Whitmer in the p...  


In [7]:
# Make question and answer one line for training
data['text'] = data['Question'] + " " + data['Answer']


In [8]:
# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples, padding=True, truncation=True, max_length=512, return_tensors="pt")

tokenized_texts = data['text'].apply(tokenize_function)

In [9]:
class ProxyTuningDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = [item['input_ids'].squeeze(0) for item in tokenized_data]
        self.attention_mask = [item['attention_mask'].squeeze(0) for item in tokenized_data]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
        }

# Create Dataset and DataLoader
dataset = ProxyTuningDataset(tokenized_texts)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=data_collator)


In [10]:
optimizer = AdamW(model.parameters(), lr=5e-5)
model.train()

# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    total_loss = 0
    batch_count = 0
    for batch in dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch, labels=batch["input_ids"])
        loss = outputs.loss
        total_loss += loss.item()
        batch_count += 1
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    average_loss = total_loss / batch_count
    print(f"Average loss for epoch {epoch + 1}: {average_loss:.4f}")

# Save the fine-tuned model and tokenizer
output_dir = "/content/gdrive/MyDrive/Colab_Notebooks/Transformers/fine_tuned_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")



Epoch 1/3
Average loss for epoch 1: 0.1548
Epoch 2/3
Average loss for epoch 2: 0.1413
Epoch 3/3
Average loss for epoch 3: 0.1403
Model and tokenizer saved to /content/gdrive/MyDrive/Colab_Notebooks/Transformers/fine_tuned_model
