In [1]:
!wget "https://huggingface.co/spaces/nnsohamnn/Conv_GPT/resolve/main/Conv_GPT.pth"

--2025-03-30 19:58:06--  https://huggingface.co/spaces/nnsohamnn/Conv_GPT/resolve/main/Conv_GPT.pth
Resolving huggingface.co (huggingface.co)... 3.163.189.90, 3.163.189.114, 3.163.189.37, ...
Connecting to huggingface.co (huggingface.co)|3.163.189.90|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/84/f0/84f0e4e90ae2a049be03456e0d9bdf6a60a4303d4a1c3993fe02d7554d78df6a/d07006505c691bae29120861fbc9dfe9ad3b75d4964e38b8445020991d4d6b17?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27Conv_GPT.pth%3B+filename%3D%22Conv_GPT.pth%22%3B&Expires=1743368286&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MzM2ODI4Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzg0L2YwLzg0ZjBlNGU5MGFlMmEwNDliZTAzNDU2ZTBkOWJkZjZhNjBhNDMwM2Q0YTFjMzk5M2ZlMDJkNzU1NGQ3OGRmNmEvZDA3MDA2NTA1YzY5MWJhZTI5MTIwODYxZmJjOWRmZTlhZDNiNzVkNDk2NGUzOGI4NDQ1MDIwOTkxZDRkNmIxNz9yZXNwb25zZS1jb250ZW50LWRp

In [2]:
!pip install transformers datasets tqdm



In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from datasets import load_dataset
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import random

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
vocab_size = tokenizer.vocab_size

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, hidden_size, num_heads, dropout=0.1):
        super().__init__()
        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, hidden_size)

        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.head_dim)

    def forward(self, x, mask=None, padding_mask=None):
        batch_size, seq_len, _ = x.size()

        q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale

        if mask is not None:
            scores = scores.masked_fill(mask == 1, -1e4)
        if padding_mask is not None:
            padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)
            scores = scores.masked_fill(padding_mask, -1e4)

        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)

        out = torch.matmul(attn, v).transpose(1, 2).contiguous().view(batch_size, seq_len, self.hidden_size)
        out = self.out(out)
        return out

class TransformerLayer(nn.Module):
    def __init__(self, hidden_size, num_heads, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadSelfAttention(hidden_size, num_heads, dropout)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.ReLU(),
            nn.Linear(4 * hidden_size, hidden_size),
            nn.Dropout(dropout)
        )
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None, padding_mask=None):
        x = self.ln1(x)
        attn_out = self.attn(x, mask, padding_mask)
        x = x + self.dropout(attn_out)

        x = self.ln2(x)
        ffn_out = self.ffn(x)
        x = x + self.dropout(ffn_out)
        return x

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, hidden_size=512, num_layers=6, num_heads=8, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, hidden_size)
        self.pos_embedding = nn.Embedding(512, hidden_size)
        self.layers = nn.ModuleList([
            TransformerLayer(hidden_size, num_heads, dropout) for _ in range(num_layers)
        ])
        self.final_ln = nn.LayerNorm(hidden_size)
        self.head = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, padding_mask=None):
        batch_size, seq_len = input_ids.size()
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand_as(input_ids)
        x = self.token_embedding(input_ids) + self.pos_embedding(positions)
        x = self.dropout(x)

        causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=input_ids.device), diagonal=1).bool()

        for layer in self.layers:
            x = layer(x, causal_mask, padding_mask)

        x = self.final_ln(x)
        logits = self.head(x)
        return logits

In [6]:
hidden_size = 512
num_layers = 12
num_heads = 16
dropout = 0.1

model = TransformerModel(vocab_size=vocab_size, hidden_size=hidden_size,
                         num_layers=num_layers, num_heads=num_heads,
                         dropout=dropout).to(device)

In [7]:
model_path = "Conv_GPT.pth"
model.load_state_dict(torch.load(model_path, map_location=device))
print("Pre-trained model loaded successfully!")

Pre-trained model loaded successfully!


In [8]:
dataset = load_dataset("blended_skill_talk", split="train")
print(f"Dataset loaded with {len(dataset)} examples")


max_examples = 120
dataset = dataset.select(range(min(len(dataset), max_examples)))
print(f"Using {len(dataset)} examples for fine-tuning")

Dataset loaded with 4819 examples
Using 120 examples for fine-tuning


In [9]:
def format_dialogue(example):
    formatted = []
    for i, utterance in enumerate(example['guided_messages']):
        if not utterance.strip():
            continue
        speaker = "User: " if i % 2 == 0 else "Assistant: "
        formatted.append(speaker + utterance)
    return "\n".join(formatted)

dialogues = []
for example in tqdm(dataset, desc="Formatting dialogues"):
    dialogue = format_dialogue(example)
    dialogues.append(dialogue)

Formatting dialogues: 100%|██████████| 120/120 [00:00<00:00, 5640.22it/s]


In [10]:
class DialogueDataset(Dataset):
    def __init__(self, dialogues):
        self.sequences = []
        for dialogue in tqdm(dialogues, desc="Tokenizing"):
            tokenized = tokenizer(dialogue, max_length=512, truncation=True, return_tensors='pt')['input_ids'][0]
            self.sequences.append(tokenized)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx]

def collate_fn(batch):
    lengths = [len(seq) for seq in batch]
    max_len = max(lengths)
    padded_seqs = torch.full((len(batch), max_len), tokenizer.pad_token_id, dtype=torch.long)
    for i, seq in enumerate(batch):
        padded_seqs[i, :len(seq)] = seq
    padding_mask = padded_seqs == tokenizer.pad_token_id
    return padded_seqs, padding_mask

In [11]:
fine_tune_dataset = DialogueDataset(dialogues)
fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5, eps = 1e-7)
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
scaler = torch.cuda.amp.GradScaler()

Tokenizing: 100%|██████████| 120/120 [00:00<00:00, 801.65it/s]
  scaler = torch.cuda.amp.GradScaler()


In [12]:
def train_epoch(model, loader, optimizer, scaler):
    model.train()
    total_loss, total_acc = 0, 0
    for input_ids, padding_mask in tqdm(loader, desc="Fine-tuning"):
        input_ids, padding_mask = input_ids.to(device), padding_mask.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(input_ids, padding_mask)
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = input_ids[:, 1:].contiguous()
            loss = loss_fn(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        preds = shift_logits.argmax(dim=-1)
        mask = shift_labels != tokenizer.pad_token_id
        correct = (preds == shift_labels) & mask
        acc = correct.sum().float() / mask.sum().float()
        total_loss += loss.item()
        total_acc += acc.item()
    return total_loss / len(loader), total_acc / len(loader)

In [13]:
def evaluate(model, loader):
    model.eval()
    total_loss, total_acc = 0, 0
    with torch.no_grad():
        for input_ids, padding_mask in tqdm(loader, desc="Evaluating"):
            input_ids, padding_mask = input_ids.to(device), padding_mask.to(device)
            with torch.cuda.amp.autocast():
                logits = model(input_ids, padding_mask)
                shift_logits = logits[:, :-1, :].contiguous()
                shift_labels = input_ids[:, 1:].contiguous()
                loss = loss_fn(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
            preds = shift_logits.argmax(dim=-1)
            mask = shift_labels != tokenizer.pad_token_id
            correct = (preds == shift_labels) & mask
            acc = correct.sum().float() / mask.sum().float()
            total_loss += loss.item()
            total_acc += acc.item()
    return total_loss / len(loader), total_acc / len(loader)

In [14]:
def generate_text(model, prompt, max_new_tokens=50):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    generated_ids = input_ids
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(generated_ids, padding_mask=None)
            next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(1)
            generated_ids = torch.cat([generated_ids, next_token], dim=1)
            if next_token.item() == tokenizer.eos_token_id or tokenizer.decode(next_token.item()) == '\n':
                break
    response = tokenizer.decode(generated_ids[0, len(input_ids[0]):])
    return response.strip()

In [15]:
train_size = int(0.85 * len(fine_tune_dataset))
val_size = len(fine_tune_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(fine_tune_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

In [16]:
num_epochs = 5
train_losses, val_losses, train_accs, val_accs = [], [], [], []

for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scaler)
    train_losses.append(train_loss)
    train_accs.append(train_acc)

    val_loss, val_acc = evaluate(model, val_loader)
    val_losses.append(val_loss)
    val_accs.append(val_acc)

    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")

  with torch.cuda.amp.autocast():
Fine-tuning: 100%|██████████| 26/26 [00:02<00:00, 10.22it/s]
Evaluating: 100%|██████████| 5/5 [00:00<00:00, 46.53it/s]


Epoch 1: Train Loss=6.3302, Val Loss=5.5159, Train Acc=0.2218, Val Acc=0.2405


Fine-tuning: 100%|██████████| 26/26 [00:02<00:00, 12.53it/s]
Evaluating: 100%|██████████| 5/5 [00:00<00:00, 55.13it/s]


Epoch 2: Train Loss=4.9039, Val Loss=5.1479, Train Acc=0.2702, Val Acc=0.2617


Fine-tuning: 100%|██████████| 26/26 [00:02<00:00, 11.97it/s]
Evaluating: 100%|██████████| 5/5 [00:00<00:00, 47.21it/s]


Epoch 3: Train Loss=4.3426, Val Loss=4.9804, Train Acc=0.3015, Val Acc=0.2749


Fine-tuning: 100%|██████████| 26/26 [00:02<00:00, 11.54it/s]
Evaluating: 100%|██████████| 5/5 [00:00<00:00, 37.56it/s]


Epoch 4: Train Loss=3.9192, Val Loss=4.9306, Train Acc=0.3369, Val Acc=0.2775


Fine-tuning: 100%|██████████| 26/26 [00:02<00:00, 11.35it/s]
Evaluating: 100%|██████████| 5/5 [00:00<00:00, 53.90it/s]

Epoch 5: Train Loss=3.5134, Val Loss=4.9883, Train Acc=0.3768, Val Acc=0.2706





In [17]:
fine_tuned_model_path = "Conv_GPT_finetuned_blended_skill.pth"
torch.save(model.state_dict(), fine_tuned_model_path)
print(f"Fine-tuned model saved to {fine_tuned_model_path}")

Fine-tuned model saved to Conv_GPT_finetuned_blended_skill.pth


In [19]:
test_prompts = [
    "User: Do you like books?\nAssistant:",
    "User: How much the ticket cost?\nAssistant:",
    "User: What is your favorite hobby?.\nAssistant:",
    "User: What are you doing later?\nAssistant:"
]

print("\nTesting the fine-tuned model:")
for prompt in test_prompts:
    response = generate_text(model, prompt, max_new_tokens=100)
    print(f"\nPrompt: {prompt}")
    print(f"Response: {response}")
    print("-" * 50)


Testing the fine-tuned model:

Prompt: User: Do you like books?
Assistant:
Response: I like reading books. I like reading books.
--------------------------------------------------

Prompt: User: How much the ticket cost?
Assistant:
Response: I have a ticket for a week.
--------------------------------------------------

Prompt: User: What is your favorite hobby?.
Assistant:
Response: I like reading it as well but I enjoy watching reading and watching movies.
--------------------------------------------------

Prompt: User: What are you doing later?
Assistant:
Response: I'm going to eat and eat a lot of fruit .
--------------------------------------------------
