In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import get_cosine_schedule_with_warmup, AdamW
from datasets import load_dataset
from tqdm import tqdm
import random

In [3]:
# Load WikiText-103 dataset
dataset = load_dataset("wikitext", "wikitext-103-v1")


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/722k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoTokenizer

# Initialize RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Remove empty or whitespace-only entries
dataset = dataset.filter(
    lambda x: x["text"] is not None and len(x["text"].strip()) > 0
)

# Tokenization function with MLM masking
def tokenize_function(examples):
    inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
        return_tensors="pt",
    )

    labels = inputs["input_ids"].clone()  # Copy input_ids for labels

    rand = torch.rand(labels.shape)
    mask_token_id = tokenizer.mask_token_id
    vocab_size = tokenizer.vocab_size

    mask_arr = (rand < 0.15) & (labels != tokenizer.pad_token_id) & (labels != tokenizer.cls_token_id) & (labels != tokenizer.sep_token_id)

    # 80%: Replace with [MASK]
    mask_indices = mask_arr & (torch.rand(labels.shape) < 0.8)
    inputs["input_ids"][mask_indices] = mask_token_id

    # 10%: Replace with random token
    random_indices = mask_arr & (torch.rand(labels.shape) < 0.1)
    inputs["input_ids"][random_indices] = torch.randint(0, vocab_size, labels.shape, dtype=torch.long)[random_indices]

    # 10%: Keep original token (handled automatically by `labels`)

    inputs["labels"] = labels  # MLM Labels
    inputs["attention_mask"] = (inputs["input_ids"] != tokenizer.pad_token_id).long()  # Padding Mask
    return inputs


# Tokenize the dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],  # Remove the original text column
)

# Print sample tokenized output
print(tokenized_dataset["train"][0])


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

{'input_ids': [0, 5457, 468, 44068, 6374, 41674, 6395, 50264, 50264, 50118, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [0, 5457, 468, 44068, 6374, 41674, 6395, 5457, 1437, 50118, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [5]:
tokenized_dataset.save_to_disk("/kaggle/working/tokenized_roberta")

Saving the dataset (0/1 shards):   0%|          | 0/2891 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/1165029 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2461 [00:00<?, ? examples/s]

In [6]:
# Convert dataset to PyTorch tensors
class WikiTextDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.dataset["input_ids"][idx]),
            "labels": torch.tensor(self.dataset["labels"][idx]),
            "attention_mask": torch.tensor(self.dataset["attention_mask"][idx]),
        }

train_dataset = WikiTextDataset(tokenized_dataset["train"])
valid_dataset = WikiTextDataset(tokenized_dataset["validation"])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)

In [7]:
#Roberta model Definition

import math

class RoBERTaEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_len, embed_dim)
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids):
        seq_length = input_ids.shape[1]
        position_ids = torch.arange(seq_length, device=input_ids.device).expand_as(input_ids)
        embeddings = self.token_embedding(input_ids) + self.position_embedding(position_ids)
        return self.dropout(self.layer_norm(embeddings))

class MultiheadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, attention_mask):
        batch_size, seq_length, embed_dim = x.size()
        qkv = self.qkv_proj(x).reshape(batch_size, seq_length, self.num_heads, 3 * self.head_dim)
        q, k, v = torch.chunk(qkv, 3, dim=-1)

        q, k, v = [tensor.transpose(1, 2) for tensor in (q, k, v)]

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if attention_mask is not None:
            scores = scores.masked_fill(attention_mask[:, None, None, :] == 0, float('-inf'))

        attn_weights = torch.nn.functional.softmax(scores, dim=-1)
        attn_output = (attn_weights @ v).transpose(1, 2).reshape(batch_size, seq_length, embed_dim)

        return self.o_proj(self.dropout(attn_output))

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_dim):
        super().__init__()
        self.attn = MultiheadSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(0.1)
        )
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, attention_mask):
        x = self.norm1(x + self.attn(x, attention_mask))
        x = self.norm2(x + self.ff(x))
        return x

class RoBERTa(nn.Module):
    def __init__(self, vocab_size, max_len, embed_dim=128, num_heads=4, hidden_dim=128, num_layers=4):
        super().__init__()
        self.embedding = RoBERTaEmbedding(vocab_size, embed_dim, max_len)
        self.layers = nn.ModuleList([TransformerBlock(embed_dim, num_heads, hidden_dim) for _ in range(num_layers)])
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        for layer in self.layers:
            x = layer(x, attention_mask)
        return self.lm_head(x)

In [8]:
#Training Setup

import torch
print("CUDA Available:", torch.cuda.is_available())
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")
print("Current Device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

vocab_size = 30522
max_len = 128

model = RoBERTa(vocab_size, max_len).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=500, num_training_steps=len(train_loader) * 4  # Assuming 4 epochs
)

criterion = nn.CrossEntropyLoss()

CUDA Available: True
Device Name: Tesla P100-PCIE-16GB
Current Device: 0


In [1]:
num_epochs = 4
grad_clip = 1.0  # Gradient clipping threshold

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        print("sdgver", fluch=True)
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask)
        
        loss = criterion(outputs.view(-1, vocab_size), labels.view(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)  # Prevent exploding gradients
        
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()

        # Print loss every 100 iterations
        if (i + 1) % 100 == 0:
            avg_loss = total_loss / (i + 1)
            print(f"Iteration {i+1}: Loss = {avg_loss:.4f}")

    print(f"Epoch {epoch+1} completed. Average Loss = {total_loss / len(train_loader):.4f}")


NameError: name 'model' is not defined