In [None]:
%pip install transformers torch datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from datasets import load_dataset

class CustomDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return text, input_ids, attention_mask

In [None]:
def compute_gradient_norm(model, input_ids, attention_mask):
    model.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
    loss = outputs.loss
    loss.backward()
    total_norm = 0.0
    for param in model.parameters():
        if param.grad is not None:
            param_norm = param.grad.data.norm(2)
            total_norm += param_norm.item() ** 2
    total_norm = total_norm ** 0.5
    return total_norm

def precompute_gradient_norms(dataloader, model):
    gradient_norms = []
    total_samples = len(dataloader)
    for i, (text, input_ids, attention_mask) in enumerate(dataloader):
        if input_ids.size(1) == 0:  # Skip empty input_ids
            continue

        input_ids = input_ids.cuda()
        attention_mask = attention_mask.cuda()

        grad_norm = compute_gradient_norm(model, input_ids, attention_mask)
        gradient_norms.append((grad_norm, text, input_ids, attention_mask))

        if i % 100 == 0:
            percent_complete = (i / total_samples) * 100
            print(f"Progress: {percent_complete:.2f}%")
    return gradient_norms

def train_on_selected_samples(selected_samples, model, optimizer):
    batch_size = 8
    for i in range(0, len(selected_samples), batch_size):
        batch_samples = selected_samples[i:i + batch_size]
        batch_input_ids = torch.cat([x[2] for x in batch_samples], dim=0)
        batch_attention_mask = torch.cat([x[3] for x in batch_samples], dim=0)

        # Forward pass and optimization
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f'Batch {i // batch_size}, Loss: {loss.item()}')

def train():
    # Load the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.train()
    model.cuda()  # Move model to GPU if available

    # Load the dataset
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
    train_texts = dataset['train']['text']

    # Prepare the custom dataset and dataloader
    train_dataset = CustomDataset(train_texts, tokenizer)
    dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Precompute gradient norms for all samples
    gradient_norms = precompute_gradient_norms(dataloader, model)

    # Sort samples by gradient norm and select the top-k samples
    top_k = 1000  # Select top-k samples with the highest gradient norms
    gradient_norms.sort(reverse=True, key=lambda x: x[0])
    selected_samples = gradient_norms[:top_k]

    # Train on selected samples
    train_on_selected_samples(selected_samples, model, optimizer)

    return selected_samples

In [None]:
def run():
    selected_samples = train()

    # Analyze selected samples
    print("Analyzing selected samples based on gradient norm...")

    data = [(grad_norm, text) for grad_norm, text, _, _ in selected_samples]
    df = pd.DataFrame(data, columns=['Grad Norm', 'Text'])
    print(df)

    # Save DataFrame to a CSV file
    df.to_csv('selected_samples.csv', index=False)

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

In [None]:
run()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]



Progress: 0.27%
Progress: 0.54%
Progress: 0.82%
Progress: 1.09%
Progress: 1.91%
Progress: 2.45%
Progress: 3.27%
Progress: 4.09%
Progress: 4.36%
Progress: 4.63%
Progress: 5.17%
Progress: 5.45%
Progress: 5.99%
Progress: 6.26%
Progress: 6.54%
Progress: 6.81%
Progress: 7.08%
Progress: 7.35%
Progress: 7.63%
Progress: 8.17%
Progress: 8.44%
Progress: 8.72%
Progress: 9.26%
Progress: 9.53%
Progress: 10.08%
Progress: 10.35%
Progress: 11.71%
Progress: 11.98%
Progress: 12.26%
Progress: 12.53%
Progress: 12.80%
Progress: 13.07%
Progress: 13.34%
Progress: 13.62%
Progress: 13.89%
Progress: 14.16%
Progress: 14.43%
Progress: 14.98%
Progress: 15.52%
Progress: 15.80%
Progress: 16.07%
Progress: 16.34%
Progress: 17.16%
Progress: 17.43%
Progress: 17.70%
Progress: 19.34%
Progress: 19.61%
Progress: 20.15%
Progress: 20.43%
Progress: 20.70%
Progress: 21.24%
Progress: 21.52%
Progress: 21.79%
Progress: 22.06%
Progress: 22.33%
Progress: 22.88%
Progress: 23.42%
Progress: 23.69%
Progress: 23.97%
Progress: 24.51%
Prog

NameError: name 'pd' is not defined

In [None]:
def main():
    selected_samples = train()

    # Analyze selected samples
    print("Analyzing selected samples based on gradient norm...")
    grad_norms = []
    texts = []
    for grad_norm, text, _, _ in selected_samples:
        grad_norms.append(grad_norm)
        texts.append(text)
        print(f"Grad Norm: {grad_norm}, Text: {text}")

In [None]:
main()

tensor([], device='cuda:0', size=(1, 0))




RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.train()
model.cuda()  # Move model to GPU if available

# Load the dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
train_texts = dataset['train']['text']

# Prepare the custom dataset and dataloader
train_dataset = CustomDataset(train_texts, tokenizer)

dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 1
batch_size = 8
all_selected_samples = []
for epoch in range(num_epochs):
    selected_samples = []
    for i, (text, input_ids, attention_mask) in enumerate(dataloader):
      input_ids = input_ids.cuda()
      print(input_ids.shape)



torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 0])
torch.Size([1, 512])
torch.Size([1, 512])
torch.Size([1, 0])
torch.Size([1, 512])
t

KeyboardInterrupt: 