In [1]:
!pip install -q -U datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import torch
import multiprocessing
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from typing import Dict
import matplotlib.pyplot as plt

# Suppress warnings if necessary (you can remove this if you want to see all warnings).
import warnings
warnings.filterwarnings("ignore")

# Constants
DATASET_NAME = 'fka/awesome-chatgpt-prompts'  # Replace with the actual dataset name
TOKENIZER_NAME = 'gpt2'  # Replace with the actual tokenizer name
MAX_LENGTH = 128  # Every example is padded/truncated to this many tokens
BATCH_SIZE = 16
NUM_WORKERS = multiprocessing.cpu_count()

# Set device based on CUDA availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load dataset
dataset = load_dataset(DATASET_NAME)

# Print dataset information
print('Available splits:', list(dataset.keys()))
print('Dataset:', dataset)

# Define tokenizer with padding token
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# Fall back to the EOS token only when the tokenizer truly has no pad token.
# An explicit `is None` test is required: a truthiness test (`x or y`) would
# wrongly discard a valid pad token whose id happens to be 0.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def preprocess_function(examples: Dict[str, list]) -> Dict[str, torch.Tensor]:
    """Join the text columns of a batch row by row and tokenize the result.

    A column counts as text when its first entry is a ``str``; the values of
    all such columns are converted with ``str`` and joined with a single space
    to form one string per row. The strings are tokenized with the module-level
    ``tokenizer``, padded/truncated to ``MAX_LENGTH``.

    Args:
        examples: Mapping of column name to the list of values for the batch.

    Returns:
        A dict holding only the ``input_ids`` and ``attention_mask`` entries
        produced by the tokenizer.
    """
    first_column = next(iter(examples))
    row_count = len(examples[first_column])

    texts = []
    for row in range(row_count):
        pieces = []
        for col in examples:
            # The column type is judged from its first entry only.
            if isinstance(examples[col][0], str):
                pieces.append(str(examples[col][row]))
        texts.append(' '.join(pieces))

    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )
    return {key: encoded[key] for key in ('input_ids', 'attention_mask')}

# Preprocessing and tokenization
for split in dataset.keys():
    dataset[split] = dataset[split].map(
        preprocess_function,
        batched=True,
        num_proc=min(NUM_WORKERS, 4),
        remove_columns=dataset[split].column_names
    )

# Set format for PyTorch
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Prepare DataLoader
# Batches are collated on the CPU by the default collate function and moved to
# `device` inside the loops below: touching CUDA from DataLoader worker
# processes (as a `.to(device)` in collate_fn would) fails when num_workers > 0.
dataloaders = {split: DataLoader(
    dataset[split],
    batch_size=BATCH_SIZE,
    shuffle=(split == 'train'),
    num_workers=NUM_WORKERS
) for split in dataset.keys()}

# Display the first example of all splits
for split, loader in dataloaders.items():
    print(f"First example of the '{split}' split:")
    first_batch = next(iter(loader))
    print({key: val[0].tolist() for key, val in first_batch.items()})

# Initialize model
model = AutoModelForCausalLM.from_pretrained(TOKENIZER_NAME).to(device)
model.config.pad_token_id = tokenizer.pad_token_id

# Define loss and optimizer
LEARNING_RATE = 5e-5  # 1e-2 is far too large for fine-tuning a pretrained LM
NUM_EPOCHS = 10
criterion = torch.nn.CrossEntropyLoss()  # ignores targets equal to -100 by default
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


def causal_lm_loss(batch):
    """Return the next-token prediction loss for one collated batch.

    The targets are the input ids themselves (not the attention mask), with
    padded positions set to -100 so that `criterion` skips them. Logits at
    position t are scored against the token at position t + 1.
    """
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    logits = model(input_ids, attention_mask=attention_mask).logits
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    # reshape (not view): the shifted slices are not contiguous.
    return criterion(shift_logits.reshape(-1, shift_logits.size(-1)),
                     shift_labels.reshape(-1))


# Training loop
train_loss, valid_loss, test_loss = [], [], []
split = 'train'
train_loader = dataloaders[split]
for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        loss = causal_lm_loss(batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Average over the number of batches (the last enumerate index is one short).
    epoch_loss = running_loss / max(len(train_loader), 1)
    print(f'Epoch: {epoch+1}, Split: {split}, Loss: {epoch_loss}')
    train_loss.append(epoch_loss)

# Evaluation on test data (only when the dataset actually ships a test split)
model.eval()
if 'test' in dataloaders:
    with torch.no_grad():
        for batch in dataloaders['test']:
            test_loss.append(causal_lm_loss(batch).item())
else:
    print("No 'test' split available; skipping test evaluation.")

# Plot training, validation, and test losses
# NOTE: training loss is one point per epoch, test loss one point per batch.
plt.figure(figsize=(12, 6))
plt.plot(train_loss, label='Training loss')
plt.plot(valid_loss, label='Validation loss')
plt.plot(test_loss, label='Test loss')
plt.legend()
plt.show()

# Save the trained model
model.save_pretrained('./model_save_directory')
tokenizer.save_pretrained('./model_save_directory')

Available splits: ['train']
Dataset: DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 153
    })
})
First example of the 'train' split:
{'input_ids': [5377, 1930, 263, 314, 765, 345, 284, 719, 355, 257, 26777, 13, 314, 481, 2148, 262, 15844, 284, 257, 3496, 290, 345, 481, 2251, 2647, 329, 340, 13, 770, 714, 2291, 1262, 2972, 12834, 393, 4899, 11, 884, 355, 24983, 11341, 393, 6072, 489, 364, 11, 287, 1502, 284, 2251, 47077, 290, 4419, 17300, 326, 2222, 262, 15844, 284, 1204, 13, 2011, 717, 2581, 318, 366, 40, 423, 3194, 257, 21247, 3706, 564, 250, 31306, 282, 316, 37918, 37718, 320, 447, 251, 290, 761, 2647, 284, 467, 351, 340, 526, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
import os
import torch
import multiprocessing
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from typing import Dict
import torch.optim as optim
import matplotlib.pyplot as plt

# Suppress unnecessary warnings (optional)
import warnings
warnings.filterwarnings("ignore")

# Configuration
DATASET_NAME = 'fka/awesome-chatgpt-prompts'  # Dataset identifier passed to load_dataset
TOKENIZER_NAME = 'gpt2'  # Checkpoint name passed to from_pretrained
MAX_LENGTH = 128  # Token count each sequence is padded/truncated to
BATCH_SIZE = 16  # Examples per batch
NUM_WORKERS = multiprocessing.cpu_count()  # DataLoader worker count

# Pick the GPU when CUDA is usable, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fetch the dataset and the matching tokenizer
dataset = load_dataset(DATASET_NAME)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Reuse the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Collator for causal language modeling (mlm disabled), returning PyTorch tensors
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt",
)

# Tokenization and dataset preparation function
def tokenize_function(examples):
    """Join the string fields of each row and tokenize the resulting texts.

    Args:
        examples: Batch mapping of column name to the list of values for that
            column (all lists are walked in parallel, one row at a time).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the joined texts,
        padded/truncated to ``MAX_LENGTH``.
    """
    # zip(*columns) yields one tuple of values per row. The tuple must be
    # iterated directly -- indexing it with column names raises TypeError.
    columns = list(examples.values())
    texts = [
        ' '.join(value for value in row if isinstance(value, str))
        for row in zip(*columns)
    ]
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH
    )

# Apply tokenization to all splits
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset['train'].column_names
)

# DataLoaders (one per split; only the training split is shuffled)
dataloaders: Dict[str, DataLoader] = {
    split: DataLoader(
        tokenized_datasets[split],
        shuffle=(split == 'train'),
        collate_fn=data_collator,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS
    )
    for split in dataset.keys()
}

# Model definition
model = AutoModelForCausalLM.from_pretrained(TOKENIZER_NAME).to(device)
# Keep the embedding matrix in sync with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Training setup
# A learning rate of 1e-2 is far too large for fine-tuning a pretrained
# language model with AdamW; 5e-5 is a conventional starting point.
LEARNING_RATE = 5e-5
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
def train_model(dataloaders, model, optimizer, criterion, epochs=10):
    """Train on the 'train' split and evaluate on every other split, per epoch.

    For each batch the model's logits at position t are scored against the
    label at position t + 1 (next-token prediction) using ``criterion``.
    Label positions that ``criterion`` is configured to ignore (-100 for the
    default ``CrossEntropyLoss``) do not contribute to the loss.

    Args:
        dataloaders: Mapping of split name to an iterable of batches. Each
            batch must provide ``'input_ids'`` and ``'labels'`` tensors and
            may provide ``'attention_mask'``.
        model: Callable module returning an object with a ``logits`` attribute.
        optimizer: Optimizer stepped after every training batch.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        epochs: Number of passes over all splits.

    Returns:
        ``(train_loss, valid_loss, test_loss)``: per-epoch average losses for
        the 'train' split, the 'validation' split, and any other split.
        A split that yields no batches records ``nan`` for that epoch.
    """
    train_loss, valid_loss, test_loss = [], [], []

    for epoch in range(epochs):
        for split, dataloader in dataloaders.items():
            is_training = split == 'train'
            if is_training:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            num_batches = 0
            # Autograd is only needed while training; skipping it during
            # evaluation saves memory and time.
            with torch.set_grad_enabled(is_training):
                for data in dataloader:
                    inputs = data['input_ids'].to(device)
                    labels = data['labels'].to(device)
                    attention_mask = (
                        data['attention_mask'].to(device)
                        if 'attention_mask' in data else None
                    )

                    outputs = model(inputs, attention_mask=attention_mask)
                    # Shift so that position t predicts token t + 1.
                    shift_logits = outputs.logits[:, :-1, :]
                    shift_labels = labels[:, 1:]
                    # reshape (not view): the shifted slices are not contiguous.
                    loss = criterion(
                        shift_logits.reshape(-1, shift_logits.size(-1)),
                        shift_labels.reshape(-1),
                    )

                    if is_training:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    running_loss += loss.item()
                    num_batches += 1

            # Count batches ourselves so loaders without __len__ and empty
            # loaders are both handled.
            average_loss = running_loss / num_batches if num_batches else float('nan')
            print(f'Epoch:{epoch+1}, Split:{split}, Loss:{average_loss}')
            if is_training:
                train_loss.append(average_loss)
            elif split == 'validation':
                valid_loss.append(average_loss)
            else:
                test_loss.append(average_loss)

    return train_loss, valid_loss, test_loss

# Train/evaluate and keep the loss history returned for each split group
train_loss, valid_loss, test_loss = train_model(dataloaders, model, optimizer, criterion)

# Draw one curve per loss history on a single figure
plt.figure(figsize=(12, 6))
for history, curve_label in (
    (train_loss, 'Training loss'),
    (valid_loss, 'Validation loss'),
    (test_loss, 'Test loss'),
):
    plt.plot(history, label=curve_label)
plt.legend()
plt.show()

# Write the model and the tokenizer to their respective output paths
model.save_pretrained('path_to_save_model')
tokenizer.save_pretrained('path_to_save_tokenizer')