In [1]:
import torch
import transformers
import datasets
import sklearn

In [2]:
# Load GPT-2 tokenizer and model
tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2', device_map='auto')
# Add a padding token to GPT-2 tokenizer (since it doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Load a small subset of the IMDb dataset for binary sentiment classification
dataset = datasets.load_dataset('imdb', split='train[:25%]')
dataset = dataset.train_test_split(test_size=0.2)

In [6]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Apply the tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(['text'])
tokenized_datasets.set_format('torch')

# Split into train and evaluation datasets
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [7]:
# Define a custom model with GPT-2 as feature extractor and a linear classifier on top
class GPT2ForClassification(torch.nn.Module):
    def __init__(self, gpt2, num_labels):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = gpt2
        # Freeze GPT-2 parameters
        for param in self.gpt2.parameters():
            param.requires_grad = False
        # Linear classifier
        self.classifier = torch.nn.Linear(self.gpt2.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        # Get hidden states from GPT-2
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state of the last token for classification
        last_token_indices = attention_mask.sum(dim=1) - 1
        pooled_output = outputs.last_hidden_state[torch.arange(input_ids.size(0)), last_token_indices]
        logits = self.classifier(pooled_output)
        loss = None
        if labels is not None:
            # Compute loss
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return {'loss': loss, 'logits': logits}

In [8]:
# Load GPT-2
gpt2 = transformers.GPT2Model.from_pretrained('gpt2', device_map='auto', torch_dtype='auto')

# Initialize the model
num_labels = 2  # Binary classification
model = GPT2ForClassification(gpt2, num_labels)

In [9]:
# Define training arguments to train for only 1 epoch
training_args = transformers.TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

In [11]:
# Function to compute evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.from_numpy(logits), dim=-1).numpy()
    accuracy = sklearn.metrics.accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

In [12]:
# Initialize the Trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

  0%|          | 0/313 [00:00<?, ?it/s]

{'loss': 0.0763, 'grad_norm': 1.3162119388580322, 'learning_rate': 4.840255591054313e-05, 'epoch': 0.03}
{'loss': 0.1627, 'grad_norm': 14.422348976135254, 'learning_rate': 4.680511182108626e-05, 'epoch': 0.06}
{'loss': 0.0203, 'grad_norm': 2.5744476318359375, 'learning_rate': 4.520766773162939e-05, 'epoch': 0.1}
{'loss': 0.1102, 'grad_norm': 4.313470363616943, 'learning_rate': 4.361022364217253e-05, 'epoch': 0.13}
{'loss': 0.0078, 'grad_norm': 0.8346872925758362, 'learning_rate': 4.201277955271566e-05, 'epoch': 0.16}
{'loss': 0.0208, 'grad_norm': 3.7981419563293457, 'learning_rate': 4.041533546325879e-05, 'epoch': 0.19}
{'loss': 0.0202, 'grad_norm': 0.8315554857254028, 'learning_rate': 3.8817891373801916e-05, 'epoch': 0.22}
{'loss': 0.0181, 'grad_norm': 1.277801752090454, 'learning_rate': 3.722044728434505e-05, 'epoch': 0.26}
{'loss': 0.0055, 'grad_norm': 0.40111127495765686, 'learning_rate': 3.562300319488818e-05, 'epoch': 0.29}
{'loss': 0.0086, 'grad_norm': 0.8114984035491943, 'learn

  0%|          | 0/79 [00:00<?, ?it/s]

{'eval_loss': 0.1542995572090149, 'eval_accuracy': 0.9744, 'eval_runtime': 6.2, 'eval_samples_per_second': 201.614, 'eval_steps_per_second': 12.742, 'epoch': 1.0}
{'train_runtime': 31.5851, 'train_samples_per_second': 158.302, 'train_steps_per_second': 9.91, 'train_loss': 0.061585894265113926, 'epoch': 1.0}


TrainOutput(global_step=313, training_loss=0.061585894265113926, metrics={'train_runtime': 31.5851, 'train_samples_per_second': 158.302, 'train_steps_per_second': 9.91, 'total_flos': 0.0, 'train_loss': 0.061585894265113926, 'epoch': 1.0})

In [14]:
trainer.evaluate()

  0%|          | 0/79 [00:00<?, ?it/s]

{'eval_loss': 0.1542995572090149,
 'eval_accuracy': 0.9744,
 'eval_runtime': 6.1685,
 'eval_samples_per_second': 202.643,
 'eval_steps_per_second': 12.807,
 'epoch': 1.0}

In [None]:
model()