In [1]:
import torch
import transformers
import datasets
import sklearn

In [2]:
# Load the GPT-2 tokenizer. No device placement argument is passed here:
# `device_map` is a model-loading option and has no meaning for a tokenizer.
tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token; reuse the end-of-sequence token so that
# batches can be padded to a common length.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
# Load the first half of the IMDb training split. Only the review text matters
# downstream: the label column is overwritten with a keyword label later on.
dataset = datasets.load_dataset('imdb', split='train[:50%]')
# Hold out 20% for evaluation. The seed is fixed so that the split (and every
# example index printed further down) is reproducible across kernel restarts.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [4]:
# Relabel an example according to whether its text mentions 'alien'
def set_word_label(example):
    """Overwrite ``example['label']`` with a keyword-presence flag (1 or 0).

    The check is a case-insensitive substring test on ``example['text']``, so
    longer words that contain 'alien' (e.g. 'alienate') also count as a match.
    The example is modified in place and returned.
    """
    has_keyword = 'alien' in example['text'].lower()
    example['label'] = int(has_keyword)
    return example

# Apply the relabeling to every example, replacing the values in the 'label' column
dataset = dataset.map(set_word_label)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [5]:
# Batch tokenizer used with `dataset.map(..., batched=True)`
def tokenize_function(examples):
    """Tokenize the texts in ``examples['text']`` into fixed-length inputs.

    Each text is truncated to at most 128 tokens and padded up to exactly 128,
    so every example ends up with the same sequence length.

    NOTE(review): the 'alien' label is derived from the full, untruncated
    text, so an occurrence past the 128-token cutoff is invisible to the
    model even though the label is 1 - confirm this is intended.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize in batches, then drop the raw text column, which is no longer
# needed once the token ids exist
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(['text'])

# Make indexing return PyTorch tensors (this changes the format in place)
tokenized_datasets.set_format('torch')

# Name the two splits for training and evaluation
train_dataset, eval_dataset = tokenized_datasets['train'], tokenized_datasets['test']

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [6]:
# Define a custom model with GPT-2 as feature extractor and a linear classifier on top
class GPT2ForClassification(torch.nn.Module):
    """Linear classification head on top of a frozen GPT-2 backbone.

    Args:
        gpt2: Backbone module. It must expose ``config.hidden_size`` and, when
            called with ``input_ids`` and ``attention_mask``, return an object
            with a ``last_hidden_state`` attribute.
        num_labels: Number of output classes of the linear head.

    NOTE(review): only the parameters are frozen here; the backbone's
    train/eval mode is not touched. If the training loop puts the whole module
    in train mode, dropout inside the backbone is presumably active as well -
    consider keeping ``self.gpt2`` in eval mode if deterministic features are
    wanted.
    """

    def __init__(self, gpt2, num_labels):
        super().__init__()
        self.gpt2 = gpt2
        # Freeze GPT-2 parameters so that only the classifier is trainable
        for param in self.gpt2.parameters():
            param.requires_grad = False
        # Linear classifier
        self.classifier = torch.nn.Linear(self.gpt2.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify each sequence from the hidden state of its last real token.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Same shape as ``input_ids``; non-zero entries mark
                real tokens, zeros mark padding.
            labels: Optional class ids, one per sequence.

        Returns:
            dict: ``{'loss': ..., 'logits': ...}``. ``loss`` is the
            cross-entropy loss when ``labels`` is given, otherwise ``None``.
        """
        # Get hidden states from GPT-2
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # Find the position of the last attended token in every row. Weighting
        # the mask by 1-based positions and taking the argmax works for right
        # AND left padding (a plain `mask.sum() - 1` is only right for right
        # padding) and yields position 0 instead of wrapping to -1 when a row
        # is entirely padding.
        attended = attention_mask.ne(0).long()
        positions = torch.arange(1, attended.size(1) + 1, device=attended.device)
        last_token_indices = (attended * positions).argmax(dim=1)

        # Build the row index on the same device as the hidden states so the
        # gather does not depend on where the inputs happen to live.
        batch_indices = torch.arange(hidden_states.size(0), device=hidden_states.device)
        pooled_output = hidden_states[batch_indices, last_token_indices.to(hidden_states.device)]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Compute loss
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))
        return {'loss': loss, 'logits': logits}

In [7]:
# Binary classification: two output classes
num_labels = 2

# Pretrained GPT-2 backbone; dtype and device placement are left on 'auto'
gpt2 = transformers.GPT2Model.from_pretrained('gpt2', torch_dtype='auto', device_map='auto')

# Wrap the backbone with the linear classification head
model = GPT2ForClassification(gpt2=gpt2, num_labels=num_labels)

In [8]:
# Training configuration: a single pass over the training data
training_args = transformers.TrainingArguments(
    # Schedule: one epoch, evaluating at the end of each epoch
    num_train_epochs=1,
    eval_strategy="epoch",
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Output and logging: a log entry every 10 steps
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
)

In [9]:
# Function to compute evaluation metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy from evaluation outputs.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one row of
            class scores per example; ``labels`` holds the true class ids.

    Returns:
        dict: ``{'accuracy': float}`` - the fraction of examples whose
        highest-scoring class equals the label.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    # Local import: the notebook's import cell does not load numpy. Computing
    # accuracy directly also avoids relying on `sklearn.metrics` being loaded
    # as a side effect (plain `import sklearn` does not import the submodule)
    # and removes a numpy -> torch -> numpy round trip.
    import numpy as np

    logits, labels = eval_pred
    predictions = np.asarray(logits).argmax(axis=-1)
    labels = np.asarray(labels)
    if predictions.shape != labels.shape:
        raise ValueError(
            f"Found {predictions.shape} predictions but {labels.shape} labels"
        )
    accuracy = float((predictions == labels).mean())
    return {'accuracy': accuracy}

In [10]:
# Wire the model, training configuration, data splits and metric function
# into a Trainer
trainer = transformers.Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [11]:
# Run training; the backbone parameters are frozen, so only the linear classifier is updated
trainer.train()

  0%|          | 0/625 [00:00<?, ?it/s]

{'loss': 4.7486, 'grad_norm': 359.2467956542969, 'learning_rate': 4.92e-05, 'epoch': 0.02}
{'loss': 3.9351, 'grad_norm': 326.4512939453125, 'learning_rate': 4.8400000000000004e-05, 'epoch': 0.03}
{'loss': 3.4848, 'grad_norm': 370.4857482910156, 'learning_rate': 4.76e-05, 'epoch': 0.05}
{'loss': 3.1725, 'grad_norm': 352.8597717285156, 'learning_rate': 4.6800000000000006e-05, 'epoch': 0.06}
{'loss': 2.5359, 'grad_norm': 293.3836669921875, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.08}
{'loss': 2.2321, 'grad_norm': 272.8007507324219, 'learning_rate': 4.52e-05, 'epoch': 0.1}
{'loss': 1.7606, 'grad_norm': 291.4390869140625, 'learning_rate': 4.44e-05, 'epoch': 0.11}
{'loss': 1.6087, 'grad_norm': 269.3735046386719, 'learning_rate': 4.36e-05, 'epoch': 0.13}
{'loss': 1.2773, 'grad_norm': 222.04600524902344, 'learning_rate': 4.2800000000000004e-05, 'epoch': 0.14}
{'loss': 0.9613, 'grad_norm': 178.4942626953125, 'learning_rate': 4.2e-05, 'epoch': 0.16}
{'loss': 0.7262, 'grad_norm': 165.37

  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.1842571198940277, 'eval_accuracy': 0.9596, 'eval_runtime': 12.3402, 'eval_samples_per_second': 202.591, 'eval_steps_per_second': 12.723, 'epoch': 1.0}
{'train_runtime': 62.4355, 'train_samples_per_second': 160.165, 'train_steps_per_second': 10.01, 'train_loss': 0.5643834292411805, 'epoch': 1.0}


TrainOutput(global_step=625, training_loss=0.5643834292411805, metrics={'train_runtime': 62.4355, 'train_samples_per_second': 160.165, 'train_steps_per_second': 10.01, 'total_flos': 0.0, 'train_loss': 0.5643834292411805, 'epoch': 1.0})

In [12]:
# Evaluate on the held-out split and display the metrics dict (loss, accuracy, timing)
trainer.evaluate()

  0%|          | 0/157 [00:00<?, ?it/s]

{'eval_loss': 0.1842571198940277,
 'eval_accuracy': 0.9596,
 'eval_runtime': 12.2539,
 'eval_samples_per_second': 204.016,
 'eval_steps_per_second': 12.812,
 'epoch': 1.0}

In [13]:
# Run the trained model over the evaluation split, then convert the raw
# logits into per-class probabilities
predictions_output = trainer.predict(eval_dataset)
eval_logits = torch.tensor(predictions_output.predictions)
probabilities = torch.softmax(eval_logits, dim=-1)

  0%|          | 0/157 [00:00<?, ?it/s]

In [33]:
# Indices of evaluation examples whose highest-scoring class is not class 0 (i.e. predicted positives)
predictions_output.predictions.argmax(axis=-1).nonzero()

(array([  37,   70,  112,  285,  297,  339,  346,  367,  381,  488,  498,
         587,  599,  820,  931, 1036, 1065, 1149, 1179, 1296, 1383, 1494,
        1549, 1558, 1589, 1744, 1805, 1867, 1914, 1922, 1939, 1970, 1980,
        2047, 2074, 2111, 2150, 2227, 2228, 2245, 2476]),)

In [35]:
# Indices of evaluation examples whose true label is non-zero (text contains 'alien')
predictions_output.label_ids.nonzero()

(array([   5,    9,  172,  187,  194,  235,  243,  260,  275,  379,  412,
         424,  444,  445,  473,  485,  507,  527,  539,  705,  711,  802,
         850,  905,  913,  942, 1047, 1048, 1078, 1102, 1104, 1265, 1353,
        1401, 1407, 1443, 1507, 1547, 1679, 1874, 1910, 1924, 1928, 1941,
        1988, 1998, 2002, 2023, 2046, 2059, 2080, 2094, 2151, 2187, 2189,
        2203, 2377, 2380, 2411, 2447]),)

In [22]:
# Number of positive examples in the test split (labels are 0/1, so the sum is the count)
torch.tensor(dataset['test']['label']).sum()

tensor(60)

In [16]:
# Show the ten evaluation examples with the highest predicted probability of
# class 1, most confident first
positive_probs = probabilities[:, 1]
ascending_order = positive_probs.sort().indices
top_idxs = ascending_order.flip(0)[:10]
separator = "-" * 80
for i in top_idxs:
    row = i.item()
    print(f"Predicted probability: {positive_probs[i]}")
    print(f"True label: {predictions_output.label_ids[i]}")
    print(f"Text: {dataset['test'][row]['text']}")
    print(separator)

Predicted probability: 0.9996633529663086
True label: 0
Text: I loved "Anchorman; The Legend of Ron Burgundy" and hoped this would be just as funny, but alas, it wasn't. Some bits are excellent though. I thought the sports guy, Champ Kind, professing his love for Ron Burgundy in the car filled with the other members of the news team was hilarious. Everyone is ignoring him and he just gets louder and louder and finally kisses Burgundy which doesn't get acknowledged either. But on the whole the story doesn't gel. It's a noble attempt, however, to salvage the unused bits from the first movie, including an entire plot about some pretty benign would-be domestic terrorists called "The Alarm Clock." Maya Rudolph of Saturday Night Live is one of the members and has a couple of funny lines, but basically this unused plot line has good reason to be unused in the first movie. The extras on this disk are pretty good, with the best two being the filmed rehearsals featuring lots of improv comedy, an