**Chen Zhao**

**1008698358**

**czchen.zhao@mail.utoronto.ca**

In [6]:
# pip install transformers datasets evaluate

**Import packages**

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import datasets
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm.auto import tqdm

**User Cuda**

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

**Process and Split data**

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForSequenceClassification.from_pretrained("distilgpt2", num_labels=2)
model.to(device)

# set paddings
model.config.pad_token_id = model.config.eos_token_id

if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token


training_args = TrainingArguments(output_dir="test_trainer", overwrite_output_dir=True, per_device_train_batch_size=64, evaluation_strategy="epoch")


Some weights of the model checkpoint at distilgpt2 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using pad_token, but it is not set yet.


In [4]:
# use the same data from part 3
dataset = datasets.load_dataset('glue', 'sst2')

def tokenize_function(examples):
  return tokenizer(examples["sentence"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'attention_mask', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_train_dataset = tokenized_datasets["train"].select(range(960))
small_eval_dataset = tokenized_datasets["train"].select(range(960, 1200))

Downloading readme:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Found cached dataset glue (C:/Users/czhao/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [6]:
# set metric to accuracy
metric = evaluate.load("accuracy")

# def compute_metrics(eval_pred):
#   logits, labels = eval_pred
#   predictions = np.argmax(logits, axis=-1)
#   return metric.compute(predictions=predictions, references=labels)

**Set Torch Training Requirements**

In [7]:
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [9]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/360 [00:00<?, ?it/s]

**Get Accuracy On Validation Data**

In [10]:
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.825}

In [11]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_eval_dataset,
#     compute_metrics=compute_metrics,
# )

In [12]:
# trainer.train()
