In [1]:
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import csv
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

topics = []
contents = []
labels = []

with open('/content/drive/MyDrive/ECE1786/project/IEL.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile)
    for i, row in enumerate(spamreader):
        if i != 0:
          topics.append(row[0])
          contents.append(row[1])
          labels.append(row[2])


print(len(topics))
print(len(contents))
print(len(labels))

4004
4004
4004


In [3]:
classes = {
    '<4': 0,
    '4.0': 1,
    '4.5': 2,
    '5.0': 3,
    '5.5': 4,
    '6.0': 5,
    '6.5': 6,
    '7.0': 7,
    '7.5': 8,
    '8.0': 9,
    '8.5': 10,
    '9.0': 11
}

labels2scores = {
    '0': 0,
    '1': 0.4,
    '2': 0.45,
    '3': 0.5,
    '4': 0.55,
    '5': 0.6,
    '6': 0.65,
    '7': 0.7,
    '8': 0.75,
    '9': 0.8,
    '10': 0.85,
    '11': 0.9
}

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

tokenizer.pad_token = tokenizer.eos_token
class GPT2Dataset(Dataset):

  def __init__(self, topics, contents, labels, tokenizer, max_length=512):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    self.labels = []
    self.last_idx = []
    self.targets = []

    for i in range(len(topics)):
      input = topics[i].strip() + " " + contents[i].strip()
      encodings_dict = self.tokenizer(input, truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      if labels[i] != "<4":
        label = classes[str(float(labels[i]))]
        self.labels.append(label)
        self.targets.append(labels2scores[str(label)])
      else:
        label = classes[labels[i]]
        self.labels.append(label)
        self.targets.append(labels2scores[str(label)])
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

      eot_idx = len(encodings_dict['input_ids']) - 1
      for j, id in enumerate(encodings_dict['input_ids']):
        if id == self.tokenizer.encode("<|endoftext|>")[0]:
          eot_idx = j - 1
          break
      self.last_idx.append(j-1)
      
    print(len(self.last_idx))
    print(len(self.labels))
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):

    return self.input_ids[idx], self.attn_masks[idx], self.labels[idx], self.targets[idx], self.last_idx[idx]
batch_size = 2
dataset = GPT2Dataset(topics, contents, labels, tokenizer)

4004
4004


In [4]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

torch.manual_seed(0)

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

## make sure 
print(torch.sum(train_dataset[0][0]))

print(train_dataset[0])

tensor(5779971)
(tensor([ 4366,   661,  1975,   326,   645,   530,   815,   307,  3142,   284,
         2555,  1762,   706,   262,  2479,   286,  6135,    13,   220,   198,
         4864,    11,  1854,   910,   612,  6584,   470,   307,   257, 17385,
          319,  2479,   290,  2687,   815,   307,  3142,   284,   670,  7692,
          286,   511,  2479,    13,   220,   198, 48873,  1111,  5009,    11,
         1577,   534,   898,  4459,   290,  2291,  5981,  6096,    13,  4380,
          743,  2328,   326,  2479,   318,   257,  1994,  3210,   284, 22232,
         1762, 40460,    13,   220,  6430,   422,   616,  6650,    11,   314,
         1975,   326,  2479,   815,   407,   307,   257, 17385,   329,   530,
          284,  1620,    13,   201,   198,   201,   198,  2949,  2300,   703,
         1468,   262,   661,   389,    11,   484,   991,   460,  8676,  2405,
           13,  7129,   815,   407,   307,   262,  2318,   284, 16222,  1586,
          262, 13830,  1762, 16826,    13,  111

In [5]:
class GPT2_with_regressor(torch.nn.Module):
  def __init__(self, embd_size):
    super(GPT2_with_regressor, self).__init__()
    self.encoder = GPT2Model.from_pretrained("gpt2-medium")
    self.regressor = torch.nn.Linear(embd_size, 1)

  def forward(self, input, last_idxes):
    outputs = self.encoder(**input).last_hidden_state
    hidden_states = []
    for i in range(len(last_idxes)):
      hidden_states.append(outputs[i, last_idxes[i], :][None, :])
    hidden_states = torch.cat(hidden_states, dim=0)
    outputs = self.regressor(hidden_states)
    return outputs


In [6]:
from transformers import GPT2Model
torch.manual_seed(0)

import torch
model = GPT2_with_regressor(1024)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# regressor.to(device)

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)

from transformers import get_scheduler

num_epochs = 13
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
print(model)

In [7]:
scores = torch.tensor([0, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9])
scores = scores.to(device)

In [8]:
torch.manual_seed(0)
from tqdm.auto import tqdm
import evaluate
progress_bar = tqdm(range(num_training_steps))

model.encoder.config.pad_token_id = tokenizer.pad_token_id

mse = torch.nn.MSELoss()
train_accs = []
val_accs = []
for epoch in range(num_epochs):
    model.train()
    metric = evaluate.load("accuracy")
    for input, attn_masks, labels, targets, last_idxes in train_dataloader:
        x = {
             "input_ids": input,
             "attention_mask": attn_masks,
        }
        
        batch = {k: v.to(device) for k, v in x.items()}
        outputs = model(batch, last_idxes)
        outputs = torch.special.expit(outputs)
        predictions = []
        for i in range(len(outputs)):
          pred_idx = torch.argmin(abs(scores - outputs[i]))
          predictions.append(pred_idx)
        predictions = torch.tensor(predictions)
        metric.add_batch(predictions=predictions, references=labels)
        
        # print(outputs, predictions, labels, targets)

        targets = targets.to(device)
        loss = mse(outputs.float(), targets.float())
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        # break
    # break
    train_acc = metric.compute()['accuracy']
    print(train_acc)
    # break
    train_accs.append(train_acc)
    
    print("Epoch: {}, Train acc: {}".format(epoch + 1, train_acc))
    torch.save(model.state_dict(), "/content/drive/MyDrive/ECE1786/project/gpt2_regression_{}.pt".format(epoch))

  0%|          | 0/20826 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.09678426475179519
Epoch: 1, Train acc: 0.09678426475179519


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.12987823915079613
Epoch: 2, Train acc: 0.12987823915079613


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.12519512956603185
Epoch: 3, Train acc: 0.12519512956603185


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.13612238526381518
Epoch: 4, Train acc: 0.13612238526381518


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.14985950671245707
Epoch: 5, Train acc: 0.14985950671245707


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.14392756790508898
Epoch: 6, Train acc: 0.14392756790508898


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.14455198251639087
Epoch: 7, Train acc: 0.14455198251639087


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.14517639712769279
Epoch: 8, Train acc: 0.14517639712769279


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.14861067748985327
Epoch: 9, Train acc: 0.14861067748985327


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


0.14798626287855135
Epoch: 10, Train acc: 0.14798626287855135


  return F.mse_loss(input, target, reduction=self.reduction)


KeyboardInterrupt: ignored