In [10]:
!pip install transformers
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
import csv
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

topics = []
contents = []
labels = []

with open('/content/drive/MyDrive/ECE1786/project/IEL.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile)
    for i, row in enumerate(spamreader):
        if i != 0:
          topics.append(row[0])
          contents.append(row[1])
          labels.append(row[2])


print(len(topics))
print(len(contents))
print(len(labels))

4004
4004
4004


In [14]:
classes = {
    '<4': 0,
    '4.0': 1,
    '4.5': 2,
    '5.0': 3,
    '5.5': 4,
    '6.0': 5,
    '6.5': 6,
    '7.0': 7,
    '7.5': 8,
    '8.0': 9,
    '8.5': 10,
    '9.0': 11
}

labels2scores = {
    '0': 3.5,
    '1': 4,
    '2': 4.5,
    '3': 5,
    '4': 5.5,
    '5': 6,
    '6': 6.5,
    '7': 7,
    '8': 7.5,
    '9': 8,
    '10': 8.5,
    '11': 9
}

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

tokenizer.pad_token = tokenizer.eos_token
class GPT2Dataset(Dataset):

  def __init__(self, topics, contents, labels, tokenizer, max_length=512):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    self.labels = []
    self.last_idx = []
    self.targets = []

    for i in range(len(topics)):
      input = topics[i].strip() + " " + contents[i].strip()
      encodings_dict = self.tokenizer(input, truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      if labels[i] != "<4":
        label = classes[str(float(labels[i]))]
        self.labels.append(label)
        self.targets.append(labels2scores[str(label)])
      else:
        label = classes[labels[i]]
        self.labels.append(label)
        self.targets.append(labels2scores[str(label)])
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

      eot_idx = len(encodings_dict['input_ids']) - 1
      for j, id in enumerate(encodings_dict['input_ids']):
        if id == self.tokenizer.encode("<|endoftext|>")[0]:
          eot_idx = j - 1
          break
      self.last_idx.append(j-1)
      
    print(len(self.last_idx))
    print(len(self.labels))
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):

    return self.input_ids[idx], self.attn_masks[idx], self.labels[idx], self.targets[idx], self.last_idx[idx]
batch_size = 2
dataset = GPT2Dataset(topics, contents, labels, tokenizer)

4004
4004


In [15]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

torch.manual_seed(0)

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

## make sure 
print(torch.sum(train_dataset[0][0]))

print(train_dataset[0])

tensor(5779971)
(tensor([ 4366,   661,  1975,   326,   645,   530,   815,   307,  3142,   284,
         2555,  1762,   706,   262,  2479,   286,  6135,    13,   220,   198,
         4864,    11,  1854,   910,   612,  6584,   470,   307,   257, 17385,
          319,  2479,   290,  2687,   815,   307,  3142,   284,   670,  7692,
          286,   511,  2479,    13,   220,   198, 48873,  1111,  5009,    11,
         1577,   534,   898,  4459,   290,  2291,  5981,  6096,    13,  4380,
          743,  2328,   326,  2479,   318,   257,  1994,  3210,   284, 22232,
         1762, 40460,    13,   220,  6430,   422,   616,  6650,    11,   314,
         1975,   326,  2479,   815,   407,   307,   257, 17385,   329,   530,
          284,  1620,    13,   201,   198,   201,   198,  2949,  2300,   703,
         1468,   262,   661,   389,    11,   484,   991,   460,  8676,  2405,
           13,  7129,   815,   407,   307,   262,  2318,   284, 16222,  1586,
          262, 13830,  1762, 16826,    13,  111

In [16]:
class GPT2_with_regressor(torch.nn.Module):
  def __init__(self, embd_size):
    super(GPT2_with_regressor, self).__init__()
    self.encoder = GPT2Model.from_pretrained("gpt2-medium")
    self.regressor = torch.nn.Linear(embd_size, 1)

  def forward(self, input, last_idxes):
    outputs = self.encoder(**input).last_hidden_state
    hidden_states = []
    for i in range(len(last_idxes)):
      hidden_states.append(outputs[i, last_idxes[i], :][None, :])
    hidden_states = torch.cat(hidden_states, dim=0)
    outputs = self.regressor(hidden_states)
    return outputs


In [17]:
from transformers import GPT2Model
torch.manual_seed(0)

import torch
model = GPT2_with_regressor(1024)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# regressor.to(device)

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)

from transformers import get_scheduler

num_epochs = 13
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
print(model)

In [18]:
scores = torch.tensor([3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9])
scores = scores.to(device)

In [19]:
def custom_activation(inputs):
  inputs[inputs<3.5] = 3.5
  inputs[inputs>9] = 9
  return inputs

In [9]:
torch.manual_seed(0)
from tqdm.auto import tqdm
import evaluate
progress_bar = tqdm(range(num_training_steps))

model.encoder.config.pad_token_id = tokenizer.pad_token_id

mse = torch.nn.MSELoss()
val_distance = []
for epoch in range(num_epochs):
    model.train()
    
    total = 0
    distance = 0
    # metric = evaluate.load("accuracy")
    for input, attn_masks, labels, targets, last_idxes in train_dataloader:
        x = {
             "input_ids": input,
             "attention_mask": attn_masks,
        }
        
        batch = {k: v.to(device) for k, v in x.items()}
        outputs = model(batch, last_idxes)
        # outputs = torch.special.expit(outputs).squeeze()
        outputs = custom_activation(outputs).squeeze()
        predictions = []
        if len(outputs.size()) == 0:
          pred_idx = torch.argmin(abs(scores - outputs))
          predictions.append(pred_idx)
        else:
          for i in range(outputs.size(0)):
            pred_idx = torch.argmin(abs(scores - outputs[i]))
            predictions.append(pred_idx)
          
        predictions = torch.tensor(predictions)

        for j in range(len(predictions)):
          distance += abs(scores[predictions[j]] - scores[labels[j]])
        total += len(predictions)


        targets = targets.to(device)
        loss = mse(outputs.float(), targets.float())
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    distance = distance / total
    print(distance)
    val_distance.append(distance)
    # train_acc = metric.compute()['accuracy']
    # print(train_acc)
    # break
    # train_accs.append(train_acc)
    
    # print("Epoch: {}, Train acc: {}".format(epoch + 1, train_acc))
    torch.save(model.state_dict(), "/content/drive/MyDrive/ECE1786/project/gpt2_regression_{}.pt".format(epoch))

  0%|          | 0/20826 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


tensor(1.4451, device='cuda:0')
tensor(0.8870, device='cuda:0')
tensor(0.7053, device='cuda:0')
tensor(0.5537, device='cuda:0')
tensor(0.4555, device='cuda:0')
tensor(0.3881, device='cuda:0')
tensor(0.3333, device='cuda:0')
tensor(0.2833, device='cuda:0')
tensor(0.2573, device='cuda:0')
tensor(0.2370, device='cuda:0')
tensor(0.2134, device='cuda:0')
tensor(0.1919, device='cuda:0')
tensor(0.1817, device='cuda:0')


In [13]:
len(torch.tensor(3).size())

0

In [20]:
import evaluate
val_distance = []
for i in range(13):
  model.load_state_dict(torch.load("/content/drive/MyDrive/ECE1786/project/gpt2_regression_{}.pt".format(i)))

  model.eval()
  metric = evaluate.load("accuracy")
  model.encoder.config.pad_token_id = tokenizer.pad_token_id

  total = 0
  distance = 0
  for input, attn_masks, labels, targets, last_idxes in validation_dataloader:
      x = {
             "input_ids": input,
             "attention_mask": attn_masks,
      }
      batch = {k: v.to(device) for k, v in x.items()}
      outputs = model(batch, last_idxes)

      outputs = custom_activation(outputs).squeeze()
      predictions = []
      if len(outputs.size()) == 0:
          pred_idx = torch.argmin(abs(scores - outputs))
          predictions.append(pred_idx)
      else:
          for i in range(outputs.size(0)):
            pred_idx = torch.argmin(abs(scores - outputs[i]))
            predictions.append(pred_idx)

      predictions = torch.tensor(predictions)
      metric.add_batch(predictions=predictions, references=labels)
      for j in range(len(predictions)):
        distance += abs(scores[predictions[j]] - scores[labels[j]])
      total += len(predictions)
  distance = distance / total
  val_acc = metric.compute()['accuracy']
  print(distance, val_acc)
  val_distance.append(distance)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

tensor(0.9176, device='cuda:0') 0.18851435705368288
tensor(0.8383, device='cuda:0') 0.1972534332084894
tensor(0.7709, device='cuda:0') 0.2571785268414482
tensor(0.7122, device='cuda:0') 0.29213483146067415
tensor(0.7097, device='cuda:0') 0.28963795255930086
tensor(0.6841, device='cuda:0') 0.3083645443196005
tensor(0.6885, device='cuda:0') 0.3096129837702871
tensor(0.6610, device='cuda:0') 0.3146067415730337
tensor(0.6523, device='cuda:0') 0.32209737827715357
tensor(0.6798, device='cuda:0') 0.299625468164794
tensor(0.6461, device='cuda:0') 0.3196004993757803
tensor(0.6511, device='cuda:0') 0.31585518102372034
tensor(0.6504, device='cuda:0') 0.3121098626716604
