<a href="https://colab.research.google.com/github/gfx73/PML-DL/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install allennlp



In [2]:
!pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 torchdata==0.4.1 torchtext==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113


In [3]:
import sys
CLASSIFIER_PRETRAINED = False
MAX_SEQ_LEN = 1024

# PRECOMPUTE_TOK_IDS = False
# TOK_IDS_PRECOMPUTED = True
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  PATH_TO_SAVE_ELMO_CLASSIFIER = '/content/drive/MyDrive/PML&DL/Assignment2/elmo_classifier.pt'
else:
  PATH_TO_SAVE_ELMO_CLASSIFIER = 'elmo_classifier.pt'

In [4]:
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

In [5]:
from torchtext.datasets import IMDB

IMDB_train_iter, IMDB_test_iter = IMDB()

In [6]:
from tqdm import tqdm
from torchtext.data.utils import get_tokenizer
import gc
import random


random.seed(11)
tokenizer = get_tokenizer('basic_english')

def get_labels_and_text(datasplit):
  tokens, labels = [], []
  for label, text in tqdm(datasplit):
    tokens.append(tokenizer(text))
    labels.append(label=='pos')
  return tokens, labels

train_tokens, train_labels = get_labels_and_text(IMDB_train_iter)
test_tokens, test_labels = get_labels_and_text(IMDB_test_iter)

sample_tokens_and_labels = lambda tokens, labels: zip(*random.sample(list(zip(tokens, labels)), len(labels)))

train_tokens, train_labels = sample_tokens_and_labels(train_tokens, train_labels)
test_tokens, test_labels = sample_tokens_and_labels(test_tokens, test_labels)

del IMDB_train_iter
del IMDB_test_iter
gc.collect()

25000it [00:04, 6043.91it/s]
25000it [00:04, 6173.31it/s]


0

In [7]:
# def get_max_len(all_tokens):
#   max_len = 0
#   for tokens_split in all_tokens:
#     for tokens in tokens_split:
#       max_len = max(max_len, len(tokens))
#   return max_len
#
#
# MAX_SEQ_LEN = get_max_len((train_tokens, test_tokens))
# print(MAX_SEQ_LEN)

In [8]:
from torch.utils.data import Dataset, DataLoader, Subset
from allennlp.modules.elmo import Elmo, batch_to_ids
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

# if PRECOMPUTE_TOK_IDS:
#   def save_tok_ids(all_tokens, filename_prefix, shard_size=5000):
#     all_tok_ids = []
#     for idx, tokens in tqdm(enumerate(all_tokens), total=len(all_tokens)):
#       tok_ids = (batch_to_ids([tokens])[0])
#       all_tok_ids.append(tok_ids)
#
#       if (idx + 1) % shard_size == 0 or (idx + 1) == len(all_tokens):
#         torch.save(all_tok_ids, f"{filename_prefix}{idx // shard_size}.pt")
#         del all_tok_ids
#         gc.collect()
#         all_tok_ids = []
#     return all_tok_ids
#
#   train_filename_prefix = 'train_tok_ids'
#   test_filename_prefix = 'test_tok_ids'
#
#   if not TOK_IDS_PRECOMPUTED:
#     train_tok_ids = save_tok_ids(train_tokens, train_filename_prefix)
#     del train_tokens
#     gc.collect()
#
#     test_tok_ids = save_tok_ids(test_tokens, test_filename_prefix)
#     del test_tokens
#     gc.collect()
#
#
#   class dataset(Dataset):
#     def __init__(self, labels, filename_prefix, max_len, shard_size=5000):
#       self.labels = torch.tensor(labels, dtype=torch.float32)
#       self.length = self.labels.shape[0]
#       self.filename_prefix = filename_prefix
#       self.max_len = max_len
#       self.shard_size=shard_size
#       self.cur_shard = None
#       self.cur_shard_idx = None
#
#     def __getitem__(self, idx):
#       tok_ids = self.__get_tok_ids__(idx).to(device)
#       if tok_ids.shape[0] > self.max_len:
#         tok_ids = tok_ids[:self.max_len,:]
#       else:
#         zeros = torch.zeros((self.max_len - tok_ids.shape[0], 50), dtype=tok_ids.dtype, device=device)
#         tok_ids = torch.concat((tok_ids, zeros))
#
#       return tok_ids.to(device), self.labels[idx].to(device)
#
#     def __get_tok_ids__(self, idx):
#       self.__reload_shard__(idx)
#       return self.cur_shard[idx % self.shard_size]
#
#     def __reload_shard__(self, idx):
#       shard_idx = idx // self.shard_size
#       if self.cur_shard_idx == shard_idx:
#         return
#
#       del self.cur_shard
#       gc.collect()
#       self.cur_shard = torch.load(f"{self.filename_prefix}{shard_idx}.pt")
#       self.cur_shard_idx = shard_idx
#
#     def __len__(self):
#       return self.length
#
#
#   trainset = dataset(train_labels, train_filename_prefix, MAX_SEQ_LEN)
#   testset = dataset(test_labels, test_filename_prefix, MAX_SEQ_LEN)
# else:
class dataset(Dataset):
  def __init__(self, tokens, labels, batch_to_ids, max_len):
    self.tokens = tokens
    self.labels = torch.tensor(labels, dtype=torch.float32)
    self.length = self.labels.shape[0]
    self.batch_to_ids = batch_to_ids
    self.max_len = max_len

  def __getitem__(self, idx):
    tok_ids = self.batch_to_ids([self.tokens[idx]])[0]

    if tok_ids.shape[0] > self.max_len:
      tok_ids = tok_ids[:self.max_len,:]
    else:
      zeros = torch.zeros((self.max_len - tok_ids.shape[0], 50), dtype=tok_ids.dtype)
      tok_ids = torch.concat((tok_ids, zeros))

    return tok_ids.to(device), self.labels[idx].to(device)

  def __len__(self):
    return self.length


trainset = dataset(train_tokens, train_labels, batch_to_ids, MAX_SEQ_LEN)
testset = dataset(test_tokens, test_labels, batch_to_ids, MAX_SEQ_LEN)


valset_size = int(len(testset) * 0.02)
testset_size = len(testset) - valset_size
valset = Subset(testset, range(valset_size))
testset = Subset(testset, range(valset_size, valset_size + testset_size))

BATCH_SIZE = 20
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=True)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
from torch import nn
from torch.nn import functional as F

class Classifier(nn.Module):
  def __init__(self, input_shape, elmo):
    super(Classifier, self).__init__()
    self.input_shape = input_shape
    self.elmo = elmo
    self.fc1 = nn.Linear(input_shape, 1)
    
  def forward(self, input):
    embs = self.elmo(input)['elmo_representations'][0]
    reshaped = embs.view((-1, self.input_shape))
    x = torch.sigmoid(self.fc1(reshaped))
    return x


if CLASSIFIER_PRETRAINED:
  classifier = torch.load(PATH_TO_SAVE_ELMO_CLASSIFIER)
else:
  if IN_COLAB:
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
  else:
    options_file = "options.json"
    weight_file = "weights.hdf5"
  elmo = Elmo(options_file, weight_file, dropout=0, requires_grad=False, num_output_representations=1).to(device)
  classifier = Classifier(1024 * MAX_SEQ_LEN, elmo=elmo).to(device)

In [10]:
sum(p.numel() for p in classifier.elmo.parameters() if p.requires_grad)

4

In [11]:
learning_rate = 0.001
epochs = 10
l2_penalty = 0.001

optimizer = torch.optim.RMSprop(classifier.parameters(), lr=learning_rate, weight_decay=l2_penalty)
loss_fn = F.binary_cross_entropy_with_logits

In [12]:
!pip install torchmetrics



In [13]:
import torchmetrics

def eval_model(model, data, loss_fn):
  acc_metric = torchmetrics.Accuracy().to(device)
  prec_metric = torchmetrics.Precision().to(device)
  rec_metric = torchmetrics.Recall().to(device)
  f1_metric = torchmetrics.F1Score().to(device)
  running_loss = 0
  for x, y in tqdm(data):
    with torch.no_grad():
      y = y.reshape(-1, 1)
      with torch.autocast(device_type=device, dtype=torch.float16):
        preds = model(x)
        loss = loss_fn(preds, y)


      running_loss += loss.item()
      
      y = y.type(torch.int8)
      acc_metric(preds.round(), y)
      prec_metric(preds.round(), y)
      rec_metric(preds.round(), y)
      f1_metric(preds.round(), y)

      # print(y)
      # print(preds.round())
      # print(acc_metric.compute())

  loss = running_loss / len(data)
  acc = acc_metric.compute().item()
  prec = prec_metric.compute().item()
  rec = rec_metric.compute().item()
  f1 = f1_metric.compute().item()
  return loss, acc, prec, rec, f1

# loss, acc, prec, rec, f1 = eval_model(classifier, valloader, loss_fn)
# print("Initial metrics\tval loss: {}\tval acc: {}\tval prec: {}\tval rec: {}\tval f1: {}".format(loss, acc, prec, rec, f1))

In [None]:
torch.cuda.empty_cache()
# torch.autograd.set_detect_anomaly(True)
train_losses = []
train_accs = []
val_metrics = []

best_val_loss = 1e+8
for epoch in range(epochs):
  running_loss, correct, total = 0, 0, 0
  for iteration, (x_train ,y_train) in tqdm(enumerate(trainloader), total=len(trainloader)):
    optimizer.zero_grad()
    y_train = y_train.reshape(-1,1)
    with torch.autocast(device_type=device, dtype=torch.float16):
      preds = classifier(x_train)
      loss = loss_fn(preds, y_train)

    running_loss += loss.item()
    total += y_train.shape[0]
    correct += preds.round().eq(y_train).sum().item()

    loss.backward()
    optimizer.step()

    if iteration % 50 == 0:
      _loss = running_loss / (iteration + 1)
      acc = correct / total
      print("epoch: {}\titeration: {}\tloss: {}\tthis iteration loss: {}\taccuracy: {}".format(epoch, iteration, _loss, loss, acc))


  loss = running_loss / len(trainloader)
  acc = correct / total
  train_losses.append(loss)
  train_accs.append(acc)
  print("epoch {}\ttrain loss : {}\ttrain accuracy : {}".format(epoch, loss, acc))

  loss, acc, prec, rec, f1 = eval_model(classifier, valloader, loss_fn)
  val_metrics.append([loss, acc, prec, rec, f1])
  print("epoch: {}\tval loss: {}\tval acc: {}\tval prec: {}\tval rec: {}\tval f1: {}".format(epoch, loss, acc, prec, rec, f1))
  if best_val_loss > loss:
    torch.save(classifier, PATH_TO_SAVE_ELMO_CLASSIFIER)
    best_val_loss = loss
  if not IN_COLAB:
    torch.save(classifier, f'classifier{epoch}.pt')

  0%|          | 1/1250 [00:11<4:01:55, 11.62s/it]

epoch: 0	iteration: 0	loss: 0.8206911087036133	this iteration loss: 0.8206911087036133	accuracy: 0.45


  3%|▎         | 37/1250 [06:22<3:07:22,  9.27s/it]

In [None]:
if IN_COLAB:
  !kill $(ps aux | awk '{print $2}')