<a href="https://colab.research.google.com/github/gfx73/PML-DL/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install allennlp



In [2]:
!pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 torchdata==0.4.1 torchtext==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113


In [3]:
import sys
IN_COLAB = 'google.colab' in sys.modules
TOK_IDS_PRECOMPUTED = False

In [None]:
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

In [4]:
from torchtext.datasets import IMDB

IMDB_train_iter, IMDB_test_iter = IMDB()

In [5]:
from tqdm import tqdm
from torchtext.data.utils import get_tokenizer
import gc


tokenizer = get_tokenizer('basic_english')

train_tokens = []
train_labels = []
for label, text in tqdm(IMDB_train_iter):
    train_labels.append(label=='pos')
    train_tokens.append(tokenizer(text))

test_tokens = []
test_labels = []
for label, text in tqdm(IMDB_test_iter):
    test_labels.append(label=='pos')
    test_tokens.append(tokenizer(text))

VAL_SIZE = 500
val_tokens = test_tokens[:VAL_SIZE]
val_labels = test_labels[:VAL_SIZE]
test_tokens = test_tokens[VAL_SIZE:]
test_labels = test_labels[VAL_SIZE:]
del IMDB_train_iter
del IMDB_test_iter
gc.collect()

25000it [00:03, 6284.66it/s]
25000it [00:03, 6447.88it/s]


0

In [6]:
def get_max_len(all_tokens):
  max_len = 0
  for tokens_split in all_tokens:
    for tokens in tokens_split:
      max_len = max(max_len, len(tokens))
  return max_len


max_len = get_max_len((train_tokens, val_tokens, test_tokens))
print(max_len)

2752


In [7]:
from allennlp.modules.elmo import Elmo, batch_to_ids
import torch


def save_tok_ids(all_tokens, max_len, filename_prefix, shard_size=5000):
  all_tok_ids = []
  for idx, tokens in tqdm(enumerate(all_tokens), total=len(all_tokens)):
    tok_ids = (batch_to_ids([tokens])[0])
    if tok_ids.shape[0] > max_len:
      tok_ids = tok_ids[:max_len,:]
    else:
      tok_ids = torch.concat((tok_ids, torch.zeros((max_len - tok_ids.shape[0], 50)).type_as(tok_ids)))
    all_tok_ids.append(tok_ids)

    if (idx + 1) % shard_size == 0 or (idx + 1) == len(all_tokens):
      torch.save(all_tok_ids, f"{filename_prefix}{idx // shard_size}.pt")
      del all_tok_ids
      gc.collect()
      all_tok_ids = []
  return all_tok_ids

train_filename_prefix = 'train_tok_ids'
test_filename_prefix = 'test_tok_ids'
val_filename_prefix = 'val_tok_ids'

if not TOK_IDS_PRECOMPUTED:
  train_tok_ids = save_tok_ids(train_tokens, max_len, train_filename_prefix)
  del train_tokens
  gc.collect()

  test_tok_ids = save_tok_ids(test_tokens, max_len, test_filename_prefix)
  del test_tokens
  gc.collect()

  val_tok_ids = save_tok_ids(val_tokens, max_len, val_filename_prefix)
  del val_tokens
  gc.collect()


100%|██████████| 25000/25000 [07:35<00:00, 54.93it/s]  
100%|██████████| 24500/24500 [07:22<00:00, 55.34it/s]  
100%|██████████| 500/500 [00:02<00:00, 213.46it/s]


In [8]:
from torch.utils.data import Dataset, DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"

class dataset(Dataset):
  def __init__(self, labels, filename_prefix, shard_size=5000):
    self.labels = torch.tensor(labels, dtype=torch.float32)
    self.length = self.labels.shape[0]
    self.filename_prefix = filename_prefix
    self.shard_size=shard_size
    self.cur_shard = None
    self.cur_shard_idx = None

  def __getitem__(self, idx):
    tok_ids = self.__get_tok_ids__(idx)
    return tok_ids.to(device), self.labels[idx].to(device)

  def __get_tok_ids__(self, idx):
    self.__reload_shard__(idx)
    return self.cur_shard[idx % self.shard_size]

  def __reload_shard__(self, idx):
    shard_idx = idx // self.shard_size
    if self.cur_shard_idx == shard_idx:
      return
    self.cur_shard = torch.load(f"{self.filename_prefix}{shard_idx}.pt")
    self.cur_shard_idx = shard_idx

  def __len__(self):
    return self.length

train_filename_prefix = 'train_tok_ids'
test_filename_prefix = 'test_tok_ids'
val_filename_prefix = 'val_tok_ids'
trainset = dataset(train_labels, train_filename_prefix)
valset = dataset(val_labels, val_filename_prefix)
testset = dataset(test_labels, test_filename_prefix)

BATCH_SIZE = 7
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
# from allennlp.modules.elmo import Elmo, batch_to_ids
# from torch.utils.data import Dataset, DataLoader
# device = "cuda" if torch.cuda.is_available() else "cpu"
#
# class dataset(Dataset):
#   def __init__(self, tokens, labels, batch_to_ids, max_len):
#     self.tokens = tokens
#     self.labels = torch.tensor(labels, dtype=torch.float32)
#     self.length = self.labels.shape[0]
#     self.batch_to_ids = batch_to_ids
#     self.max_len = max_len
#
#   def __getitem__(self, idx):
#     tok_ids = self.batch_to_ids([self.tokens[idx]])[0]
#
#     if tok_ids.shape[0] > self.max_len:
#       tok_ids = tok_ids[:self.max_len,:]
#     else:
#       tok_ids = torch.concat((tok_ids, torch.zeros((self.max_len - tok_ids.shape[0], 50)).type_as(tok_ids)))
#
#     return tok_ids.to(device), self.labels[idx].to(device)
#
#   def __len__(self):
#     return self.length
#
#
# trainset = dataset(train_tokens, train_labels, batch_to_ids, max_len)
# testset = dataset(test_tokens, test_labels, batch_to_ids, max_len)
#
# BATCH_SIZE = 7
# trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)
# testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
from torch import nn
from torch.nn import functional as F

class Classifier(nn.Module):
  def __init__(self, input_shape, elmo):
    super(Classifier, self).__init__()
    self.input_shape = input_shape
    self.elmo = elmo
    self.fc1 = nn.Linear(input_shape, 1)
    
  def forward(self, input):
    embs = self.elmo(input)['elmo_representations'][0]
    reshaped = embs.view((-1, self.input_shape))
    x = torch.sigmoid(self.fc1(reshaped))
    return x


if IN_COLAB:
  options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
  weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
  elmo = Elmo(options_file, weight_file, dropout=0, requires_grad=False, num_output_representations=1)
else:
  options_file = "options.json"
  weight_file = "weights.hdf5"
  elmo = Elmo(options_file, weight_file, dropout=0, requires_grad=False, num_output_representations=1).to(device)
  classifier = Classifier(1024 * max_len, elmo=elmo).to(device)

In [11]:
sum(p.numel() for p in classifier.elmo.parameters() if p.requires_grad)

4

In [12]:
learning_rate = 0.001
epochs = 10
l2_penalty = 0.001

optimizer = torch.optim.RMSprop(classifier.parameters(), lr=learning_rate, weight_decay=l2_penalty)
loss_fn = F.binary_cross_entropy_with_logits

In [13]:
!pip install torchmetrics



In [14]:
import torchmetrics

def eval_model(model, data, loss_fn):
  acc_metric = torchmetrics.Accuracy().to(device)
  prec_metric = torchmetrics.Precision().to(device)
  rec_metric = torchmetrics.Recall().to(device)
  f1_metric = torchmetrics.F1Score().to(device)
  running_loss = 0
  for x, y in tqdm(data):
    with torch.no_grad():
      y = y.reshape(-1, 1)
      with torch.autocast(device_type=device, dtype=torch.float16):
        preds = model(x)
        loss = loss_fn(preds, y)


      running_loss += loss.item()
      y = y.type(torch.int8)
      acc_metric(preds.round(), y)
      prec_metric(preds.round(), y)
      rec_metric(preds.round(), y)
      f1_metric(preds.round(), y)

  loss = running_loss / len(data)
  acc = acc_metric.compute().item()
  prec = prec_metric.compute().item()
  rec = rec_metric.compute().item()
  f1 = f1_metric.compute().item()
  return loss, acc, prec, rec, f1

loss, acc, prec, rec, f1 = eval_model(classifier, valloader, loss_fn)

100%|██████████| 72/72 [08:57<00:00,  7.46s/it]


In [15]:
print(loss, acc, prec, rec, f1)

0.9674400877621439 tensor(0.7240, device='cuda:0') tensor(0., device='cuda:0') tensor(0., device='cuda:0') tensor(0., device='cuda:0')


In [None]:
torch.cuda.empty_cache()
torch.autograd.set_detect_anomaly(True)
train_losses = []
train_accs = []

best_val_loss = 1e+8
for epoch in range(epochs):
  running_loss, correct, total = 0, 0, 0
  for iteration, (x_train ,y_train) in tqdm(enumerate(trainloader), total=len(trainloader)):
    optimizer.zero_grad()
    y_train = y_train.reshape(-1,1)
    with torch.autocast(device_type=device, dtype=torch.float16):
      preds = classifier(x_train)
      loss = loss_fn(preds, y_train)

    running_loss += loss.item()
    total += y_train.shape[0]
    correct += preds.round().eq(y_train).sum().item()

    loss.backward()
    optimizer.step()

    if iteration % 50 == 0:
      _loss = running_loss / (iteration + 1)
      acc = correct / total
      print("epoch: {}\titeration: {}\tloss: {}\tthis iteration loss: {}\taccuracy: {}".format(epoch, iteration, _loss, loss, acc))


  loss = running_loss / len(trainloader)
  acc = correct / total
  train_losses.append(loss)
  train_accs.append(acc)
  print("epoch {}\ttrain loss : {}\ttrain accuracy : {}".format(epoch, loss, acc))

  loss, acc, prec, rec, f1 = eval_model(classifier, valloader, loss_fn)
  print("epoch: {}\tval loss: {}\tval acc: {}\tval prec: {}\tval rec: {}\tval f1: {}".format(epoch, loss, acc, prec, rec, f1))
  if IN_COLAB:
    if best_val_loss > loss:
      torch.save(classifier, '/content/drive/MyDrive/PML&DL/Assignment2/elmo_classifier.pt')
      best_val_loss = loss
  else:
    torch.save(classifier, f'classifier{epoch}.pt')

In [None]:
if IN_COLAB:
  !kill $(ps aux | awk '{print $2}')