<a href="https://colab.research.google.com/github/gfx73/PML-DL/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install allennlp
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 torchdata==0.4.1 torchtext==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/, https://download.pytorch.org/whl/cu113


In [None]:
from torchtext.datasets import IMDB

# Request both splits explicitly; the result unpacks to (train, test) iterables
# that yield (label, text) pairs when consumed below.
IMDB_train_iter, IMDB_test_iter = IMDB(split=("train", "test"))

In [None]:
from torchtext.data.utils import get_tokenizer


# Callable that turns a raw review string into a list of tokens.
tokenizer = get_tokenizer('basic_english', language='en')

In [None]:
def _tokenize_split(samples):
    """Consume one split of (label, text) pairs.

    Returns a pair ``(tokens, labels)`` where ``tokens[i]`` is the tokenized
    text of the i-th sample and ``labels[i]`` is ``True`` exactly when its
    label equals ``'pos'``.
    """
    tokens, labels = [], []
    for sentiment, review in samples:
        labels.append(sentiment == 'pos')
        tokens.append(tokenizer(review))
    return tokens, labels


train_tokens, train_labels = _tokenize_split(IMDB_train_iter)
test_tokens, test_labels = _tokenize_split(IMDB_test_iter)

In [None]:
def _longest_sequence(token_lists):
  """Return ``(length, index)`` of the first longest entry.

  Yields ``(0, None)`` when there are no entries or every entry is empty.
  """
  best_len, best_idx = 0, None
  for position, size in enumerate(map(len, token_lists)):
    if size > best_len:
      best_len, best_idx = size, position
  return best_len, best_idx


# Longest tokenized review in each split, with its position in that split.
train_max_len, max_len_id = _longest_sequence(train_tokens)
print(train_max_len, max_len_id)

test_max_len, max_len_id = _longest_sequence(test_tokens)
print(test_max_len, max_len_id)

# Overall maximum; used later as the fixed sequence length for every sample.
max_len = max(train_max_len, test_max_len)

2752 1256
2623 8632


In [None]:
# from tqdm import tqdm


# train_embs = []
# for tokens in tqdm(train_tokens):
#   train_embs.append(batch_to_ids([tokens])[0])

# del train_tokens
# gc.collect()


# test_embs = []
# for tokens in tqdm(test_tokens):
#   test_embs.append(batch_to_ids([tokens])[0])

# del test_tokens
# gc.collect()

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")


class dataset(Dataset):
  """Map-style dataset pairing tokenized texts with float labels.

  Items are built lazily: the token list of a sample is converted to an id
  matrix with ``batch_to_ids`` and then truncated or zero-padded along its
  first dimension so that every sample has exactly ``max_len`` rows.

  Args:
    tokens: Indexable collection of token lists, one per sample.
    labels: Sequence of labels convertible to a float32 tensor, aligned
      with ``tokens``.
    batch_to_ids: Callable that takes a list of token lists and returns a
      batch whose first element is the 2-D id matrix of the first list.
    max_len: Number of rows every returned id matrix is forced to have.
  """

  def __init__(self, tokens, labels, batch_to_ids, max_len):
    self.tokens = tokens
    self.labels = torch.tensor(labels, dtype=torch.float32)
    self.length = self.labels.shape[0]
    self.batch_to_ids = batch_to_ids
    self.max_len = max_len

  def __getitem__(self, idx):
    """Return ``(ids, label)`` for sample ``idx``, both moved to ``device``.

    ``ids`` always has ``max_len`` rows: longer inputs are cut, shorter ones
    are padded at the end with all-zero rows.
    """
    emb = self.batch_to_ids([self.tokens[idx]])[0]

    n_rows = emb.shape[0]
    if n_rows > self.max_len:
      emb = emb[:self.max_len, :]
    else:
      # Take the padding width, dtype and device from the id matrix itself
      # instead of assuming a fixed width of 50 columns.
      padding = torch.zeros(
          (self.max_len - n_rows, emb.shape[1]),
          dtype=emb.dtype,
          device=emb.device,
      )
      emb = torch.concat((emb, padding))

    # `device` is the module-level torch.device chosen when the notebook starts.
    return emb.to(device), self.labels[idx].to(device)

  def __len__(self):
    """Number of samples, i.e. the number of labels supplied."""
    return self.length


# Wrap both splits so that ids are produced on the fly, padded/truncated to `max_len`.
trainset = dataset(train_tokens, train_labels, batch_to_ids, max_len)
testset = dataset(test_tokens, test_labels, batch_to_ids, max_len)

BATCH_SIZE = 8
# Training batches are reshuffled every epoch so optimisation does not depend
# on the order in which the reviews were read from the dataset (an unshuffled
# loader replays the exact same sequence of batches each epoch).
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation keeps a fixed order so results are comparable between runs.
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Build the ELMo embedder from the options/weights URLs defined at the top of
# the notebook: a single output representation, no dropout, and
# `requires_grad=False` for the loaded weights.
elmo = Elmo(
    options_file,
    weight_file,
    num_output_representations=1,
    requires_grad=False,
    dropout=0,
)

In [None]:
from torch import nn
from torch.nn import functional as F

class Classifier(nn.Module):
  """Binary classifier: encoder output flattened into one linear unit.

  Args:
    input_shape: Number of features per sample after flattening the
      encoder output; this is the input width of the linear layer.
    elmo: Module called on the input ids. Its result must be a mapping whose
      ``'elmo_representations'`` entry is a list; the first element is used.
  """

  def __init__(self, input_shape, elmo):
    super().__init__()
    self.input_shape = input_shape
    self.elmo = elmo
    self.fc1 = nn.Linear(input_shape, 1)

  def forward(self, input):
    """Return a ``(rows, 1)`` tensor of sigmoid probabilities for ``input``."""
    x = self.elmo(input)['elmo_representations'][0]
    # `reshape` instead of `view`: the result is identical for contiguous
    # tensors, but it also accepts non-contiguous encoder output, on which
    # `view` raises a RuntimeError.
    x = x.reshape(-1, self.input_shape)
    x = torch.sigmoid(self.fc1(x))
    return x

In [None]:
# Input width of the linear layer: 1024 values per token position times
# `max_len` positions (1024 is presumably the ELMo representation width --
# TODO confirm against the loaded model).
flat_input_size = max_len * 1024
classifier = Classifier(flat_input_size, elmo=elmo)
classifier = classifier.to(device)

In [None]:
# Optimisation hyper-parameters.
epochs = 10
learning_rate = 0.01

# Binary cross-entropy on the classifier's sigmoid outputs, optimised with
# Adam over everything `classifier.parameters()` yields.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(params=classifier.parameters(), lr=learning_rate)

In [None]:
from tqdm import tqdm


# Per-epoch history: mean batch loss and training accuracy.
losses = []
accur = []

for epoch in range(epochs):
  running_loss, correct, total = 0, 0, 0
  for x_train, y_train in tqdm(trainloader):
    # `preds` holds one sigmoid probability per sample, shape (batch, 1).
    preds = classifier(x_train)
    batch_loss = loss_fn(preds, y_train.reshape(-1, 1))

    running_loss += batch_loss.item()
    total += y_train.shape[0]
    # Accuracy: threshold the probabilities at 0.5 and flatten to (batch,)
    # before comparing with the labels. Comparing the raw probabilities with
    # `eq` would almost never match a 0/1 label, and (batch, 1) against
    # (batch,) would broadcast to a (batch, batch) matrix.
    predicted_positive = preds.detach().reshape(-1) >= 0.5
    actual_positive = y_train.reshape(-1) >= 0.5
    correct += (predicted_positive == actual_positive).sum().item()

    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

  # Epoch summary: average of the batch losses and fraction of correct samples.
  loss = running_loss / len(trainloader)
  acc = correct / total
  losses.append(loss)
  accur.append(acc)
  print("epoch {}\tloss : {}\t accuracy : {}".format(epoch, loss, acc))

  0%|          | 4/3125 [04:29<58:31:02, 67.50s/it]


KeyboardInterrupt: ignored