In [1]:
import torch
from torchtext import data, datasets

SEED = 1234

torch.manual_seed(SEED)

<torch._C.Generator at 0x7f1fc8f0db70>

In [2]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

In [3]:
TEXT = data.Field(tokenize = 'spacy', preprocessing = generate_bigrams)
LABEL = data.LabelField(dtype = torch.float)

In [4]:
import random

In [5]:
train, test = datasets.IMDB.splits(TEXT, LABEL)

train, valid = train.split(random_state = random.seed(SEED))

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 22.3MB/s]


In [6]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = 'glove.6B.100d',
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train)

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                          
100%|█████████▉| 398975/400000 [00:24<00:00, 16854.56it/s]

In [7]:
BATCH_SIZE = 64

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test),
    batch_size = BATCH_SIZE,
    device = device
)

In [9]:
import torch.nn as nn
import torch.nn.functional as F

class FastText(nn.Module):
  def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
    self.linear = nn.Linear(embedding_dim, output_dim)
  
  def forward(self, text):
    embed = self.embedding(text)
    embed = embed.permute(1, 0, 2)
    pooled = F.avg_pool2d(embed, (embed.shape[1], 1)).squeeze(1)
    return self.linear(pooled)

In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [11]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2260, -0.2145,  1.1436,  ...,  0.7409,  0.9233, -1.4505],
        [ 0.5043, -0.4016,  0.3739,  ..., -0.6785, -0.6437,  0.5768],
        [ 0.2256, -1.5004, -0.4065,  ..., -0.4732,  1.0807,  1.9720]])

In [12]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [13]:
import torch.optim as optim

In [14]:
optimizer = optim.Adam(model.parameters())

In [15]:
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

100%|█████████▉| 398975/400000 [00:40<00:00, 16854.56it/s]

In [16]:
def accuracy(predictions, y):
  preds = torch.round(torch.sigmoid(predictions))
  actual = (preds == y).float()
  acc = actual.sum() / len(actual)
  return acc

In [17]:
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()
  
  for batch in iterator:
    optimizer.zero_grad()
    predictions = model(batch.text).squeeze(1)
    loss = criterion(predictions, batch.label)
    acc = accuracy(predictions, batch.label)
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [18]:
def eval(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.eval()
  with torch.no_grad():
    for batch in iterator:
      predictions = model(batch.text).squeeze(1)
      loss = criterion(predictions, batch.label)
      acc = accuracy(predictions, batch.label)

      epoch_loss += loss.item()
      epoch_acc += acc.item()
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
import time

EPOCHS = 7

opt_valid_loss = float('inf')

for epoch in range(EPOCHS):

  start_time = time.time()
  train_loss, train_acc = train(model, train_iter, optimizer, criterion)
  valid_loss, valid_acc = eval(model, valid_iter, criterion)
  end_time = time.time()

  if valid_loss < opt_valid_loss:
    opt_valid_loss = valid_loss
    torch.save(model.state_dict(), 'FastText-model.pt')

  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc:.2f}%')

Epoch: 01
	Train Loss: 0.385 | Train Acc: 0.89%
	 Val. Loss: 0.377 |  Val. Acc: 0.87%
Epoch: 02
	Train Loss: 0.343 | Train Acc: 0.90%
	 Val. Loss: 0.390 |  Val. Acc: 0.87%
Epoch: 03
	Train Loss: 0.312 | Train Acc: 0.91%
	 Val. Loss: 0.407 |  Val. Acc: 0.88%
Epoch: 04
	Train Loss: 0.288 | Train Acc: 0.91%
	 Val. Loss: 0.424 |  Val. Acc: 0.88%
Epoch: 05
	Train Loss: 0.266 | Train Acc: 0.92%
	 Val. Loss: 0.436 |  Val. Acc: 0.88%
Epoch: 06
	Train Loss: 0.247 | Train Acc: 0.92%
	 Val. Loss: 0.452 |  Val. Acc: 0.89%
Epoch: 07
	Train Loss: 0.229 | Train Acc: 0.93%
	 Val. Loss: 0.466 |  Val. Acc: 0.89%
