In [1]:
import re
import tqdm
import nltk
import torch
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
data_preprocessed = pd.read_csv('../data/processed/chunk.csv')

In [3]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(data_preprocessed, test_size=0.2, random_state=42)

In [4]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(df):
    for _, sample in df.iterrows():
        yield sample.to_list()[0]


UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

vocab = build_vocab_from_iterator(yield_tokens(train), specials=special_symbols)
vocab.set_default_index(UNK_IDX)

In [5]:
train

Unnamed: 0,text,target
28965,"['sikh', 'modi', 'save', 'student', 'outsid', ...",1.0
4252,"['forward', 'import', 'question', 'job', 'farm...",2.0
27843,"['day', 'dream', 'indian', 'voter', 'fool', 'h...",2.0
24023,"['know', 'meant', 'fiscal', 'deficit', 'read',...",1.0
25303,"['trio', 'seem', 'one', 'lobbi', 'bjp', 'also'...",2.0
...,...,...
29802,"['nationalist', 'vote', 'modi', 'els', 'prove'...",1.0
5390,"['firstli', 'that', 'person', 'account', 'he',...",0.0
860,"['modi', 'first']",2.0
15795,"['honour', 'prime', 'minist', 'must', 'visit',...",0.0


In [13]:
from torch.utils.data import DataLoader

# torch.manual_seed(420)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for _label, _text in batch:
        label_list.append(_label)
        processed_text = torch.tensor(vocab(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_dataloader = DataLoader(
    train.to_numpy(), batch_size=128, shuffle=True, collate_fn=collate_batch
)

val_dataloader = DataLoader(
    val.to_numpy(), batch_size=128, shuffle=False, collate_fn=collate_batch
)

In [14]:
import torch.nn as nn

class TextClassificationModel(nn.Module):
    def __init__(self, num_classes, vocab_len):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_len, 256)
        self.classifier = nn.Sequential(
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, text, offsets):
        embedded = self.embedding(input=text, offsets=offsets)
        logits = self.classifier(embedded)
        return logits

In [15]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    scheduler,
    loss_fn,
    epoch_num=-1
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    for i, batch in loop:
        labels, texts, offsets = batch
        # zero the parameter gradients
        model.zero_grad()

        # forward pass
        outputs = model(texts, offsets)

        # loss calculation
        loss = loss_fn(outputs, labels)

        # backward pass
        loss.backward()

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss/(i * len(labels))})
    scheduler.step()

def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):

    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            labels, texts, offsets = batch

            # forward pass
            outputs = model(texts, offsets)
            # loss calculation
            loss = loss_fn(outputs, labels)

            _, predicted = torch.max(outputs, dim=1)
            total += len(labels)
            correct += (predicted == labels).sum().item()

            val_loss += loss.item()
            loop.set_postfix({"loss": val_loss / total, "acc": correct / total})

        accuracy = correct / total
        if accuracy > best_so_far:
            best_so_far = accuracy
            torch.save(model.state_dict(), ckpt_path)

    return best_so_far

In [16]:
epochs = 10
model = TextClassificationModel(2, len(vocab)).to(device)
optimizer = torch.optim.RMSprop(model.parameters())
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
loss_fn = torch.nn.CrossEntropyLoss()

In [17]:
best = -float('inf')
for epoch in range(epochs):
    train_one_epoch(model, train_dataloader, optimizer, scheduler, loss_fn, epoch_num=epoch)
    best = val_one_epoch(model, val_dataloader, loss_fn, epoch, best_so_far=best)

Epoch 0: train:   0%|          | 0/204 [00:00<?, ?it/s]

TypeError: lookup_indices(): incompatible function arguments. The following argument types are supported:
    1. (self: torchtext._torchtext.Vocab, arg0: list) -> List[int]

Invoked with: <torchtext._torchtext.Vocab object at 0x7f03e988ccb0>, 1.0