In [1]:
import functools

from enum import Enum

import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import spacy

from sklearn import preprocessing

from transformers import BertTokenizer

import pandas as pd

from tqdm.notebook import tqdm

from torch.utils.tensorboard import SummaryWriter

In [2]:
class TokenizerType(Enum):
    """Selects which tokenizer AmazonReviewsDataset should use for review text."""
    # spaCy-based tokenization; the only option AmazonReviewsDataset implements.
    SPACY = 1
    # BERT tokenization; AmazonReviewsDataset has no tokenization path for it yet.
    BERT = 2

In [3]:
# Special token registered in the vocabulary (see AmazonReviewsDataset); its
# index is what shorter sequences are padded with when batches are collated.
PAD_TOKEN = "<pad>"
# Value passed as the `padding` argument of nn.Conv1d inside ConvBlock.
PADDING_MODE = "same"

In [4]:
def tokenize_and_filter(df: pd.DataFrame, nlp=None):
    """Lemmatize each review in ``df["reviews"]`` and drop uninformative tokens.

    A token's lemma is kept only if the token is not a stop word, the lemma is
    longer than one character, and the lemma is purely alphanumeric.

    Args:
        df: Object whose ``"reviews"`` column yields one review text per row.
        nlp: Optional callable that turns a review string into an iterable of
            tokens exposing ``lemma_`` and ``is_stop`` (e.g. a loaded spaCy
            pipeline). When omitted, spaCy's ``en_core_web_sm`` is loaded.

    Returns:
        A list with one entry per review, in input order; each entry is the
        list of kept lemmas (possibly empty) for that review.
    """
    # Loading a spaCy pipeline is expensive; only do it when the caller did
    # not hand one in, so repeated calls can share a single pipeline.
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    filtered_reviews = []
    for review in df["reviews"]:
        # Create the document object with linguistic annotations, then read
        # each lemma once and discard stop words up front.
        lemmas = (token.lemma_ for token in nlp(review) if not token.is_stop)
        filtered_reviews.append(
            [lemma for lemma in lemmas if len(lemma) > 1 and lemma.isalnum()]
        )

    return filtered_reviews

In [5]:
def numericalize_data(all_tokens, vocab):
    """Translate token sequences into sequences of vocabulary indices.

    Args:
        all_tokens: Iterable of token sequences (one sequence per review).
        vocab: Mapping-like object supporting ``vocab[token]`` lookups.

    Returns:
        A list of lists; the inner list at position ``i`` holds the index of
        every token of ``all_tokens[i]``, in order.
    """
    return [[vocab[tok] for tok in sequence] for sequence in all_tokens]

In [6]:
class AmazonReviewsDataset(Dataset):
    """Map-style dataset of tokenized review texts and encoded sentiment labels.

    The CSV files are concatenated into one frame. The ``"sentiment"`` column
    is encoded to integer labels with a ``LabelEncoder`` (kept as ``self.le``),
    and the ``"reviews"`` column is tokenized, used to build a vocabulary
    (``self.vocab``, with ``PAD_TOKEN`` as a special token) and converted to
    index sequences (``self.ids``).
    """

    def __init__(self, filepaths: list[str], tokenizer: TokenizerType = TokenizerType.SPACY):
        """Load, label-encode and numericalize the reviews.

        Args:
            filepaths: Paths of CSV files providing ``"reviews"`` and
                ``"sentiment"`` columns.
            tokenizer: Tokenization strategy; only ``TokenizerType.SPACY`` is
                supported.

        Raises:
            NotImplementedError: If ``tokenizer`` is anything other than
                ``TokenizerType.SPACY``.
        """
        # Fail fast: without a tokenization path the instance would have no
        # `ids`/`vocab` and would only break later inside __getitem__.
        if tokenizer != TokenizerType.SPACY:
            raise NotImplementedError(
                f"Tokenizer {tokenizer!r} is not supported; use TokenizerType.SPACY."
            )

        df = pd.concat(map(pd.read_csv, filepaths))

        self.le = preprocessing.LabelEncoder()
        self.labels = self.le.fit_transform(df["sentiment"])

        self.reviews = tokenize_and_filter(df)
        self.vocab = torchtext.vocab.build_vocab_from_iterator(self.reviews, specials=[PAD_TOKEN])
        self.ids = numericalize_data(self.reviews, self.vocab)

    def __len__(self):
        """Return the number of labelled reviews."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for review ``idx`` as long tensors."""
        return torch.LongTensor(self.ids[idx]), torch.tensor(self.labels[idx], dtype=torch.long)

In [7]:
# Build one dataset out of all scraped review files.
all_data = AmazonReviewsDataset([
    "../data-scraper/dishwashing tablet_reviews.csv",
    "../data-scraper/mug_reviews.csv",
    "../data-scraper/washing powder_reviews.csv",
])
# Look the padding index up through the same constant that was registered in
# the vocabulary, instead of repeating the literal token string.
pad_index = all_data.vocab[PAD_TOKEN]

In [8]:
def pad_tensors(batch, pad_index):
    """Collate ``(ids, label)`` samples into one padded batch.

    Args:
        batch: Sequence of ``(ids_tensor, label_tensor)`` pairs.
        pad_index: Value used to fill sequences shorter than the longest one.

    Returns:
        ``(ids, labels)`` where ``ids`` is batch-first and right-padded with
        ``pad_index``, and ``labels`` stacks the per-sample label tensors.
    """
    sequences = [sample_ids for sample_ids, _ in batch]
    targets = [sample_label for _, sample_label in batch]

    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_index)
    return padded, torch.stack(targets)

In [9]:
# Seed the split so train/validation/test membership is identical on every
# run; otherwise results are not comparable between kernel restarts.
SPLIT_SEED = 42
train_set, val_set, test_set = torch.utils.data.random_split(
    all_data,
    [0.7, 0.15, 0.15],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [10]:
class ConvBlock(nn.Module):
    """Bias-free 1-D convolution followed by ReLU and a max over the last axis."""

    def __init__(self, in_channels: int, out_channels: int, filter_size: int):
        """Create the convolution/activation stack.

        Args:
            in_channels: Number of channels of the incoming tensor.
            out_channels: Number of feature maps the convolution produces.
            filter_size: Kernel size of the convolution.
        """
        super().__init__()

        convolution = nn.Conv1d(in_channels, out_channels, filter_size,
                                padding=PADDING_MODE, bias=False)
        activation = nn.ReLU(inplace=True)
        self.linear_stack = nn.Sequential(convolution, activation)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply convolution + ReLU, then keep the maximum along the last axis.

        Assumes ``x`` is laid out as (batch, channels, length), as nn.Conv1d
        expects — TODO confirm against callers.
        """
        activations = self.linear_stack(x)
        pooled, _ = torch.max(activations, dim=-1)
        return pooled

In [11]:
class SentimentCNN(nn.Module):
    """Text classifier: embedding, three parallel ConvBlocks, dropout, linear head."""

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, dropout_rate: float, output_dim: int, feature_maps_num: int):
        """Build the layers.

        Args:
            num_embeddings: Size of the embedding table.
            embedding_dim: Width of each embedding vector.
            padding_idx: Index handed to nn.Embedding as its padding index.
            dropout_rate: Probability used by the dropout layer.
            output_dim: Number of values produced per sample.
            feature_maps_num: Output channels of each convolutional branch.
        """
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)

        # Three branches that differ only in kernel size (3, 4 and 5).
        self.conv1 = ConvBlock(embedding_dim, feature_maps_num, 3)
        self.conv2 = ConvBlock(embedding_dim, feature_maps_num, 4)
        self.conv3 = ConvBlock(embedding_dim, feature_maps_num, 5)

        # The head consumes the concatenation of the three branch outputs.
        self.fc = nn.Linear(3 * feature_maps_num, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the linear head's output for a batch of token-index sequences."""
        # Swap the last two axes so the embedding dimension becomes the
        # channel axis consumed by the convolutions.
        embedded = self.embedding(x).transpose(1, 2)

        branch_outputs = [
            branch(embedded) for branch in (self.conv1, self.conv2, self.conv3)
        ]

        features = self.dropout(torch.cat(branch_outputs, dim=-1))
        return self.fc(features)

In [12]:
# --- Training configuration ---
# Number of passes over the training loader.
epochs = 20
# Loss applied to the model's logits and the integer labels.
criterion = nn.CrossEntropyLoss()
# NOTE(review): learning_rate (and weight_decay below) are not passed to the
# optimizer created in this notebook, so Adadelta runs with its defaults —
# confirm whether that is intended.
learning_rate = 0.001
# Number of samples per batch for every DataLoader.
batch_size = 20
# Embedding table size: one row per vocabulary entry.
num_embeddings = len(all_data.vocab)
# Width of each embedding vector.
embedding_dim = 50
# Dropout probability applied before the final linear layer.
dropout_rate = 0.5
# Number of logits per sample; presumably the number of sentiment classes —
# TODO confirm against the classes found by the dataset's LabelEncoder.
output_dim = 3
# Output channels of each convolutional branch.
feature_maps_num = 100
weight_decay = 1e-5
# Prefer the first CUDA device when available, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [13]:
# Bind the padding index so the collate function matches DataLoader's
# single-argument `collate_fn` signature.
collate = functools.partial(pad_tensors, pad_index=pad_index)
# Only the training data is shuffled; evaluation loaders keep a fixed order so
# validation/test passes are deterministic.
train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size, collate_fn=collate)
val_loader = DataLoader(val_set, shuffle=False, batch_size=batch_size, collate_fn=collate)
test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size, collate_fn=collate)

In [14]:
# Instantiate the classifier and place its parameters on the selected device
# (Module.to returns the module itself, so the call can be chained).
model = SentimentCNN(
    num_embeddings=num_embeddings,
    embedding_dim=embedding_dim,
    padding_idx=pad_index,
    dropout_rate=dropout_rate,
    output_dim=output_dim,
    feature_maps_num=feature_maps_num,
).to(device)
# Adadelta with its default hyperparameters, using the multi-tensor
# ("foreach") implementation.
optimizer = torch.optim.Adadelta(model.parameters(), foreach=True)

In [15]:
%load_ext tensorboard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [16]:
# Train for `epochs` epochs; after each one, log the mean training loss and
# the mean validation loss to TensorBoard and print them.
for epoch in range(1, epochs + 1):
    # ---- training pass ----
    model.train()
    train_loss = []

    bar = tqdm(train_loader, position=0, leave=True,
               desc='epoch %d' % epoch)
    for ids, labels in bar:
        # The model lives on `device`, so each batch has to be moved there
        # too; otherwise the forward pass fails when CUDA is in use.
        ids = ids.to(device)
        labels = labels.to(device)

        logits = model(ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Store a detached copy so the list does not keep every batch's
        # autograd graph alive until the end of the epoch.
        train_loss.append(loss.detach())

    avg_train_loss = torch.stack(train_loss).mean()
    writer.add_scalar('Loss/train', avg_train_loss.item(), epoch)
    print(epoch, '   train_loss', avg_train_loss.item())

    # ---- validation pass (no gradient tracking, dropout disabled) ----
    model.eval()
    with torch.no_grad():
        val_loss = []
        for ids, labels in val_loader:
            ids = ids.to(device)
            labels = labels.to(device)

            logits = model(ids)
            loss = criterion(logits, labels)

            val_loss.append(loss)
        avg_val_loss = torch.stack(val_loss).mean()
        writer.add_scalar('Loss/validation', avg_val_loss.item(), epoch)
        print(epoch, '   val_loss', avg_val_loss.item())

# Make sure all pending scalar events are written to disk.
writer.flush()


epoch 1:   0%|          | 0/9 [00:00<?, ?it/s]

  return F.conv1d(input, weight, bias, self.stride,


1    train_loss 0.5141205787658691
1    val_loss 0.49164122343063354


epoch 2:   0%|          | 0/9 [00:00<?, ?it/s]

2    train_loss 0.35927486419677734
2    val_loss 0.457973837852478


epoch 3:   0%|          | 0/9 [00:00<?, ?it/s]

3    train_loss 0.24405111372470856
3    val_loss 0.43390730023384094


epoch 4:   0%|          | 0/9 [00:00<?, ?it/s]

4    train_loss 0.18245826661586761
4    val_loss 0.5356175899505615


epoch 5:   0%|          | 0/9 [00:00<?, ?it/s]

5    train_loss 0.12524528801441193
5    val_loss 0.4773199260234833


epoch 6:   0%|          | 0/9 [00:00<?, ?it/s]

6    train_loss 0.10725338011980057
6    val_loss 0.7081089019775391


epoch 7:   0%|          | 0/9 [00:00<?, ?it/s]

7    train_loss 0.09060462564229965
7    val_loss 0.5426703691482544


epoch 8:   0%|          | 0/9 [00:00<?, ?it/s]

8    train_loss 0.06071317940950394
8    val_loss 0.6076631546020508


epoch 9:   0%|          | 0/9 [00:00<?, ?it/s]

9    train_loss 0.043578874319791794
9    val_loss 0.5838634371757507


epoch 10:   0%|          | 0/9 [00:00<?, ?it/s]

10    train_loss 0.028925154358148575
10    val_loss 0.6130650639533997


epoch 11:   0%|          | 0/9 [00:00<?, ?it/s]

11    train_loss 0.027547722682356834
11    val_loss 0.6791728138923645


epoch 12:   0%|          | 0/9 [00:00<?, ?it/s]

12    train_loss 0.014116846956312656
12    val_loss 0.6621667742729187


epoch 13:   0%|          | 0/9 [00:00<?, ?it/s]

13    train_loss 0.019220616668462753
13    val_loss 0.6305027604103088


epoch 14:   0%|          | 0/9 [00:00<?, ?it/s]

14    train_loss 0.013695638626813889
14    val_loss 0.7809904217720032


epoch 15:   0%|          | 0/9 [00:00<?, ?it/s]

15    train_loss 0.014398481696844101
15    val_loss 0.8242156505584717


epoch 16:   0%|          | 0/9 [00:00<?, ?it/s]

16    train_loss 0.013161089271306992
16    val_loss 0.7498230934143066


epoch 17:   0%|          | 0/9 [00:00<?, ?it/s]

17    train_loss 0.0123518668115139
17    val_loss 0.7850815057754517


epoch 18:   0%|          | 0/9 [00:00<?, ?it/s]

18    train_loss 0.009659448638558388
18    val_loss 0.7496472597122192


epoch 19:   0%|          | 0/9 [00:00<?, ?it/s]

19    train_loss 0.005958361551165581
19    val_loss 0.7796704769134521


epoch 20:   0%|          | 0/9 [00:00<?, ?it/s]

20    train_loss 0.004821117501705885
20    val_loss 0.8812891840934753
