In [31]:
import functools
import sys
from time import sleep

from enum import Enum

import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import spacy
import spacy_fastlang

from sklearn import preprocessing

from transformers import BertTokenizer

import pandas as pd

from tqdm.notebook import tqdm

from torch.utils.tensorboard import SummaryWriter

In [5]:
class TokenizerType(Enum):
    """Identifies the tokenization backend a dataset should be built with."""
    # spaCy pipeline; this is the backend AmazonReviewsDataset implements.
    SPACY = 1
    # BERT tokenizer; no BERT pipeline is implemented in AmazonReviewsDataset.
    BERT = 2

In [6]:
# Special vocabulary entry whose index is used to pad shorter sequences in a batch.
PAD_TOKEN = "<pad>"
# `padding` argument handed to nn.Conv1d in ConvBlock ("same" keeps the
# convolution output as long as its input, per the PyTorch Conv1d docs).
PADDING_MODE = "same"

In [7]:
def tokenize_and_filter(df: pd.DataFrame):
    """Lemmatize the reviews in ``df`` and pair them with their sentiment labels.

    Rows whose review is a float are skipped (presumably NaN for a missing
    review), as are reviews the language detector classifies as non-English
    with a score of at least 0.7.

    Args:
        df: Frame providing a "reviews" column and a "sentiment" column.

    Returns:
        A ``(filtered_reviews, labels)`` tuple of parallel lists. Each entry of
        ``filtered_reviews`` is the list of lemmas kept for one review; the
        entry at the same position in ``labels`` is that row's sentiment value.
    """
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("language_detector")

    def keep(token) -> bool:
        # Keep lemmas longer than one character that are alphanumeric and not stopwords.
        lemma = token.lemma_
        return len(lemma) > 1 and lemma.isalnum() and not token.is_stop

    kept_lemmas = []
    kept_labels = []

    for row, text in enumerate(df["reviews"]):
        # Float entries cannot be tokenized; skip them.
        if isinstance(text, float):
            continue

        # Document object with linguistic annotations.
        doc = nlp(text)

        # Discard reviews confidently detected as another language.
        is_foreign = doc._.language != "en"
        if is_foreign and doc._.language_score >= 0.7:
            continue

        kept_lemmas.append([token.lemma_ for token in doc if keep(token)])
        # `row` is positional, so look the label up by position as well.
        kept_labels.append(df["sentiment"].iloc[row])

    return kept_lemmas, kept_labels

In [8]:
def numericalize_data(all_tokens, vocab):
    """Translate token sequences into sequences of vocabulary indices.

    Args:
        all_tokens: Iterable of token sequences.
        vocab: Mapping-like object indexed as ``vocab[token]``.

    Returns:
        A list with one list of indices per input sequence, in input order.
    """
    return [[vocab[tok] for tok in sequence] for sequence in all_tokens]

In [9]:
class AmazonReviewsDataset(Dataset):
    """Dataset of tokenized reviews with integer-encoded sentiment labels.

    The CSV files are concatenated, the reviews are tokenized and filtered by
    ``tokenize_and_filter``, the sentiment values are encoded with a
    ``LabelEncoder`` and every token is replaced by its vocabulary index.

    Attributes set by ``__init__``:
        le: Fitted ``LabelEncoder`` for the sentiment values.
        reviews: Lemma lists, one per kept review.
        labels: Encoded labels, parallel to ``reviews``.
        vocab: Vocabulary built from ``reviews`` with ``PAD_TOKEN`` as a special.
        ids: Vocabulary indices, parallel to ``reviews``.
    """

    def __init__(self, filepaths: list[str], tokenizer: TokenizerType = TokenizerType.SPACY):
        """Load and preprocess the reviews stored in ``filepaths``.

        Args:
            filepaths: CSV files to read and concatenate.
            tokenizer: Tokenization backend; only ``TokenizerType.SPACY`` is
                implemented.

        Raises:
            NotImplementedError: If ``tokenizer`` is not ``TokenizerType.SPACY``.
        """
        # Fail fast instead of returning a half-initialised dataset that only
        # breaks later with an AttributeError on labels/ids/vocab.
        if tokenizer != TokenizerType.SPACY:
            raise NotImplementedError(
                f"Tokenizer {tokenizer!r} is not supported yet; use TokenizerType.SPACY."
            )

        df = pd.concat(map(pd.read_csv, filepaths))

        self.le = preprocessing.LabelEncoder()

        # Labels come back already aligned with the reviews that survived filtering.
        self.reviews, raw_labels = tokenize_and_filter(df)
        self.labels = self.le.fit_transform(raw_labels)
        self.vocab = torchtext.vocab.build_vocab_from_iterator(self.reviews, specials=[PAD_TOKEN])
        self.ids = numericalize_data(self.reviews, self.vocab)

    def __len__(self):
        """Return the number of reviews kept after filtering."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx`` as two long tensors."""
        return torch.LongTensor(self.ids[idx]), torch.tensor(self.labels[idx], dtype=torch.long)

In [10]:
# Review CSV files read from ../data-scraper, listed one per line for readability.
REVIEW_FILES = [
    "../data-scraper/dishwashing tablet_reviews.csv",
    "../data-scraper/mug_reviews.csv",
    "../data-scraper/washing powder_reviews.csv",
]
all_data = AmazonReviewsDataset(REVIEW_FILES)
# Look the padding index up through PAD_TOKEN, the same constant the
# vocabulary was built with, rather than repeating the literal.
pad_index = all_data.vocab[PAD_TOKEN]



In [11]:
def pad_tensors(batch, pad_index):
    """Collate ``(ids, label)`` samples into a single padded batch.

    Args:
        batch: Sequence of ``(ids, label)`` tensor pairs.
        pad_index: Value used to fill sequences shorter than the longest one.

    Returns:
        ``(ids, labels)`` where ``ids`` is the batch-first padded stack of the
        id tensors and ``labels`` is the stack of the label tensors.
    """
    samples = list(batch)
    sequences = [sample_ids for sample_ids, _ in samples]
    targets = [sample_label for _, sample_label in samples]

    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_index)
    return padded, torch.stack(targets)

In [12]:
# Seed the split so the 70/15/15 train/validation/test partition is the same
# on every Restart & Run All; without a generator the membership changes per run.
SPLIT_SEED = 42
train_set, val_set, test_set = torch.utils.data.random_split(
    all_data,
    [0.7, 0.15, 0.15],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [13]:
class ConvBlock(nn.Module):
    """Bias-free 1-D convolution followed by ReLU and a max over the last axis."""

    def __init__(self, in_channels: int, out_channels: int, filter_size: int):
        """Create the convolution/activation pair.

        Args:
            in_channels: Number of channels of the input.
            out_channels: Number of feature maps the convolution produces.
            filter_size: Kernel size of the convolution.
        """
        super().__init__()

        convolution = nn.Conv1d(
            in_channels,
            out_channels,
            filter_size,
            padding=PADDING_MODE,
            bias=False,
        )
        self.linear_stack = nn.Sequential(convolution, nn.ReLU(inplace=True))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the stack and keep, per feature map, the largest activation.

        Assumes ``x`` is laid out as nn.Conv1d expects (channels before the
        length axis); the last axis is reduced away by the max.
        """
        activations = self.linear_stack(x)
        pooled, _ = torch.max(activations, dim=-1)
        return pooled

In [14]:
class SentimentCNN(nn.Module):
    """Text classifier: embedding, three parallel ConvBlocks, dropout, linear head."""

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, dropout_rate: float, output_dim: int, feature_maps_num: int):
        """Build the layers.

        Args:
            num_embeddings: Size of the embedding table.
            embedding_dim: Width of each embedding vector.
            padding_idx: Index handed to nn.Embedding as its padding index.
            dropout_rate: Probability used by the dropout layer.
            output_dim: Number of values the linear head emits per sample.
            feature_maps_num: Feature maps produced by each ConvBlock.
        """
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)

        # Three branches that differ only in kernel size (3, 4 and 5).
        self.conv1 = ConvBlock(embedding_dim, feature_maps_num, filter_size=3)
        self.conv2 = ConvBlock(embedding_dim, feature_maps_num, filter_size=4)
        self.conv3 = ConvBlock(embedding_dim, feature_maps_num, filter_size=5)

        # The head consumes the concatenation of the three branch outputs.
        self.fc = nn.Linear(in_features=3 * feature_maps_num, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the head's output for a batch of token-id sequences."""
        # Swap the last two axes so the embedding axis comes before the sequence axis.
        embedded = torch.permute(self.embedding(x), (0, 2, 1))

        branch_outputs = [branch(embedded) for branch in (self.conv1, self.conv2, self.conv3)]
        features = torch.cat(branch_outputs, dim=-1)

        return self.fc(self.dropout(features))

In [45]:
# --- runtime ---
# First CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# --- model architecture ---
num_embeddings = len(all_data.vocab)  # one embedding row per vocabulary entry
embedding_dim = 100
feature_maps_num = 100                # feature maps per convolution branch
dropout_rate = 0.5
output_dim = 3                        # presumably the number of sentiment classes; confirm against the labels

# --- optimisation ---
criterion = nn.CrossEntropyLoss()
epochs = 200
batch_size = 50
learning_rate = 1e-3
weight_decay = 1e-3                   # passed to Adam as its weight_decay

In [48]:
# Bind the padding index once so all three loaders pad their batches the same way.
collate = functools.partial(pad_tensors, pad_index=pad_index)
# Only the training data is reshuffled every epoch. Validation and test data
# keep a fixed order so their batches, and therefore the averaged per-batch
# losses, are the same from one epoch to the next.
train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size, collate_fn=collate)
val_loader = DataLoader(val_set, shuffle=False, batch_size=batch_size, collate_fn=collate)
test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size, collate_fn=collate)

In [49]:
# Build the classifier from the notebook's hyperparameters and place it on `device`.
model = SentimentCNN(
    num_embeddings=num_embeddings,
    embedding_dim=embedding_dim,
    padding_idx=pad_index,
    dropout_rate=dropout_rate,
    output_dim=output_dim,
    feature_maps_num=feature_maps_num,
).to(device)
# Adam over all model parameters, with weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [35]:
# TensorBoard event writer that the training loop logs its epoch losses to.
# No log_dir is given, so the output location is SummaryWriter's default
# (documented as a run folder under ./runs/ - confirm for the installed version).
writer = SummaryWriter()

In [50]:
# One progress bar per phase, reset at the start of every epoch and reused.
train_bar = tqdm(total=len(train_loader), desc="Training", unit="batch")
val_bar = tqdm(total=len(val_loader), desc="Validation", unit="batch")

for epoch in range(1, epochs + 1):
    train_bar.reset()
    val_bar.reset()

    # ---- training pass ----
    model.train()
    train_loss = []

    for ids, labels in train_loader:
        # The model was moved to `device`, so its inputs and targets must be
        # moved too; otherwise the forward pass fails when running on CUDA.
        ids = ids.to(device)
        labels = labels.to(device)

        logits = model(ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Store a detached copy so the list does not keep autograd history alive.
        train_loss.append(loss.detach())

        train_bar.set_postfix(epoch=epoch, loss=loss.item())
        train_bar.update()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = torch.stack(train_loss).mean()
    writer.add_scalar('Loss/train', avg_train_loss, epoch)

    # ---- validation pass (no gradients, dropout disabled by eval()) ----
    model.eval()
    with torch.no_grad():
        val_loss = []

        for ids, labels in val_loader:
            ids = ids.to(device)
            labels = labels.to(device)

            logits = model(ids)
            loss = criterion(logits, labels)

            val_loss.append(loss)

            val_bar.set_postfix(epoch=epoch, loss=loss.item())
            val_bar.update()

        avg_val_loss = torch.stack(val_loss).mean()
        writer.add_scalar('Loss/validation', avg_val_loss, epoch)

# Release the progress-bar widgets and make sure all logged scalars reach disk.
train_bar.close()
val_bar.close()
writer.flush()

Training:   0%|          | 0/78 [00:00<?, ?batch/s]

Validation:   0%|          | 0/17 [00:00<?, ?batch/s]