In [1]:
import functools
import sys
from time import sleep

from enum import Enum

import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchmetrics
from torchmetrics import Accuracy, Precision, F1Score

import spacy
import spacy_fastlang

from sklearn import preprocessing

from transformers import BertTokenizer, BertModel

import pandas as pd

from tqdm.notebook import tqdm

from torch.utils.tensorboard import SummaryWriter

In [2]:
class TokenizerType(Enum):
    SPACY = 1
    BERT = 2

In [3]:
class ModelType(Enum):
    CNN = 1
    BERT = 2

In [4]:
PAD_TOKEN = "<pad>"
PADDING_MODE = "same"

In [5]:
def tokenize_and_filter(df: pd.DataFrame):
    filtered_reviews = []
    labels = []

    tokenizer = spacy.load("en_core_web_sm")
    tokenizer.add_pipe("language_detector")

    # tokenize reviews and filter stopwords
    for i, review in enumerate(df["reviews"]):
        if isinstance(review, float):
            continue

        # create document object with lingustic annotations
        document = tokenizer(review)
        
        if document._.language != "en" and document._.language_score >= 0.7:
            continue

        # go through tokens and exclude stopwords
        document = [token.lemma_ for token in document if (len(token.lemma_) > 1) and (token.lemma_.isalnum()) and (not token.is_stop)]
        filtered_reviews.append(document)
        labels.append(df["sentiment"].iloc[i])

    return filtered_reviews, labels

In [6]:
def numericalize_data(all_tokens, vocab):
    ids = []
    for token_set in all_tokens:
        ids.append([vocab[token] for token in token_set])
    
    return ids

In [7]:
class AmazonReviewsDataset(Dataset):
    def __init__(self, filepaths, tokenizer: TokenizerType = TokenizerType.SPACY):
        df = pd.concat(map(pd.read_csv, filepaths))

        self.le = preprocessing.LabelEncoder()
        # self.labels = self.le.fit_transform(df["sentiment"]) 
        
        if tokenizer == TokenizerType.SPACY:
            self.reviews, self.labels = tokenize_and_filter(df)
            self.labels = self.le.fit_transform(self.labels)
            self.vocab = torchtext.vocab.build_vocab_from_iterator(self.reviews, specials=[PAD_TOKEN])
            self.ids = numericalize_data(self.reviews, self.vocab)
        else:
            pass

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.LongTensor(self.ids[idx]), torch.tensor(self.labels[idx], dtype=torch.long)

In [8]:
all_data = AmazonReviewsDataset(["../data-scraper/dishwashing tablet_reviews.csv", "../data-scraper/mug_reviews.csv", "../data-scraper/washing powder_reviews.csv"])
pad_index = all_data.vocab["<pad>"]



In [9]:
def pad_tensors(batch, pad_index):
    ids = []
    labels = []

    for batch_ids, batch_labels in batch:
        ids.append(batch_ids)
        labels.append(batch_labels)

    ids = nn.utils.rnn.pad_sequence(ids, padding_value=pad_index, batch_first=True)
    labels = torch.stack(labels)

    return ids, labels

In [10]:
len(all_data)

5550

In [11]:
train_set, val_set, test_set = torch.utils.data.random_split(all_data, [0.8, 0.1, 0.1])

In [12]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, filter_size: int):
        super().__init__()

        self.linear_stack = nn.Sequential(
            nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
                      padding=PADDING_MODE, kernel_size=filter_size, bias=False),
            nn.ReLU(inplace=True)
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear_stack(x)
        return x.max(dim=-1).values

In [13]:
class SentimentCNN(nn.Module):
    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, dropout_rate: float, output_dim: int, feature_maps_num: int):
        super(SentimentCNN, self).__init__()

        self.embedding = (nn.Embedding(num_embeddings, embedding_dim, padding_idx))

        self.conv1 = (ConvBlock(embedding_dim, feature_maps_num, 3))
        self.conv2 = (ConvBlock(embedding_dim, feature_maps_num, 4))
        self.conv3 = (ConvBlock(embedding_dim, feature_maps_num, 5))
        
        self.fc = (nn.Linear(3 * feature_maps_num, output_dim))
        self.dropout = (nn.Dropout(dropout_rate))
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        x = x.permute(0,2,1)

        x1 = self.conv1(x)
        x2 = self.conv2(x)
        x3 = self.conv3(x)

        x = torch.cat([x1, x2, x3], dim=-1)
        x = self.dropout(x)
        x = self.fc(x)

        return x

In [14]:
class SentimentBERT(nn.Module):
    def __init__(self, hidden_dim: int, n_layers, bidirectional: bool, dropout_rate: float, output_dim: int):  
        super(SentimentBERT, self).__init__()
        
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        embedding_dim = self.bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout_rate)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, text):
        with torch.no_grad():
            embedded = self.bert(text)[0]
        
        _, hidden = self.rnn(embedded)
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        
        output = self.out(hidden)
        
        return output

In [16]:
import json

with open('config.json') as f:
    parameters = json.load(f)
    print(parameters)

criterion = nn.CrossEntropyLoss()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_type = ModelType.CNN
num_embeddings = len(all_data.vocab)

{'epochs': 50, 'learning_rate': 0.001, 'batch_size': 50, 'weight_decay': 0, 'embedding_dim': 300, 'dropout_rate_cnn': 0.5, 'output_dim_cnn': 3, 'feature_maps_num': 100, 'hidden_dim': 256, 'n_layers': 2, 'bidirectional': True, 'dropout_rate_bert': 0.5, 'output_dim_bert': 0.5}


In [27]:
vectors = torchtext.vocab.FastText()

In [28]:
pretrained_embedding = vectors.get_vecs_by_tokens(all_data.vocab.get_itos())

In [56]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv1d):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')

In [18]:
collate = functools.partial(pad_tensors, pad_index=pad_index)
train_loader = DataLoader(train_set, shuffle=True, batch_size=parameters["batch_size"], collate_fn=collate)
val_loader = DataLoader(val_set, shuffle=True, batch_size=parameters["batch_size"], collate_fn=collate)
test_loader = DataLoader(test_set, shuffle=True, batch_size=parameters["batch_size"], collate_fn=collate)

In [19]:
if model_type == ModelType.CNN:
    model = SentimentCNN(num_embeddings, parameters["embedding_dim"], pad_index,
                         parameters["dropout_rate_cnn"], parameters["output_dim_cnn"], parameters["feature_maps_num"])
    model.to(device=device)
    optimizer = torch.optim.Adam(
        model.parameters(), weight_decay=parameters["weight_decay"], lr=parameters["learning_rate"])
else:
    model = SentimentBERT(parameters["hidden_dim"], parameters["n_layers"], parameters["bidirectional"],
                          parameters["dropout_rate_bert"], parameters["output_dim_bert"])
    for name, param in model.named_parameters():
        if name.startswith('bert'):
            param.requires_grad = False
    model.to(device=device)
    optimizer = torch.optim.Adam(
        model.parameters(), weight_decay=parameters["weight_decay"], lr=parameters["learning_rate"])


In [59]:
model.apply(initialize_weights)

SentimentCNN(
  (embedding): Embedding(6575, 300, padding_idx=0)
  (conv1): ConvBlock(
    (linear_stack): Sequential(
      (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,), padding=same, bias=False)
      (1): ReLU(inplace=True)
    )
  )
  (conv2): ConvBlock(
    (linear_stack): Sequential(
      (0): Conv1d(300, 100, kernel_size=(4,), stride=(1,), padding=same, bias=False)
      (1): ReLU(inplace=True)
    )
  )
  (conv3): ConvBlock(
    (linear_stack): Sequential(
      (0): Conv1d(300, 100, kernel_size=(5,), stride=(1,), padding=same, bias=False)
      (1): ReLU(inplace=True)
    )
  )
  (fc): Linear(in_features=300, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [33]:
model.embedding.weight.data = pretrained_embedding

In [20]:
writer = SummaryWriter()

In [22]:
best_valid_loss = float('inf')

train_bar = tqdm(total=len(train_loader), desc="Training", unit="batch")
val_bar = tqdm(total=len(val_loader), desc="Validation", unit="batch")

for epoch in range(1, parameters["epochs"] + 1):
    model.train()
    train_loss = []

    train_bar.reset()
    val_bar.reset()

    for batch in train_loader:
        ids, labels = batch

        ids = ids.to(device=device)
        labels = labels.to(device=device)

        logits = model(ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss.append(loss)

        train_bar.set_postfix(epoch=epoch, loss=loss.item())
        train_bar.update()

    avg_train_loss = torch.stack(train_loss).mean()
    writer.add_scalar('Loss/train', avg_train_loss, epoch)

    model.eval()
    with torch.no_grad():
        val_loss = []

        for batch in val_loader:
            ids, labels = batch
            ids = ids.to(device=device)
            labels = labels.to(device=device)

            logits = model(ids)
            loss = criterion(logits, labels)

            val_loss.append(loss)

            val_bar.set_postfix(epoch=epoch, loss=loss.item())
            val_bar.update()

        avg_val_loss = torch.stack(val_loss).mean()
        writer.add_scalar('Loss/validation', avg_val_loss, epoch)

        if avg_val_loss < best_valid_loss:
            best_valid_loss = avg_val_loss
            torch.save(model.state_dict(), 'cnn.pt')

Training:   0%|          | 0/89 [00:00<?, ?batch/s]

Validation:   0%|          | 0/12 [00:00<?, ?batch/s]

  return F.conv1d(input, weight, bias, self.stride,


In [53]:
def evaluate(model, iterator, criterion):
    
    loss_all = 0
    acc_all = 0
    prec_all = 0
    f1_all = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            ids, labels = batch
            ids = ids.to(device=device)
            labels = labels.to(device=device)

            logits = model(ids)
            
            loss = criterion(logits, labels)

            accuracy = Accuracy(task="multiclass", num_classes=3)
            accuracy = accuracy.to(device=device)
            acc = accuracy(logits, labels)

            precision = Precision(task="multiclass", average='macro', num_classes=3)
            precision = precision.to(device=device)
            prec = precision(logits, labels)

            f1score = F1Score(task="multiclass", num_classes=3)
            f1score = f1score.to(device=device)
            f1 = f1score(logits, labels)

            loss_all += loss.item()
            acc_all += acc.item()
            prec_all += prec.item()
            f1_all += f1.item()
        
    return loss_all / len(iterator), acc_all / len(iterator), prec_all / len(iterator), f1_all / len(iterator)

In [62]:
model.load_state_dict(torch.load('cnn.pt'))

test_loss, test_acc, test_prec, test_f1 = evaluate(model, test_loader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f} | Test Precision: \{test_prec:.3f} | Test F1: {test_f1:.3f}')

Test Loss: 0.768 | Test Acc: 69.00 | Test Precision: \0.689 | Test F1: 0.690
