In [102]:
import functools

from enum import Enum

import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import spacy

from sklearn import preprocessing

from transformers import BertTokenizer

import pandas as pd

from tqdm.notebook import tqdm

from torch.utils.tensorboard import SummaryWriter

In [103]:
class TokenizerType(Enum):
    """Identifies which tokenization backend a dataset should use."""

    # spaCy-based tokenization; the branch AmazonReviewsDataset implements.
    SPACY = 1
    # BERT-based tokenization; AmazonReviewsDataset does not implement this branch yet.
    BERT = 2

In [112]:
# Special vocabulary entry registered when the vocab is built; its index is the
# fill value used when batching variable-length reviews.
PAD_TOKEN = "<pad>"

In [104]:
def tokenize_and_filter(df: pd.DataFrame) -> list[list[str]]:
    """Lemmatize every review in ``df["reviews"]`` and drop uninformative tokens.

    Each review is run through spaCy's ``en_core_web_sm`` pipeline. A token's
    lemma is kept only when it is longer than one character, purely
    alphanumeric, and the token is not a stop word.

    Missing reviews (NaN/None) are treated as empty text, so they produce an
    empty token list instead of crashing the pipeline. The output therefore
    stays aligned row-for-row with ``df``.

    Args:
        df: Frame with a ``"reviews"`` column holding the review texts.

    Returns:
        One list of lemma strings per row of ``df``, in row order.
    """
    nlp = spacy.load("en_core_web_sm")

    # pandas reads empty CSV cells as NaN (a float), which spaCy cannot process.
    texts = df["reviews"].fillna("").astype(str)

    # nlp.pipe streams the texts through the pipeline in batches and yields the
    # documents in input order; this avoids one full pipeline call per review.
    return [
        [
            token.lemma_
            for token in document
            if len(token.lemma_) > 1 and token.lemma_.isalnum() and not token.is_stop
        ]
        for document in nlp.pipe(texts)
    ]

In [105]:
def numericalize_data(all_tokens, vocab):
    """Translate token sequences into sequences of vocabulary indices.

    Every token is looked up with ``vocab[token]``. The result contains one
    list of indices per entry of ``all_tokens``, in the same order.
    """
    return [[vocab[word] for word in sequence] for sequence in all_tokens]

In [106]:
class AmazonReviewsDataset(Dataset):
    """Tokenized reviews paired with label-encoded sentiments.

    Attributes set by ``__init__``:
        le: ``LabelEncoder`` fitted on the ``"sentiment"`` column.
        labels: Encoded sentiment label for each row.
        reviews: Filtered lemma tokens for each row (``tokenize_and_filter``).
        vocab: Vocabulary built from ``reviews`` with ``PAD_TOKEN`` as a special.
        ids: ``reviews`` converted to vocabulary indices.
    """

    def __init__(self, filepaths: list[str], tokenizer: TokenizerType = TokenizerType.SPACY):
        """Load the CSV files, encode the labels and numericalize the reviews.

        Args:
            filepaths: CSV files to read and concatenate; the code reads their
                ``"reviews"`` and ``"sentiment"`` columns.
            tokenizer: Tokenization backend. Only ``TokenizerType.SPACY`` is
                implemented.

        Raises:
            NotImplementedError: If ``tokenizer`` is not ``TokenizerType.SPACY``.
        """
        # Fail fast: the previous silent no-op left `ids`/`vocab` undefined, so the
        # error only appeared later as an AttributeError inside __getitem__.
        if tokenizer != TokenizerType.SPACY:
            raise NotImplementedError(
                f"Tokenizer {tokenizer!r} is not supported yet; use TokenizerType.SPACY."
            )

        df = pd.concat(map(pd.read_csv, filepaths))

        self.le = preprocessing.LabelEncoder()
        self.labels = self.le.fit_transform(df["sentiment"])

        self.reviews = tokenize_and_filter(df)
        self.vocab = torchtext.vocab.build_vocab_from_iterator(self.reviews, specials=[PAD_TOKEN])
        self.ids = numericalize_data(self.reviews, self.vocab)

    def __len__(self):
        """Return the number of labelled reviews."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the review at position ``idx``.

        ``token_ids`` is a 1-D int64 tensor of vocabulary indices; ``label`` is
        a 0-D int8 tensor holding the encoded sentiment.
        """
        token_ids = torch.tensor(self.ids[idx], dtype=torch.long)
        # NOTE(review): labels are emitted as int8; class-index losses such as
        # F.cross_entropy expect int64 targets — confirm the training loop casts.
        label = torch.tensor(self.labels[idx], dtype=torch.int8)
        return token_ids, label

In [107]:
# Build the dataset from every scraped review file; all reviews are tokenized
# and numericalized here, inside the constructor.
all_data = AmazonReviewsDataset(
    [
        "../data-scraper/dishwashing tablet_reviews.csv",
        "../data-scraper/mug_reviews.csv",
        "../data-scraper/washing powder_reviews.csv",
    ]
)
# Look the padding index up through the shared PAD_TOKEN constant so it cannot
# drift from the special token the vocabulary was built with.
PAD_INDEX = all_data.vocab[PAD_TOKEN]

In [108]:
def pad_tensors(batch, pad_index):
    """Collate ``(ids, label)`` pairs into a padded id tensor and a label tensor.

    Args:
        batch: Iterable of ``(ids, label)`` tensor pairs.
        pad_index: Fill value for sequences shorter than the longest in the batch.

    Returns:
        ``(ids, labels)``: the id sequences padded to a common length with the
        batch dimension first, and the labels stacked along a new leading dimension.
    """
    # Materialize once so the pairs can be split into two lists.
    pairs = [(sequence, target) for sequence, target in batch]

    padded_ids = nn.utils.rnn.pad_sequence(
        [sequence for sequence, _ in pairs],
        batch_first=True,
        padding_value=pad_index,
    )
    stacked_labels = torch.stack([target for _, target in pairs])

    return padded_ids, stacked_labels

In [109]:
# Bind the padding index so the DataLoader can call the collate function with
# only the batch; reuse PAD_INDEX rather than repeating the vocab lookup.
collate = functools.partial(pad_tensors, pad_index=PAD_INDEX)
all_data_loader = DataLoader(all_data, shuffle=True, batch_size=50, collate_fn=collate)