In [15]:
from enum import Enum

import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn import preprocessing

from transformers import BertTokenizer

import pandas as pd

from tqdm.notebook import tqdm

from torch.utils.tensorboard import SummaryWriter

In [12]:
class TokenizerType(Enum):
    """Identifies which tokenizer a dataset should use to process review text."""

    # spaCy tokenization; this is the default picked by AmazonReviewsDataset.
    SPACY = 1
    # BERT tokenization (the notebook imports BertTokenizer for this purpose).
    BERT = 2

In [14]:
def tokenize_and_filter(df: pd.DataFrame) -> list:
    """Tokenize every review in ``df`` and drop stop words and noise tokens.

    Args:
        df: DataFrame with a "reviews" column holding the raw review texts.

    Returns:
        One string per review, in row order: the lemmas of the kept tokens
        joined by single spaces. A token is kept when its lemma is longer
        than one character, is purely alphanumeric, and the token is not a
        stop word.
    """
    tokenizer = torchtext.data.utils.get_tokenizer("spacy", language="en_core_web_sm")

    # NOTE(review): the filter below reads spaCy token attributes (`lemma_`,
    # `is_stop`). torchtext's "spacy" tokenizer appears to return plain
    # strings rather than spaCy tokens -- confirm, and switch to a full spaCy
    # pipeline (e.g. `spacy.load("en_core_web_sm")`) if that is the case.
    filtered_reviews = []
    for review in df["reviews"]:
        # Tokenize the review of the current row.
        document = tokenizer(review)

        # Keep lemmas of meaningful tokens only: longer than one character,
        # alphanumeric, and not a stop word.
        kept_lemmas = [
            token.lemma_
            for token in document
            if len(token.lemma_) > 1 and token.lemma_.isalnum() and not token.is_stop
        ]
        filtered_reviews.append(" ".join(kept_lemmas))

    return filtered_reviews

In [None]:
def numericalize_data(example, vocab):
    """Translate the tokens of one example into their vocabulary indices.

    Args:
        example: Mapping whose 'tokens' entry is an iterable of tokens.
        vocab: Object supporting ``vocab[token]`` lookup.

    Returns:
        A dict with the single key 'ids' holding the looked-up indices in
        the same order as the tokens.
    """
    token_ids = []
    for tok in example['tokens']:
        token_ids.append(vocab[tok])
    return {'ids': token_ids}

In [None]:
class AmazonReviewsDataset(Dataset):
    """Dataset over review CSV files with label-encoded sentiments.

    Attributes:
        le: LabelEncoder fitted on the "sentiment" column.
        labels: Encoded sentiment of every row (one entry per review).
        reviews: Filtered review strings when the SPACY tokenizer is
            selected, otherwise None.
        vocab: Vocabulary built from the filtered reviews when the SPACY
            tokenizer is selected, otherwise None.
    """

    def __init__(self, filepaths, tokenizer: TokenizerType = TokenizerType.SPACY):
        """Load the CSV files, encode the labels and prepare the vocabulary.

        Args:
            filepaths: Iterable of CSV inputs accepted by ``pd.read_csv``.
                Every file must provide a "sentiment" column, plus a
                "reviews" column when the SPACY tokenizer is used.
            tokenizer: Which tokenizer to prepare. Only
                ``TokenizerType.SPACY`` is handled so far.
        """
        # ignore_index gives the combined frame one unique 0..n-1 index
        # instead of repeating each file's own row numbers.
        df = pd.concat(map(pd.read_csv, filepaths), ignore_index=True)

        self.le = preprocessing.LabelEncoder()
        # Encode every row. Fitting on the unique values only would yield one
        # label per class, making __len__ report the class count.
        self.labels = self.le.fit_transform(df["sentiment"])

        self.reviews = None
        self.vocab = None

        if tokenizer == TokenizerType.SPACY:
            self.reviews = tokenize_and_filter(df)
            # Each filtered review is a space-joined string; split it back
            # into tokens so the vocabulary is built over words rather than
            # over single characters, and keep the result.
            self.vocab = torchtext.vocab.build_vocab_from_iterator(
                review.split() for review in self.reviews
            )
        # TODO: TokenizerType.BERT is accepted but nothing is prepared for it yet.

    def __len__(self):
        """Return the number of samples (one per encoded label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx``; not implemented yet.

        Raises:
            NotImplementedError: Always, so that iterating the dataset fails
                loudly instead of silently yielding None.
        """
        # TODO: decide on the sample format (e.g. token ids plus label).
        raise NotImplementedError(
            "AmazonReviewsDataset.__getitem__ is not implemented yet"
        )