## Requirements

In [45]:
from torchtext import data
import spacy
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizerFast, BertForSequenceClassification
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from tokenizers import BertWordPieceTokenizer
from pathlib import Path

## Train Tokenizer

In [46]:
# CSV files handed to the WordPiece trainer as its training corpus.
paths = [
    '../raw_data/train.csv',
    '../raw_data/val.csv',
]

In [47]:
# Untrained BERT-style WordPiece tokenizer with the text-cleaning and
# accent-stripping normaliser options switched on.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    strip_accents=True,
)

In [48]:
# Learn a WordPiece vocabulary from the files in `paths`: at most 52k entries,
# keeping only tokens that occur at least twice.
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, show_progress=True)

# save_model writes vocab.txt into an existing directory, so create the
# directory first; otherwise this cell fails on a fresh checkout.
Path("steam_tokenized").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("steam_tokenized")

['steam_tokenized/vocab.txt']

## Load Dataset

In [49]:
# Reload the vocabulary saved in ./steam_tokenized through the Hugging Face
# fast tokenizer; from here on `tokenizer` refers to this object.
tokenizer = BertTokenizerFast.from_pretrained('./steam_tokenized', show_progress=True)

# Look up the ids of the padding and unknown tokens. Only the last
# expression of a cell is echoed, so the displayed value is the unknown-token id.
pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
unk_token_id = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
unk_token_id

1

In [51]:
# the two columns we will be using: text(review), label(sentiment)
#
# txt_field: reviews are converted straight to token ids by the Hugging Face
# tokenizer (use_vocab=False), and each batch also carries the sequence lengths
# (include_lengths=True).
txt_field = data.Field(sequential=True,
                       use_vocab=False,
                       tokenize=tokenizer.encode,
                       # With use_vocab=False the padding value is inserted as-is,
                       # so it must be an integer id; the default '<pad>' string
                       # cannot be converted to a tensor alongside the token ids.
                       pad_token=tokenizer.convert_tokens_to_ids(tokenizer.pad_token),
                       unk_token=tokenizer.convert_tokens_to_ids(tokenizer.unk_token),
                       include_lengths=True)

# lbl_field: a single value per example, used without a vocabulary and with no
# padding or unknown token.
# NOTE(review): presumably the CSV stores the label as an integer - confirm.
lbl_field = data.Field(sequential=False,
                       use_vocab=False,
                       pad_token=None,
                       unk_token=None)

# Column order matters: entries are matched to the CSV columns positionally.
fields = [
    ('review', txt_field),
    ('sentiment', lbl_field)
]

In [54]:
%%time
# load up the train and validation sets
train_ds, val_ds = data.TabularDataset.splits(path='../raw_data', 
                                              train='train.csv', 
                                              validation='val.csv', 
                                              format='csv', 
                                              fields=fields, 
                                              skip_header=False)

CPU times: user 6min 27s, sys: 1.53 s, total: 6min 29s
Wall time: 6min 29s


In [58]:
# Device that batches are placed on: the first CUDA GPU when one is available,
# otherwise the CPU so the notebook still runs on machines without CUDA.
gpu = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [61]:
def review_length(example):
    """Return the number of token ids in an example's 'review' field.

    Used as the bucketing sort key. The text column is registered as
    'review' in `fields`, so the examples have no `.text` attribute.
    """
    return len(example.review)


# Batches of 64 examples grouped by review length to keep padding small;
# sort_within_batch=True orders each batch by that same key.
# NOTE(review): sort=True makes every epoch visit the training data in the same
# length-sorted order; consider sort=False for train_iter so batches are shuffled.
train_iter = data.BucketIterator(train_ds, batch_size=64, sort_key=review_length, device=gpu, train=True, sort=True, sort_within_batch=True)
# The validation iterator is marked train=False because it is used for evaluation only.
valid_iter = data.BucketIterator(val_ds, batch_size=64, sort_key=review_length, device=gpu, train=False, sort=True, sort_within_batch=True)