## Preview Dataset

In [3]:
import pandas as pd

In [9]:
# Preview only: peek at the cleaned dataset, keeping just the text column
# ('review') and the label column ('sentiment'). The modelling cells do not
# use this frame; they read the pre-split CSV files instead.
df = (
    pd.read_csv('../raw_data/steamset_cleaned.csv')
    .loc[:, ['review', 'sentiment']]
)

In [10]:
# Show the first five rows of the preview frame.
df.head(n=5)

Unnamed: 0,review,sentiment
0,i think that games based on nuclear war beyond...,1
1,"man ever since this second dlc was dropped, (t...",0
2,tl;dr - what's available right now leaves me w...,1
3,great small game. is not worth it's price. bes...,1
4,"this game, in my personal opinion, is amazing....",1


## Load Dataset with TorchText

In [6]:
import spacy                # for tokenizer
from torchtext import data

In [7]:
# Load the small English spaCy model. Only tokenization is needed here, so the
# parser, tagger and NER components are disabled at load time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])


def tokenizer(s):
    """Split a string into lowercased token strings using spaCy.

    Args:
        s: the text to tokenize.

    Returns:
        A list with the lowercased text of each token, in order.
    """
    # make_doc() applies only the tokenizer. Calling nlp(s) would additionally
    # run every pipeline component that is still enabled, which is wasted work
    # when only the token text is used.
    return [w.text.lower() for w in nlp.make_doc(s)]

In [12]:
# Field definitions for the two columns used by the model.

# 'review' (text): sequential data, split by `tokenizer` and mapped through a
# vocabulary; include_lengths=True asks torchtext to report sequence lengths
# along with the batch.
txt_field = data.Field(
    tokenize=tokenizer,
    sequential=True,
    use_vocab=True,
    include_lengths=True,
)

# 'sentiment' (label): a single non-sequential value used without a
# vocabulary, so neither a padding token nor an unknown token is configured.
lbl_field = data.Field(
    sequential=False,
    use_vocab=False,
    unk_token=None,
    pad_token=None,
)

# (column name, field) pairs handed to the dataset loader.
fields = [('review', txt_field), ('sentiment', lbl_field)]

In [14]:
# Build the training and validation datasets from the CSV files under
# ../raw_data, parsing each row with the (name, field) pairs in `fields`.
# NOTE(review): skip_header=False means the first line of each file is read as
# a data row - confirm train.csv / validate.csv have no header line and that
# their columns are exactly (review, sentiment) in that order.
train_ds, val_ds = data.TabularDataset.splits(
    path='../raw_data',
    train='train.csv',
    validation='validate.csv',
    format='csv',
    fields=fields,
    skip_header=False,
)