## Import data into pandas
---

In [1]:
import pandas as pd
import numpy as np
import torch
from torchtext.data import Field, TabularDataset, Iterator, BucketIterator

In [2]:
imdb = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)
imdb.columns = ['reviews', 'sentiment']
imdb.tail()

Unnamed: 0,reviews,sentiment
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0
747,All in all its an insult to one's intelligence...,0


In [3]:
imdb.shape

(748, 2)

In [4]:
imdb.loc[imdb['sentiment']==1].shape

(386, 2)

In [5]:
data = imdb['reviews']
labels = imdb['sentiment']
VALIDATION_SPLIT = 0.2

In [6]:
# Split data into a training and validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [7]:
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [8]:
train_df = pd.concat([x_train, y_train], axis=1)
train_df.head()

Unnamed: 0,reviews,sentiment
328,His losing his marbles so early in the proceed...,0
606,The attempts at humor were pitiful and story i...,0
523,So mediocre in every aspect that it just becom...,0
40,"Frankly, after Cotton club and Unfaithful, it ...",0
283,"And, FINALLY, after all that, we get to an end...",0


In [9]:
val_df = pd.concat([x_val, y_val], axis=1)
val_df.head()

Unnamed: 0,reviews,sentiment
382,"This was reflected not only in the cast, but a...",0
226,This show is made for Americans - it is too st...,0
435,I know that Jim O'Connor was very energetic an...,0
256,"I wasn't expecting Oscar material, but this?",0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [10]:
train_df.shape, val_df.shape

((599, 2), (149, 2))

## Save to csv files
---

In [11]:
train_df.to_csv('train.csv', sep='\t', index=False)
val_df.to_csv('val.csv', sep='\t', index=False)

## PyTorch time
---

In [12]:
tokenize = lambda x: x.split()
# Tokenize the reviews and make sure they're all lowercase
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
# Labels are preprocessed here, so set use_vocab = False
LABEL = Field(sequential=False, use_vocab=False)

In [13]:
trn_datafields = [("reviews", TEXT), ("sentiment", LABEL)]
val_datafields = [("reviews", TEXT), ("sentiment", LABEL)]
trn = TabularDataset(path = "train.csv", format='csv', skip_header=True, fields=trn_datafields)
val = TabularDataset(path = "train.csv", format='csv', skip_header=True, fields=val_datafields)

In [14]:
# To map words to integers we have to tell it what the entire vocabulary is. We run this on the training set.
TEXT.build_vocab(trn)

In [16]:
# Most frequent words
TEXT.vocab.freqs.most_common(10)

[('the', 486),
 ('0', 248),
 ('1', 245),
 ('a', 229),
 ('of', 216),
 ('is', 197),
 ('and', 186),
 ('this', 168),
 ('i', 156),
 ('to', 144)]

In [17]:
# First Element
trn[0]

<torchtext.data.example.Example at 0x7fca02220710>

In [18]:
trn[0].__dict__.keys()

dict_keys(['reviews'])

In [20]:
trn[0].reviews[:5]

['his', 'losing', 'his', 'marbles', 'so']

## Load Data into our model using an iterator
---

In [32]:
train_iter, val_iter = BucketIterator.splits(
                    (trn, val), 
                    batch_size=(64, 64), 
                    # Initiate GPU
                    device=0,
                    # What to use to pad data
                    sort_key=lambda x: len(x.reviews),
                    sort_within_batch=False,
                    # Going to wrap this in an Iterator Layer
                    repeat=False)

In [33]:
batch = next(train_iter.__iter__()); batch

TypeError: '>' not supported between instances of 'int' and 'tuple'

In [35]:
batch.__dict__.keys()

NameError: name 'batch' is not defined