### Imports

In [1]:
#sys libs
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manipulation libs
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandarallel import pandarallel
# Initialization
pandarallel.initialize()


#string manipulation libs
import re
import string
from string import digits
import spacy

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import spacy
# Module-level tokenizer shared by the cells below: torchtext's spaCy-backed
# tokenizer, configured for the `en_core_web_sm` English model.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')


INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Download Data

In [1]:
# !wget https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
# !gunzip -f movie_data.csv.gz

In [2]:
# Read the reviews from `movie_data.csv` in the current working directory
# (the commented-out download cell above shows where the file comes from),
# then preview the first rows.
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [20]:
class IMDBDataset(Dataset):
    """Map-style dataset that turns review text into token-id tensors.

    The vocabulary is built once, at construction time, from every entry of
    the ``'review'`` column. Tokens seen fewer than ``min_freq`` times are
    left out of the vocabulary and resolve to the ``'<unk>'`` index.

    Args:
        df: DataFrame with a ``'review'`` column (text) and a ``'sentiment'``
            column (label).
        tokenizer: Callable mapping a string to an iterable of string tokens.
        min_freq: Minimum token count forwarded to torchtext's ``vocab``.

    Attributes:
        tokenizer, min_freq, df: The constructor arguments, stored as given.
        counter: ``Counter`` of token occurrences over all reviews.
        vocab: torchtext vocabulary with the specials ``'<unk>'``,
            ``'<BOS>'``, ``'<EOS>'`` and ``'<PAD>'`` registered.
    """

    def __init__(self, df, tokenizer, min_freq=5):
        self.tokenizer = tokenizer
        self.min_freq = min_freq
        self.df = df
        self.counter = Counter()
        for item in self.df['review']:
            self.counter.update(self.tokenizer(item))
        self.vocab = vocab(self.counter, min_freq=self.min_freq, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
        # Out-of-vocabulary tokens map to '<unk>' instead of raising.
        self.vocab.set_default_index(self.vocab['<unk>'])

    def text_transform(self, text):
        """Encode ``text`` as a list of token ids wrapped in <BOS> ... <EOS>.

        This is a regular method rather than a lambda stored on the instance
        so that the dataset stays picklable (required by ``DataLoader`` worker
        processes started with the ``spawn`` method), and it always uses
        ``self.tokenizer``.

        Args:
            text: Raw review string.

        Returns:
            ``[<BOS> id, token ids..., <EOS> id]`` as a list.
        """
        token_ids = [self.vocab[token] for token in self.tokenizer(text)]
        return [self.vocab['<BOS>']] + token_ids + [self.vocab['<EOS>']]

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for the row at position ``idx``.

        Args:
            idx: Positional (``iloc``) row index.

        Returns:
            Tuple of a 1-D tensor of token ids and a 0-D label tensor.
        """
        # Column-first lookup avoids materialising a whole mixed-dtype row
        # (twice) just to read two scalars.
        text = self.text_transform(self.df['review'].iloc[idx])
        label = self.df['sentiment'].iloc[idx]
        return torch.tensor(text), torch.tensor(label)



In [21]:
# Build the dataset (and its vocabulary) over the full reviews DataFrame;
# constructing it tokenizes every review once to count tokens.
imdb = IMDBDataset(df, tokenizer, min_freq=5)

In [22]:
# Sanity check: fetch the first (token-id tensor, label tensor) pair.
next(iter(imdb))

(tensor([  1,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,
           7,  17,  18,  19,  20,  21,  22,  23,   6,  24,   6,  25,  26,  27,
           7,   0,  28,   6,  29,  21,  30,   6,  31,  32,  33,  34,   7,  35,
          21,  36,  37,  38,  36,  39,  40,  41,  26,  42,  18,  43,  44,  45,
           6,   7,  46,  47,  48,  11,  49,  50,  14,   6,  51,  52,  53,  54,
          55,  56,  57,  58,  59,  34,  60,  61,   0,  34,  62,  63,  64,  38,
          65,  16,  66,   6,  67,  16,  68,   7,  69,  70,  71,  72,  73,  74,
          11,  75,  76,  14,  70,   7,  77,  21,  78,  53,  79,  26,  80,  81,
          82,  38,  83,  84,  85,  86,   6,  87,  70,   7,  88,  21,   7,  89,
          56,  90,  91,  11,  92,  93,  14,  57,  32,  34,  94,  21,   7,  95,
          34,   7,  96,  97,   6,  98,  99,   7, 100,  38,  53, 101,  21, 102,
          38, 103,  16, 104,   7, 105, 106, 107,  34,  24, 108,  52,  53, 109,
         110, 111,   6,  70,   7, 112, 113,  21,  53

In [74]:
class IMDBCollate:
    """Collate function object that pads variable-length token sequences.

    Args:
        pad_idx: Value used to fill the padded positions (typically the
            vocabulary index of the padding token).
        batch_first: Forwarded to ``pad_sequence``; when True the padded
            text tensor has the batch dimension first.
    """

    def __init__(self, pad_idx, batch_first=True):
        self.pad_idx = pad_idx
        self.batch_first = batch_first

    def __call__(self, batch):
        """Pad the texts of ``batch`` to a common length and gather the labels.

        Args:
            batch: Iterable of ``(text_tensor, label)`` pairs.

        Returns:
            Tuple ``(padded_texts, labels)`` where ``padded_texts`` comes from
            ``pad_sequence`` and ``labels`` is a 1-D tensor.
        """
        label_list, text_list = [], []
        for (_text, _label) in batch:
            label_list.append(_label)
            text_list.append(_text)
        # Pad with the configured index; this used to be a hard-coded 3.0,
        # which silently ignored the `pad_idx` given to the constructor.
        padded = pad_sequence(text_list, padding_value=self.pad_idx, batch_first=self.batch_first)
        return padded, torch.tensor(label_list)

In [75]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch, pad_idx=3.0):
    """Pad the text tensors of a batch to equal length and gather the labels.

    Args:
        batch: Iterable of ``(text_tensor, label)`` pairs.
        pad_idx: Value used to fill padded positions. Defaults to 3.0, the
            value this function previously hard-coded.

    Returns:
        Tuple ``(padded_texts, labels)``: ``padded_texts`` is batch-first
        output of ``pad_sequence`` and ``labels`` is a 1-D tensor.
    """
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text)
    # Honour `pad_idx`; the argument used to be accepted but ignored.
    return pad_sequence(text_list, padding_value=pad_idx, batch_first=True), torch.tensor(label_list)

# train_dataloader = DataLoader(imdb, batch_size=2, shuffle=True, 
#                               collate_fn=IMDBCollate(pad_idx=imdb.vocab['<PAD>']))

# train_dataloader = DataLoader(imdb, batch_size=2, shuffle=True, collate_fn=collate_batch)
# Shuffled loader yielding batches of 2 examples; batching is delegated to an
# IMDBCollate instance that is handed the vocabulary's '<PAD>' index.
train_dataloader = DataLoader(imdb, batch_size=2, shuffle=True, collate_fn=IMDBCollate(pad_idx=imdb.vocab['<PAD>']))

In [76]:
# Pull a single (texts, labels) batch from the loader for inspection.
texts_, labels_ = next(iter(train_dataloader))

In [78]:
# Shapes of the padded text batch and of the label batch.
texts_.shape, labels_.shape

(torch.Size([2, 339]), torch.Size([2]))

In [79]:
# Number of rows in the reviews DataFrame.
len(df)

50000