In [1]:
import pandas as pd
import numpy as np


In [2]:
VAL_RATIO = 0.2
def prepare_csv(seed=999):
    
    df_train_EN = pd.read_csv('data/en-de/train.en',sep='\n',names = ['text'])
    df_train_DE = pd.read_csv('data/en-de/train.de',sep='\n',names = ['text'])
    df_train_EN["text"] = \
        df_train_EN.text.str.replace("\n", " ")
    df_train_DE["text"] = \
        df_train_DE.text.str.replace("\n", " ")
    idx = np.arange(df_train_EN.shape[0])
    
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    
    df_train_EN.iloc[idx[val_size:], :].to_csv(
        "data/en-de/train_csv.en", index=False)
    df_train_EN.iloc[idx[:val_size], :].to_csv(
        "data/en-de/val_csv.en", index=False)
    df_train_DE.iloc[idx[val_size:], :].to_csv(
        "data/en-de/train_csv.de", index=False)
    df_train_DE.iloc[idx[:val_size], :].to_csv(
        "data/en-de/val_csv.de", index=False)


In [4]:
prepare_csv()

In [5]:
import re
import spacy
NLP = spacy.load('en')
MAX_CHARS = 20000

def tokenizer(comment):
    comment = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", 
        str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\!+", "!", comment)
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    if (len(comment) > MAX_CHARS):
        comment = comment[:MAX_CHARS]
    return [
        x.text for x in NLP.tokenizer(comment) if x.text != " "]

In [6]:
import logging
import torch
from torchtext import data

In [19]:
LOGGER = logging.getLogger("toxic_dataset")

def get_dataset(fix_length=100, lower=True, vectors=None):
#     if vectors is not None:
#         # pretrain vectors only supports all lower cases
#         lower = True
    LOGGER.debug("Preparing CSV files...")
    #prepare_csv()
    
    comment = data.Field(
        sequential=True,
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=True,
        #tensor_type=torch.cuda.LongTensor,
        lower=lower
    )
    
    LOGGER.debug("Reading train csv file...")
    train, val = data.TabularDataset.splits(
        path='data/en-de', format='csv', skip_header=True,
        train='train_csv.en', validation='val_csv.en',
        fields=[('comment_text', comment)])
    
    LOGGER.debug("Building vocabulary...")
    comment.build_vocab(
        train, val,
        max_size=20000,
        min_freq=50,
#         vectors=vectors
    )
    LOGGER.debug("Done preparing the datasets")
    return train, val

In [20]:
train,val = get_dataset()

In [22]:
def get_iterator(dataset, batch_size, train=True, 
    shuffle=True, repeat=False):
    dataset_iter = data.Iterator(
        dataset, batch_size=batch_size, device=0,
        train=train, shuffle=shuffle, repeat=repeat,
        sort=False
    )
    return dataset_iter

In [29]:
for examples in get_iterator(
            train, 32, train=True,
            shuffle=True, repeat=False
        ):
    x = examples
    print(x)
    break

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.



[torchtext.data.batch.Batch of size 32]
	[.comment_text]:[torch.LongTensor of size 100x32]


In [30]:
type(x)

torchtext.data.batch.Batch