<a href="https://colab.research.google.com/github/comchem/Pytorch_DeepLearning_Tutorials/blob/master/Sec7-NLP_ANN/1-Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext.legacy.data as ttd

In [5]:
# Toy corpus: three short sentences, each paired with a binary label.
data = dict(
    label=[0, 1, 1],
    data=[
        "I like eggs and ham",
        "Eggs I like!",
        "Ham and eggs or just ham?",
    ],
)

In [6]:
# Wrap the toy dict in a DataFrame: one column per key ('label' and 'data').
df = pd.DataFrame(data)

In [7]:
# Preview the frame; it has only three rows, so head() displays all of them.
df.head()

Unnamed: 0,label,data
0,0,I like eggs and ham
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [8]:
# Write the frame to disk. index=False leaves the row index out, so the file
# contains only the 'label' and 'data' columns.
df.to_csv('thedata.csv', index=False)

In [9]:
# Print the file to confirm what was written (shell command, so this needs a
# Unix-like environment).
!cat thedata.csv

label,data
0,I like eggs and ham
1,Eggs I like!
1,Ham and eggs or just ham?


In [10]:
# Same check using `head`; the file has only four lines, so the whole file is shown.
!head thedata.csv

label,data
0,I like eggs and ham
1,Eggs I like!
1,Ham and eggs or just ham?


In [11]:
# Text column: tokenized with spaCy, lower-cased, batched with the batch
# dimension first, and padded at the front of each sequence.
# NOTE(review): tokenize='spacy' presumably needs spaCy and an English model
# installed in the environment -- confirm before running on a fresh machine.
TEXT = ttd.Field(
    sequential=True,
    tokenize='spacy',
    lower=True,
    batch_first=True,
    pad_first=True,
)

# Label column: a single value per example, used without a vocabulary and
# marked as the target.
#
# Note: without use_vocab=False, torchtext complains later -- when you iterate
# over the dataset -- that the attribute 'vocab' doesn't exist.
#
# Note 2: without is_target=True, torchtext assumes the label is part of the
# input, so iterating over the dataset looks like:
#     for (inputs, targets), _ in iterator:
# where the 2nd element (_) should have been the target.
LABEL = ttd.Field(sequential=False, use_vocab=False, is_target=True)

# Parse the CSV written earlier. The entries in `fields` are listed in the same
# order as the CSV columns (label, then data); skip_header=True skips the
# "label,data" header row.
dataset = ttd.TabularDataset(
    path='thedata.csv',
    format='csv',
    skip_header=True,
    fields=[('label', LABEL), ('data', TEXT)],
)

In [12]:
# Take the third parsed example (index 2) to inspect what the fields produced.
ex = dataset.examples[2]

In [13]:
# Each parsed row is stored as a torchtext Example object.
type(ex)

torchtext.legacy.data.example.Example

In [14]:
# The text is already lower-cased and split into tokens (punctuation becomes its own token).
ex.data

['ham', 'and', 'eggs', 'or', 'just', 'ham', '?']

In [15]:
# The label is still the raw string read from the CSV ('1', not the integer 1).
ex.label

'1'

In [16]:
# Split the examples into train and test sets with a 0.66 train ratio
# (torchtext's default split_ratio is 0.7).
#
# When random_state is omitted, the split is shuffled from whatever state
# Python's global RNG happens to be in, so train/test membership differs between
# runs -- and so does the vocabulary, which is built from the training split.
# Passing a fixed state makes the split repeatable. A private Random instance
# provides that state without reseeding the global generator; 0 is an arbitrary seed.
train_dataset, test_dataset = dataset.split(
    split_ratio=0.66,
    random_state=random.Random(0).getstate(),
)

In [17]:
# Build the vocabulary from the training split only, so tokens that occur only
# in the test split are not added to it.
TEXT.build_vocab(train_dataset)

In [18]:
# Keep a handle on the vocabulary object now attached to the TEXT field and
# display its type.
vocab = TEXT.vocab
type(vocab)

torchtext.vocab.Vocab

In [19]:
# stoi ("string to index"): dict-like lookup from each token to its integer id.
# This is an index mapping, not an embedding -- no vectors are involved.
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f251985ca50>>,
            {'!': 4,
             '<pad>': 1,
             '<unk>': 0,
             '?': 5,
             'and': 6,
             'eggs': 2,
             'ham': 3,
             'i': 7,
             'just': 8,
             'like': 9,
             'or': 10})

In [21]:
# itos ("index to string"): list whose position i holds the token with id i,
# i.e. the inverse of stoi. Again an index mapping, not an embedding.
vocab.itos

['<unk>', '<pad>', 'eggs', 'ham', '!', '?', 'and', 'i', 'just', 'like', 'or']

In [22]:
# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [23]:
# Create one batch iterator per split, each with batch size 2, placing tensors
# on `device`. The sort key orders examples by their number of tokens.
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset, test_dataset),
    batch_sizes=(2, 2),
    sort_key=lambda example: len(example.data),
    device=device,
)

In [30]:
# Flag on the iterator; it is True here for the iterator built from the training split.
train_iter.train

True

In [None]:
# Peek at the first training batch only: show the input and target tensors
# together with their shapes, then stop iterating.
for inputs, targets in train_iter:
    print(f"inputs: {inputs} shape: {inputs.shape}")
    print(f"targets: {targets} shape: {targets.shape}")
    break