In [1]:
!pip install torchtext==0.6.0
!pip install spacy==2.2.3
!python -m spacy download en

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.6 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
import random
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torchtext.data as ttd

In [3]:
# Let's make some fake data: three short sentences, each with a 0/1 label.
data = dict(
    label=[0, 1, 1],
    data=[
        "I like eggs and ham.",
        "Eggs I like!",
        "Ham and eggs or just ham?",
    ],
)

In [4]:
# Turn the toy dict into a two-column frame (label, data) and preview it.
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,label,data
0,0,I like eggs and ham.
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [5]:
# Persist the frame without the index column so the CSV has exactly two
# columns: label, data.
df.to_csv(path_or_buf="data.csv", index=False)

In [6]:
# Sanity-check the file on disk: header row followed by one line per example.
!head data.csv

label,data
0,I like eggs and ham.
1,Eggs I like!
1,Ham and eggs or just ham?


In [7]:
# Label column: one integer per example, so it is non-sequential and needs
# no vocabulary; it is flagged as the prediction target.
LABEL = ttd.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
)

# Text column: tokenized with spaCy and lower-cased. Batches come out as
# (batch, sequence) and shorter sequences are padded at the front.
TEXT = ttd.Field(
    tokenize="spacy",
    lower=True,
    sequential=True,
    batch_first=True,
    pad_first=True,
)

# Read the CSV written above. The field list follows the CSV column order
# (label first, then data); the header row is skipped.
dataset = ttd.TabularDataset(
    path="data.csv",
    format="csv",
    fields=[("label", LABEL), ("data", TEXT)],
    skip_header=True,
)

In [8]:
# Pull out the first parsed example so its contents can be inspected below.
ex = dataset.examples[0]

In [9]:
# Show which class the dataset uses to hold a single row.
type(ex)

torchtext.data.example.Example

In [10]:
# The `data` attribute holds the text after the TEXT field's preprocessing
# (tokenized and lower-cased).
ex.data

['i', 'like', 'eggs', 'and', 'ham', '.']

In [11]:
# The label is still the raw string read from the CSV at this point
# (note the quotes in the output); it only becomes a tensor when batched.
ex.label

'0'

In [12]:
# Split into train/test. The shuffle behind `split` is random, so pin its
# state: otherwise which sentences land in the training set -- and therefore
# the vocabulary built from it -- changes on every Restart & Run All.
# A private `random.Random` instance is used so the global RNG is not reseeded.
SPLIT_SEED = 0
train_dataset, test_dataset = dataset.split(
    split_ratio=0.66,  # torchtext's default is 0.7
    random_state=random.Random(SPLIT_SEED).getstate(),
)

In [13]:
# Build the token vocabulary from the training split only; the test split is
# not passed in here.
TEXT.build_vocab(train_dataset)

In [14]:
# Keep a handle on the vocabulary object that build_vocab attached to TEXT,
# and show its class.
vocab = TEXT.vocab
type(vocab)

torchtext.vocab.Vocab

In [15]:
# String-to-index mapping: token -> integer id.
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f5d3c199550>>,
            {'<unk>': 0,
             '<pad>': 1,
             'ham': 2,
             'and': 3,
             'eggs': 4,
             '.': 5,
             '?': 6,
             'i': 7,
             'just': 8,
             'like': 9,
             'or': 10})

In [16]:
# Index-to-string list: position i holds the token whose id is i.
vocab.itos

['<unk>', '<pad>', 'ham', 'and', 'eggs', '.', '?', 'i', 'just', 'like', 'or']

In [17]:
# Use the first GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [18]:
# Wrap both splits in batch iterators (batch size 2 each) that place their
# tensors on `device`. The sort key is the token count of an example's
# `data` field.
train_iter, test_iter = ttd.Iterator.splits(
    datasets=(train_dataset, test_dataset),
    batch_sizes=(2, 2),
    sort_key=lambda example: len(example.data),
    device=device,
)

In [19]:
# Peek at the first training batch only: print the token-id matrix and the
# label vector together with their shapes, then stop.
for batch in train_iter:
    inputs, targets = batch
    print("inputs:", inputs, "shape:", inputs.shape)
    print("targets:", targets, "shape:", targets.shape)
    break

inputs: tensor([[ 2,  3,  4, 10,  8,  2,  6],
        [ 1,  7,  9,  4,  3,  2,  5]]) shape: torch.Size([2, 7])
targets: tensor([1, 0]) shape: torch.Size([2])


In [20]:
# Same peek for the test iterator: show the first batch and stop.
# Any id 0 in the inputs is the `<unk>` entry of the vocabulary, i.e. a token
# that was not present in the training split.
for batch in test_iter:
    inputs, targets = batch
    print("inputs:", inputs, "shape:", inputs.shape)
    print("targets:", targets, "shape:", targets.shape)
    break

inputs: tensor([[4, 7, 9, 0]]) shape: torch.Size([1, 4])
targets: tensor([1]) shape: torch.Size([1])
