In [36]:
from torchtext import data, datasets
import torch
import numpy  as np
from torch import nn, optim

In [8]:
# Question text: tokenized with spaCy; batch_first=True asks for batch-major tensors.
questions = data.Field(tokenize="spacy", batch_first=True)

# Class labels, stored as floats.
# NOTE(review): the sample label printed further down ('HUM') is categorical; a float
# label dtype is unusual for multi-class targets -- confirm against the loss function used.
labels = data.LabelField(dtype=torch.float)



In [27]:
# Load TREC using the two fields defined earlier; the dataset's own test split is discarded.
train_data, _ = datasets.TREC.splits(text_field=questions, label_field=labels)

# Carve a validation set out of the training data (0.7 is the library's default ratio).
train_data, valid_data = train_data.split(split_ratio=0.7)

print(f"{len(train_data)} {len(valid_data)}")

3816 1636




In [28]:
# Peek at one randomly chosen training example: its token list, then its label.
ith = np.random.randint(0, len(train_data))
example = train_data.examples[ith]
print(example.text, example.label, sep="\n")

['Who', 'loved', 'Flash', 'Gordon', 'besides', 'Dale', '?']
HUM


In [31]:
# Label vocabulary, built from the training split only.
labels.build_vocab(train_data)

# Question vocabulary with pretrained GloVe vectors attached (the archive is downloaded
# into .vector_cache on first use). unk_init supplies the initializer for tokens that
# have no pretrained vector -- here an in-place normal draw.
questions.build_vocab(
    train_data,
    vectors="glove.6B.200d",
    unk_init=torch.Tensor.normal_,
)

.vector_cache/glove.6B.zip: 862MB [11:01, 1.30MB/s]                               
100%|█████████▉| 399807/400000 [00:40<00:00, 14002.80it/s]

In [32]:
# Display the vector table that build_vocab attached to the question vocabulary
# (presumably one row per vocabulary entry -- TODO confirm the shape with .shape).
questions.vocab.vectors

tensor([[-0.3823,  0.4517, -0.6410,  ...,  0.0282, -1.8913, -1.8959],
        [-1.2393, -1.8900, -1.1671,  ..., -1.2466,  0.1187,  1.9907],
        [ 0.3911,  0.4019, -0.1505,  ..., -0.0348,  0.0798,  0.5031],
        ...,
        [ 0.1940, -0.4843, -0.7601,  ...,  0.3863, -0.6567, -0.0112],
        [-0.0364, -0.4738, -0.1913,  ...,  0.4871, -0.0841,  0.5200],
        [ 0.5741, -0.4343, -0.1119,  ...,  0.7629,  0.3831, -0.1570]])

In [34]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One bucketing iterator per split, both yielding batches of 64 examples on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), batch_size=64, device=device
)




In [37]:
class CNN(nn.Module):
    """Convolutional text classifier over batches of token ids.

    Tokens are embedded, passed through one bank of 2-D convolutions per filter
    size (each filter spans the full embedding width), max-pooled over the
    sequence axis, concatenated, regularised with dropout and projected to
    ``output_dim`` scores by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector; also the width of every
            convolution kernel.
        n_filters: Number of filters trained for each filter size.
        filter_sizes: Sized sequence of kernel heights (in tokens); one
            convolution bank is created per entry.
        output_dim: Number of output scores per example.
        dropout: Dropout probability applied to the pooled features.
        pad_idx: Index handed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 n_filters,
                 filter_sizes,
                 output_dim,
                 dropout,
                 pad_idx):
        super().__init__()
        # padding_idx marks the embedding row used to pad sequences to equal length.
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim,
                                      padding_idx=pad_idx)
        # One convolution bank per filter size. The class is nn.Conv2d
        # (lower-case "d"); nn.Conv2D does not exist.
        self.convs = nn.ModuleList(
            [nn.Conv2d(in_channels=1,  # text has no channel axis; forward() adds one
                       out_channels=n_filters,  # same filter count for every size
                       kernel_size=(fs, embedding_dim))
             for fs in filter_sizes]
        )

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        # Shortest sequence the tallest kernel can slide over; used in forward().
        self.min_seq_len = max(filter_sizes, default=0)

    def forward(self, text):
        """Score a batch of token-id sequences.

        Args:
            text: Integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, output_dim) with unnormalised scores.
        """
        # (batch, seq_len, emb) -> (batch, 1, seq_len, emb): add the channel axis Conv2d needs.
        emb = self.embedding(text).unsqueeze(1)

        # A batch shorter than the tallest kernel would make the convolution
        # fail, so extend the sequence axis with zero vectors.
        shortfall = self.min_seq_len - emb.shape[2]
        if shortfall > 0:
            emb = nn.functional.pad(emb, (0, 0, 0, shortfall))

        # Each conv output is (batch, n_filters, positions, 1); drop the trailing axis.
        conved = [nn.functional.relu(conv(emb)).squeeze(3) for conv in self.convs]
        # Max over all positions leaves one value per filter: (batch, n_filters).
        pooled = [nn.functional.max_pool1d(c, c.shape[2]).squeeze(2) for c in conved]
        concat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(concat)
    