https://github.com/PacktPublishing/Deep-Learning-with-PyTorch/tree/master/Chapter06

In [1]:
import sys
import torch
import torchtext

In [2]:
# Show the PyTorch and Python interpreter versions this notebook was run with.
print(torch.__version__)
print(sys.version)

1.1.0
3.7.3 (default, Apr  3 2019, 05:39:12) 
[GCC 8.3.0]


In [3]:
# Display the interpreter's default string encoding.
sys.getdefaultencoding()

'utf-8'

## Training word embedding by building a sentiment classifier


In [4]:
# Review text field: lower-cased tokens, batch-first tensors, fix_length set to 40 tokens.
TEXT  = torchtext.data.Field(lower = True, batch_first = True, fix_length = 40)
# Label field: one non-sequential value per example.
LABEL = torchtext.data.Field(sequential = False)

In [5]:
# Load the IMDB train/test splits through the TEXT and LABEL fields, with "data" as the root directory.
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL, root = "data")

In [6]:
# Inspect the class of the loaded training split.
type(train)

torchtext.datasets.imdb.IMDB

In [7]:
# Peek at the dataset: its fields, its size, and the attributes of the first example.
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

train.fields {'text': <torchtext.data.field.Field object at 0x7f5898059198>, 'label': <torchtext.data.field.Field object at 0x7f5898059860>}
len(train) 25000
vars(train[0]) {'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', '

In [8]:
# Zhuoer's Note
# Compare the raw token counts of the first five reviews with the fixed length configured on TEXT.
print("Number of words in each text varies, such as", [len(train[i].text) for i in range(5)])
print("But TEXT.fix_length tells iterator (loader) to only first", TEXT.fix_length, "words.")

Number of words in each text varies, such as [140, 428, 147, 124, 120]
But TEXT.fix_length tells iterator (loader) to only first 40 words.


In [9]:
# Load the 300-dimensional GloVe "6B" vectors, caching them under data/vocab.
vocab_vectors = torchtext.vocab.GloVe(name = '6B', dim = 300, cache = "data/vocab")
# Build the text vocabulary from the training split (max_size = 10000, min_freq = 10) and attach the GloVe vectors.
TEXT.build_vocab(train, vectors = vocab_vectors, max_size = 10000, min_freq = 10)
LABEL.build_vocab(train)
# Note: train and test share the same TEXT and LABEL fields, so iterators can be created for both below.

In [10]:
# Label frequencies counted while building the LABEL vocabulary.
LABEL.vocab.freqs

Counter({'pos': 12500, 'neg': 12500})

In [11]:
# List the attributes stored on the text vocabulary object.
vars(TEXT.vocab).keys()

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])

In [12]:
# Show the vector table attached to the vocabulary, and its shape.
print(TEXT.vocab.vectors)
print(TEXT.vocab.vectors.shape)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7724, -0.1800,  0.2072,  ...,  0.6736,  0.2263, -0.2919],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
torch.Size([10002, 300])


In [13]:
# Zhuoer's Note
print("GloVe:", len(vocab_vectors.itos), len(vocab_vectors.stoi), vocab_vectors.vectors.shape, vocab_vectors.dim)
print("max_size + 2(unk, pad),", len(TEXT.vocab.itos))

GloVe: 400000 400000 torch.Size([400000, 300]) 300
max_size + 2(unk, pad), 10002


In [14]:
# The first dataset is assumed to be the training set, and `shuffle` / `sort` are set accordingly.
# Batches of 32 (train) and 128 (test) are produced on the CPU.
train_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, test), batch_sizes = (32, 128),  device = "cpu", repeat = False
)

In [15]:
# Zhuoer's Note
print("initial size: ", len(TEXT.vocab.stoi))
next(iter(train_iter))
print("Words not in the vocabulary gets ID 0, and is saved in stoi, so the latter's size grows.")
print("Load a batch: ", len(TEXT.vocab.stoi))

initial size:  10002
Words not in the vocabulary gets ID 0, and is saved in stoi, so the latter's size grows.
Load a batch:  10147


In [16]:
# Zhuoer's Note
review_sample = ' '.join(train[0].text)
    # create an Example from scratch
example = torchtext.data.Example()
example.text = review_sample.split(' ')
example.label = "pos"
    # create dataset of single item
train2 = torchtext.data.Dataset([example],[('text', TEXT),('label', LABEL)])
test2  = torchtext.data.Dataset([test[0]],[('text', TEXT),('label', LABEL)])

train_iter2, test_iter2 = torchtext.data.BucketIterator.splits(
    (train2,test2), batch_sizes = (1, 1),  device = "cpu", repeat = False
)

print("Raw sentences:\t", review_sample, '\n')
print("First 40 (see fix_length above) words are converted to indexs: \n\t", next(iter(train_iter2)).text, '\n')
print("And we can convert it back:\t", [TEXT.vocab.itos[i] for i in next(iter(train_iter2)).text.numpy()[0] ])

Raw sentences:	 bromwell high is a cartoon comedy. it ran at the same time as some other programs about school life, such as "teachers". my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is "teachers". the scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools i knew and their students. when i saw the episode in which a student repeatedly tried to burn down the school, i immediately recalled ......... at .......... high. a classic line: inspector: i'm here to sack one of your teachers. student: welcome to bromwell high. i expect that many adults of my age think that bromwell high is far fetched. what a pity that it isn't! 

First 40 (see fix_length above) words are converted to indexs: 
	 tensor([[   0,  317,    7,    3, 1309, 1299,   12, 2166,   29,    2,  163,   84,
           15,   45,   80, 8

In [17]:
# In this model the pretrained GloVe vectors are not used: the embedding layer is learned from scratch.
# The vocabulary built from the training data gives the most frequent words integer IDs, while
# other words (such as "bromwell" above) are treated as unknown and get ID 0.
class EmbNet(torch.nn.Module):
    """Embed every token, flatten the embeddings, and classify with one linear layer.

    Args:
        num_embeddings: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        input_len: number of tokens per example; the linear layer expects
            exactly ``embedding_dim * input_len`` features.
        num_classes: number of output classes. Defaults to 3, the value that
            used to be hard-coded.
    """

    def __init__(self, num_embeddings, embedding_dim, input_len, num_classes = 3):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.fc = torch.nn.Linear(embedding_dim * input_len, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes) for token IDs ``x`` of shape (batch, input_len)."""
        # Concatenate the per-token embeddings of each example into one feature vector.
        embeds = self.embedding(x).view(x.size(0), -1)
        out = self.fc(embeds)
        return torch.nn.functional.log_softmax(out, dim = -1)

In [18]:
# First model: 10-dimensional embeddings learned from scratch over the TEXT vocabulary.
model = EmbNet(len(TEXT.vocab), 10, TEXT.fix_length)
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing at this cell on machines without CUDA.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
# Adam over all parameters of the first model.
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [20]:
def fit(epoch, model, data_loader, optimizer, phase = 'training'):
    """Run one pass over ``data_loader`` and report the average loss and accuracy.

    Args:
        epoch: index of the current epoch; accepted for the caller's convenience
            and not used inside the function.
        model: module mapping a batch of inputs to per-class log-probabilities.
        data_loader: iterable of batches exposing ``.text`` and ``.label``, with a
            ``.dataset`` attribute whose length is the number of examples.
        optimizer: optimizer stepped after every batch when ``phase == 'training'``.
        phase: ``'training'`` switches the model to train mode and updates its
            parameters; ``'validation'`` switches it to eval mode. Any other value
            leaves the mode untouched and performs no updates.

    Returns:
        Tuple ``(loss, accuracy)``: the summed NLL loss and the number of correct
        predictions, each divided by ``len(data_loader.dataset)``.
    """
    is_training = phase == 'training'
    if is_training:
        model.train()
    if phase == 'validation':
        model.eval()

    # Follow the device the model lives on rather than assuming CUDA is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    running_loss = 0.0
    running_correct = 0
    for batch in data_loader:
        text, target = batch.text, batch.label
        if device is not None:
            text, target = text.to(device), target.to(device)

        if is_training:
            optimizer.zero_grad()
        # Only build the autograd graph when gradients are actually needed.
        with torch.set_grad_enabled(is_training):
            output = model(text)
            loss = torch.nn.functional.nll_loss(output, target)

        # Statistics are computed on detached values and stored as Python numbers,
        # so no graph or device tensor is kept alive across batches.
        detached = output.detach()
        running_loss += torch.nn.functional.nll_loss(detached, target, reduction = "sum").item()
        preds = torch.argmax(detached, dim = 1)
        running_correct += (preds == target).sum().item()

        if is_training:
            loss.backward()
            optimizer.step()

    loss = running_loss/len(data_loader.dataset)
    accuracy = float(running_correct)/len(data_loader.dataset)

    print(f'{phase:>10}: loss is {loss:4.2f} and {phase:>10} accuracy is {running_correct}/{len(data_loader.dataset)} ({accuracy:.2%})')
    return loss,accuracy

In [21]:
# Per-epoch history of loss and accuracy for the training and validation passes.
train_losses, train_accuracy = [],[]
val_losses, val_accuracy = [],[]

# Train for 10 epochs, evaluating on the test iterator after each one.
for epoch in range(10):
    epoch_loss, epoch_accuracy = fit(epoch, model, train_iter, optimizer, phase = 'training')
    val_epoch_loss , val_epoch_accuracy = fit(epoch, model, test_iter, optimizer, phase = 'validation')
    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)
    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

  training: loss is 0.74 and   training accuracy is 13333/25000 (53.33%)
validation: loss is 0.70 and validation accuracy is 13738/25000 (54.95%)
  training: loss is 0.68 and   training accuracy is 14605/25000 (58.42%)
validation: loss is 0.68 and validation accuracy is 14769/25000 (59.08%)
  training: loss is 0.64 and   training accuracy is 15798/25000 (63.19%)
validation: loss is 0.64 and validation accuracy is 15705/25000 (62.82%)
  training: loss is 0.60 and   training accuracy is 16945/25000 (67.78%)
validation: loss is 0.62 and validation accuracy is 16526/25000 (66.10%)
  training: loss is 0.56 and   training accuracy is 17880/25000 (71.52%)
validation: loss is 0.61 and validation accuracy is 16813/25000 (67.25%)
  training: loss is 0.52 and   training accuracy is 18555/25000 (74.22%)
validation: loss is 0.60 and validation accuracy is 17222/25000 (68.89%)
  training: loss is 0.49 and   training accuracy is 19051/25000 (76.20%)
validation: loss is 0.60 and validation accuracy is

In [22]:
# Zhuoer's Note
print(next(iter(train_iter)).text.shape)
print(model.embedding(next(iter(train_iter)).text.cuda()).shape)

torch.Size([32, 40])
torch.Size([32, 40, 10])


## Using pretrained GloVe word embeddings

In [23]:
class EmbNet2(torch.nn.Module):
    """Embed every token, flatten the embeddings, and classify with one linear layer.

    The embedding table is created with random weights here; the caller may
    overwrite ``self.embedding.weight`` with pretrained vectors afterwards.

    Args:
        num_embeddings: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        input_len: number of tokens per example; the linear layer expects
            exactly ``embedding_dim * input_len`` features.
        num_classes: number of output classes. Defaults to 3, the value that
            used to be hard-coded.
    """

    def __init__(self, num_embeddings, embedding_dim, input_len, num_classes = 3):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.fc = torch.nn.Linear(embedding_dim * input_len, num_classes)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_classes) for token IDs ``x`` of shape (batch, input_len)."""
        # Concatenate the per-token embeddings of each example into one feature vector.
        embeds = self.embedding(x).view(x.size(0), -1)
        out = self.fc(embeds)
        return torch.nn.functional.log_softmax(out, dim = -1)

In [24]:
# Size the embedding layer to match the vocabulary's vector table: (rows, vector dimension).
model2 = EmbNet2(TEXT.vocab.vectors.shape[0], TEXT.vocab.vectors.shape[1], TEXT.fix_length)
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing at this cell on machines without CUDA.
model2 = model2.to("cuda" if torch.cuda.is_available() else "cpu")
model2

EmbNet2(
  (embedding): Embedding(10002, 300)
  (fc): Linear(in_features=12000, out_features=3, bias=True)
)

In [25]:
# Copy the pretrained vectors into the embedding table in place. copy_ transfers
# across devices, so this works wherever model2 lives (no hard-coded .cuda()).
with torch.no_grad():
    model2.embedding.weight.copy_(TEXT.vocab.vectors)

In [26]:
# Freeze the embedding table so that training does not update the pretrained vectors.
model2.embedding.weight.requires_grad = False

In [27]:
# Plain SGD over the linear layer's parameters only; the embedding is not passed to the optimizer.
optimizer2 = torch.optim.SGD(model2.fc.parameters(), lr = 0.001)

In [28]:
# Train the pretrained-embedding model for 10 epochs, evaluating on the test iterator after each one.
for epoch in range(10):
    fit(epoch, model2, train_iter, optimizer2, phase='training')
    fit(epoch, model2, test_iter, optimizer2, phase='validation')

  training: loss is 0.69 and   training accuracy is 14329/25000 (57.32%)
validation: loss is 0.66 and validation accuracy is 15298/25000 (61.19%)
  training: loss is 0.65 and   training accuracy is 16020/25000 (64.08%)
validation: loss is 0.65 and validation accuracy is 15855/25000 (63.42%)
  training: loss is 0.62 and   training accuracy is 16659/25000 (66.64%)
validation: loss is 0.64 and validation accuracy is 16106/25000 (64.42%)
  training: loss is 0.61 and   training accuracy is 17179/25000 (68.72%)
validation: loss is 0.63 and validation accuracy is 16257/25000 (65.03%)
  training: loss is 0.60 and   training accuracy is 17444/25000 (69.78%)
validation: loss is 0.62 and validation accuracy is 16419/25000 (65.68%)
  training: loss is 0.59 and   training accuracy is 17643/25000 (70.57%)
validation: loss is 0.62 and validation accuracy is 16556/25000 (66.22%)
  training: loss is 0.58 and   training accuracy is 17821/25000 (71.28%)
validation: loss is 0.62 and validation accuracy is

## Recurrent neural networks

In [29]:
# Text field for the LSTM: batch_first = False and fix_length = 200 (batches come out as [200, 32] below).
TEXT3  = torchtext.data.Field(lower = True, batch_first = False, fix_length = 200)

# Reload IMDB through the new text field; the LABEL field is reused.
train3, test3 = torchtext.datasets.IMDB.splits(TEXT3, LABEL, root = "data")

# Same vocabulary settings as for TEXT.
TEXT3.build_vocab(train3, vectors = vocab_vectors, max_size = 10000, min_freq = 10)

# Batches of 32 for both splits, produced on the CPU.
train_iter3, test_iter3 = torchtext.data.BucketIterator.splits(
    (train3, test3), batch_size = 32, device = "cpu", repeat = False
)

In [30]:
class IMDBRnn(torch.nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear classifier.

    Args:
        num_embeddings: number of rows in the embedding table.
        embedding_dim: size of each embedding vector; also used as the LSTM hidden size.
        num_classes: number of output classes.
        batch_size: not used by the model; kept so existing callers keep working.
        num_layers: number of stacked LSTM layers.
        dropout_p: dropout probability applied before the classifier. Defaults
            to 0.8, the value that used to be hard-coded.
    """

    def __init__(self, num_embeddings, embedding_dim, num_classes, batch_size, num_layers = 2, dropout_p = 0.8):
        super().__init__()
        self.e = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = torch.nn.LSTM(embedding_dim, embedding_dim, num_layers)
        self.dropout = torch.nn.Dropout(p = dropout_p)
        self.fc = torch.nn.Linear(embedding_dim, num_classes)

    def forward(self, input):
        """Return log-probabilities of shape (batch, num_classes).

        ``input`` holds token IDs laid out as (seq_len, batch), since the LSTM
        is created without ``batch_first``.
        """
        embedded = self.e(input)
        outputs, _ = self.lstm(embedded)
        # outputs[-1] is the LSTM output at the last time step, not "the last layer".
        # NOTE(review): with end-padded fixed-length sequences this step is often padding — confirm.
        last_step = self.dropout(outputs[-1])
        logits = self.fc(last_step)
        return torch.nn.functional.log_softmax(logits, dim = -1)

In [31]:
# LSTM classifier with 100-dimensional embeddings and 3 output classes.
model3 = IMDBRnn(len(TEXT3.vocab), 100, 3, 32)
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing at this cell on machines without CUDA.
model3 = model3.to("cuda" if torch.cuda.is_available() else "cpu")

optimizer3 = torch.optim.Adam(model3.parameters(),lr=0.001)

In [32]:
# Trace tensor shapes through the network. Dedicated names are used so the
# builtin `input` is not rebound for the rest of the kernel session.
batch_text = next(iter(train_iter3)).text
print("input:", batch_text.shape)
# Only the shape is needed, so skip autograd and follow the model's own device.
with torch.no_grad():
    print("output:", model3(batch_text.to(next(model3.parameters()).device)).shape)
# The layers below are fresh, untrained modules used only to show the intermediate shapes.
embedded = torch.nn.Embedding(len(TEXT3.vocab), 100)(batch_text)
print("embeding:", embedded.shape)
lstm_out, _ = torch.nn.LSTM(100, 100, 2)(embedded)
print("LSTM:", lstm_out.shape)
logits = torch.nn.Linear(100, 3)(lstm_out[-1])
print("Linear:", logits.shape)

input: torch.Size([200, 32])
output: torch.Size([32, 3])
embeding: torch.Size([200, 32, 100])
LSTM: torch.Size([200, 32, 100])
Linear: torch.Size([32, 3])


In [33]:
# Train the LSTM model for 10 epochs, evaluating on the test iterator after each one.
for epoch in range(10):
    fit(epoch, model3, train_iter3, optimizer3, phase = 'training')
    fit(epoch, model3, test_iter3,  optimizer3, phase = 'validation')
    # Original note: after ~5 epochs the model overfits, since the validation loss increases.
    # NOTE(review): the recorded output below shows validation loss still falling after epoch 5 — confirm.

  training: loss is 0.71 and   training accuracy is 12492/25000 (49.97%)
validation: loss is 0.69 and validation accuracy is 12500/25000 (50.00%)
  training: loss is 0.70 and   training accuracy is 12586/25000 (50.34%)
validation: loss is 0.70 and validation accuracy is 12500/25000 (50.00%)
  training: loss is 0.70 and   training accuracy is 12488/25000 (49.95%)
validation: loss is 0.69 and validation accuracy is 12504/25000 (50.02%)
  training: loss is 0.69 and   training accuracy is 12768/25000 (51.07%)
validation: loss is 0.69 and validation accuracy is 12500/25000 (50.00%)
  training: loss is 0.69 and   training accuracy is 13574/25000 (54.30%)
validation: loss is 0.71 and validation accuracy is 14164/25000 (56.66%)
  training: loss is 0.56 and   training accuracy is 18336/25000 (73.34%)
validation: loss is 0.50 and validation accuracy is 19307/25000 (77.23%)
  training: loss is 0.43 and   training accuracy is 20433/25000 (81.73%)
validation: loss is 0.45 and validation accuracy is

## Convolutional network on sequence data

In [34]:
# Text field for the convolutional model: batch_first = True and fix_length = 200 (batches come out as [32, 200] below).
TEXT4  = torchtext.data.Field(lower = True, batch_first = True, fix_length = 200)

# Reload IMDB through the new text field; the LABEL field is reused.
train4, test4 = torchtext.datasets.IMDB.splits(TEXT4, LABEL, root = "data")

# Same vocabulary settings as for TEXT.
TEXT4.build_vocab(train4, vectors = vocab_vectors, max_size = 10000, min_freq = 10)

# Batches of 32 for both splits, produced on the CPU.
train_iter4, test_iter4 = torchtext.data.BucketIterator.splits(
    (train4, test4), batch_size = 32, device = "cpu", repeat = False
)

In [35]:
class IMDBCovn(torch.nn.Module):
    """Embedding -> Conv1d -> adaptive average pooling -> dropout -> linear -> log-softmax.

    The original book code does not transpose and uses Conv1d(fix_length, 100, 3)
    instead, with similar (or even better) results. Here in_channels is the length
    of the embedding vector rather than the number of words, which is why the
    embedded batch is transposed before the convolution.
    """

    def __init__(self, num_embeddings, embedding_dim, num_classes, batch_size, kernel_size = 3):
        super().__init__()
        pooled_len = 10  # number of positions kept per channel after pooling
        self.e = torch.nn.Embedding(num_embeddings, embedding_dim)
        self.conv = torch.nn.Conv1d(
            in_channels = embedding_dim,
            out_channels = embedding_dim,
            kernel_size = kernel_size,
        )
        self.avgpool = torch.nn.AdaptiveAvgPool1d(pooled_len)
        self.dropout = torch.nn.Dropout(p = 0.95)
        self.fc = torch.nn.Linear(embedding_dim * pooled_len, num_classes)
        self.softmax = torch.nn.LogSoftmax(dim = -1)

    def forward(self, input):
        # Swap axes 1 and 2 so the embedding dimension becomes the Conv1d channel axis.
        channels_first = self.e(input).transpose(1, 2)
        pooled = self.avgpool(self.conv(channels_first))
        # Flatten everything but the batch axis before the classifier.
        flat = pooled.view(pooled.size(0), -1)
        return self.softmax(self.fc(self.dropout(flat)))

In [36]:
# Convolutional classifier with 100-dimensional embeddings and 3 output classes.
model4 = IMDBCovn(len(TEXT4.vocab), 100, 3, 32)
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing at this cell on machines without CUDA.
model4 = model4.to("cuda" if torch.cuda.is_available() else "cpu")

optimizer4 = torch.optim.Adam(model4.parameters(), lr = 0.001)

In [37]:
# Trace tensor shapes through the network. Dedicated names are used so the
# builtin `input` is not rebound for the rest of the kernel session.
batch_text = next(iter(train_iter4)).text
print("input:", batch_text.shape)
# Only the shape is needed, so skip autograd and follow the model's own device.
with torch.no_grad():
    print("output:", model4(batch_text.to(next(model4.parameters()).device)).shape)
# The layers below are fresh, untrained modules used only to show the intermediate shapes.
embedded = torch.nn.Embedding(len(TEXT4.vocab), 100)(batch_text)
print("embeding:", embedded.shape)
conv_out = torch.nn.Conv1d(100, 100, 3)(embedded.transpose(1, 2))
print("Conv1d:", conv_out.shape)
pooled = torch.nn.AdaptiveAvgPool1d(10)(conv_out)
print("Pool:", pooled.shape)

input: torch.Size([32, 200])
output: torch.Size([32, 3])
embeding: torch.Size([32, 200, 100])
Conv1d: torch.Size([32, 100, 198])
Pool: torch.Size([32, 100, 10])


In [38]:
# Train the convolutional model for 10 epochs, evaluating on the test iterator after each one.
for epoch in range(10):
    fit(epoch, model4, train_iter4, optimizer4, phase = 'training')
    fit(epoch, model4, test_iter4,  optimizer4, phase = 'validation')

  training: loss is 0.77 and   training accuracy is 13450/25000 (53.80%)
validation: loss is 0.61 and validation accuracy is 16996/25000 (67.98%)
  training: loss is 0.62 and   training accuracy is 16922/25000 (67.69%)
validation: loss is 0.47 and validation accuracy is 19373/25000 (77.49%)
  training: loss is 0.50 and   training accuracy is 19197/25000 (76.79%)
validation: loss is 0.41 and validation accuracy is 20286/25000 (81.14%)
  training: loss is 0.44 and   training accuracy is 20201/25000 (80.80%)
validation: loss is 0.39 and validation accuracy is 20512/25000 (82.05%)
  training: loss is 0.39 and   training accuracy is 20881/25000 (83.52%)
validation: loss is 0.36 and validation accuracy is 20987/25000 (83.95%)
  training: loss is 0.36 and   training accuracy is 21202/25000 (84.81%)
validation: loss is 0.37 and validation accuracy is 20943/25000 (83.77%)
  training: loss is 0.34 and   training accuracy is 21470/25000 (85.88%)
validation: loss is 0.37 and validation accuracy is