In [1]:
import numpy as np
import spacy
import tqdm
from torch.utils.data import Dataset
import pandas
import torch
import sklearn.metrics
import torch.utils.data



 That one hot + neural network model did not work very well!
 So now for a different
 technique that treats text as a sequence,
 this will involve recurrent
 networks, using a particular kind called an LSTM.

In [2]:
nlp = spacy.load('en_core_web_lg')

 Using the same dataset of sentiments on movie reviews, we
 will use a pre-trained language model from spacy.
 Using wikipedia, spacy comes pretrained with word vectors,
 which are dense encodings, so instead of one hot encoding,
 we use the word vector.
 The nice thing about this is we actually do less
 work to set up our data AND our model starts
  with knowledge from the language
 model built over wikipedia.
 Here is an example word vector:

In [3]:
for token in nlp('hello'):
    print(token)
    print(token.vector)
    print(token.vector.shape)

hello
[ 2.2407e+00  1.0389e+00  1.3092e+00 -1.7335e+00 -7.8466e-01 -2.9269e-01
 -1.8059e+00 -2.5223e+00  7.8025e-01  2.4899e+00 -9.1849e-02  2.8755e-01
 -1.5057e+00  2.6337e+00  2.5252e+00 -2.2432e-01 -2.2068e+00 -5.7895e-01
 -5.6551e-01 -1.9338e+00  1.4973e+00  8.5889e-01  3.3559e+00 -3.7527e+00
  2.2585e-01 -1.6969e-01  5.1389e-01  4.6073e-01 -2.8248e-01 -2.6048e+00
 -3.5896e+00 -1.0971e+00 -1.5517e+00 -1.2185e-01  2.8633e+00 -1.2525e+00
 -1.6924e+00 -2.2917e+00  9.7793e-01  4.6954e-01 -3.5950e+00 -1.7357e-01
  9.8050e-01 -1.8044e+00 -7.2183e-01 -4.0709e-01 -3.0943e+00  1.3095e-01
 -2.9015e+00  1.4768e+00 -1.0588e+00 -2.8123e+00  1.2936e+00 -7.5977e-03
  2.9975e+00 -2.4438e+00  1.2348e-01  1.8322e+00  3.5869e-01 -1.8335e-02
  1.9534e+00  1.4417e+00  9.9895e-01 -2.8209e+00 -7.5846e-01 -1.8438e+00
 -3.2658e+00 -4.6574e-01  9.0322e-01  7.9868e-01 -1.6134e+00 -3.3082e-01
  1.1541e+00 -4.7334e+00  1.4964e+00 -2.4014e+00 -1.3461e+00 -9.5551e-01
  2.9671e-01 -1.4506e+00 -8.7128e-01 -3.0714e

 And for the dataset, we just extract the vectors as tensors
 and return the length of each string in tokens.
 This is important for working with pytorch recurrent networks.

In [4]:
class SentimentDataset(Dataset):
    def __init__(self):
        self.data = pandas \
            .read_csv('C:\github\pytorch-dl7\sentiment.tsv', sep='\t', header=0) \
            .groupby('SentenceId') \
            .first()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if type(idx) is torch.Tensor:
            idx = idx.item()
        sample = self.data.iloc[idx]
        token_vectors = []
        # switching off NER for a tiny speed boost
        for token in nlp(sample.Phrase.lower(), disable=['ner']):
            token_vectors.append(token.vector)

        # tokens and length as inputs -- the length
        # is needed to 'pack' variable length sequences
        # output is the sentiment score 
        return (torch.tensor(token_vectors),
                torch.tensor(len(token_vectors)),
                torch.tensor(sample.Sentiment))


sentiment = SentimentDataset()
sentiment[0]

  return (torch.tensor(token_vectors),


(tensor([[ -9.3629,   9.2761,  -7.2708,  ...,   2.6801,  -6.8160,   3.5737],
         [ -2.9056,  -2.5910,   3.3317,  ...,  -7.2864,  -3.5043,  -2.2919],
         [-12.6670,  -6.5680,  -0.6154,  ...,  -8.0021,  -0.3171,  -7.7062],
         ...,
         [ -9.3629,   9.2761,  -7.2708,  ...,   2.6801,  -6.8160,   3.5737],
         [  2.7879,  -2.6493,  -0.4601,  ...,  -0.6575,  -5.5132,   0.4322],
         [ -0.0765,  -4.6896,  -4.0431,  ...,   1.3040,  -0.5270,  -1.3622]]),
 tensor(37),
 tensor(1))

In [5]:
# break this into a training and testing dataset, and need
# to collate into fixed width as these will be
# variable batches
def collate(batch):
    # sort indescending length order -- this is needed for
    # padding seqeunces in pytorch
    batch.sort(key=lambda x: x[1], reverse=True)
    sequences, lengths, sentiments = zip(*batch)
    sequences = torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True
    )
    sentiments = torch.stack(sentiments)
    lengths = torch.stack(lengths)
    return sequences, lengths, sentiments


number_for_testing = int(len(sentiment) * 0.05)
number_for_training = len(sentiment) - number_for_testing
train, test = torch.utils.data.random_split(sentiment,
                                            [number_for_training, number_for_testing])
trainloader = torch.utils.data.DataLoader(
    train, batch_size=32, shuffle=True,
    collate_fn=collate)
testloader = torch.utils.data.DataLoader(
    test, batch_size=32, shuffle=True,
    collate_fn=collate)

# take a peek and see what we are collating
for batch in trainloader:
    print(batch[0].shape, batch[1].shape, batch[2].shape)
    # what is the max length?
    print(batch[1][0])
    break

torch.Size([32, 38, 300]) torch.Size([32]) torch.Size([32])
tensor(38)


 Now, this is still a regression problem, but instead of
 one hot encoded words and a plain nerual network,
 we will have sequences
 of word vectors, from the learned wikipedia model.
 These sequences in turn will be * packed * which
 is because they all have different lengths,
 run through the recurrent network
 which loops word vector by word vector to compute a final
 numerical representation of the whole sequence - -just like
 reading - -word for word in order.
 This is why we need the sequence lengths, you need to know
 the boundaries on which to pack.

In [6]:
class Model(torch.nn.Module):

    def __init__(self, input_dimensions, size=128, layers=1):
        super().__init__()
        self.seq = torch.nn.LSTM(input_dimensions, size, layers)
        self.layer_one = torch.nn.Linear(size * layers, size)
        self.activation_one = torch.nn.ReLU()
        self.layer_two = torch.nn.Linear(size, size)
        self.activation_two = torch.nn.ReLU()
        self.shape_outputs = torch.nn.Linear(size, 5)

    def forward(self, inputs, lengths):
        # need to sort the sequences for pytorch -- which we
        # did in our collation above
        number_of_batches = lengths.shape[0]
        packed_inputs = torch.nn.utils.rnn.pack_padded_sequence(
            inputs,
            lengths,
            batch_first=True)
        buffer, (hidden, cell) = self.seq(packed_inputs)
        # batch first...
        buffer = hidden.permute(1, 0, 2)
        # flatten out the last hidden state -- this will
        # be the tensor representing each batch
        buffer = buffer.contiguous().view(number_of_batches, -1)
        # and feed along to a simple output network with
        # a single output cell for regression
        buffer = self.layer_one(buffer)
        buffer = self.activation_one(buffer)
        buffer = self.layer_two(buffer)
        buffer = self.activation_two(buffer)
        buffer = self.shape_outputs(buffer)
        return buffer


# get the input dimensions from the first sample
# encodings are word, vectors - so index 1 at the end
model = Model(sentiment[0][0].shape[1])

In [7]:
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.CrossEntropyLoss()
model.train()
for epoch in range(64):
    losses = []
    for sequences, lengths, sentiments in tqdm.tqdm(trainloader):
        optimizer.zero_grad()
        results = model(sequences, lengths)
        loss = loss_function(results, sentiments)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("Loss: {0}".format(torch.tensor(losses).mean()))

100%|██████████| 254/254 [00:46<00:00,  5.51it/s]


Loss: 1.4734059572219849


100%|██████████| 254/254 [00:45<00:00,  5.60it/s]


Loss: 1.3330905437469482


100%|██████████| 254/254 [00:46<00:00,  5.49it/s]


Loss: 1.2441843748092651


100%|██████████| 254/254 [00:46<00:00,  5.51it/s]


Loss: 1.1678261756896973


100%|██████████| 254/254 [00:45<00:00,  5.52it/s]


Loss: 1.0837353467941284


100%|██████████| 254/254 [00:46<00:00,  5.47it/s]


Loss: 0.9996069073677063


100%|██████████| 254/254 [00:47<00:00,  5.36it/s]


Loss: 0.9185651540756226


100%|██████████| 254/254 [00:44<00:00,  5.76it/s]


Loss: 0.8219011425971985


100%|██████████| 254/254 [00:43<00:00,  5.83it/s]


Loss: 0.7521815299987793


100%|██████████| 254/254 [00:43<00:00,  5.84it/s]


Loss: 0.6641423106193542


100%|██████████| 254/254 [00:45<00:00,  5.55it/s]


Loss: 0.5820595026016235


100%|██████████| 254/254 [00:46<00:00,  5.41it/s]


Loss: 0.5267065167427063


100%|██████████| 254/254 [00:46<00:00,  5.49it/s]


Loss: 0.4469405710697174


100%|██████████| 254/254 [00:47<00:00,  5.40it/s]


Loss: 0.4057372808456421


100%|██████████| 254/254 [00:46<00:00,  5.42it/s]


Loss: 0.37197065353393555


100%|██████████| 254/254 [00:45<00:00,  5.54it/s]


Loss: 0.35849249362945557


100%|██████████| 254/254 [00:47<00:00,  5.30it/s]


Loss: 0.29713374376296997


100%|██████████| 254/254 [00:46<00:00,  5.44it/s]


Loss: 0.25557243824005127


100%|██████████| 254/254 [00:45<00:00,  5.61it/s]


Loss: 0.25849229097366333


100%|██████████| 254/254 [00:45<00:00,  5.57it/s]


Loss: 0.2347264587879181


100%|██████████| 254/254 [00:43<00:00,  5.78it/s]


Loss: 0.20148402452468872


100%|██████████| 254/254 [00:43<00:00,  5.83it/s]


Loss: 0.19072909653186798


100%|██████████| 254/254 [00:43<00:00,  5.81it/s]


Loss: 0.20309260487556458


100%|██████████| 254/254 [00:43<00:00,  5.83it/s]


Loss: 0.16435274481773376


100%|██████████| 254/254 [00:43<00:00,  5.82it/s]


Loss: 0.1608603596687317


100%|██████████| 254/254 [00:50<00:00,  5.04it/s]


Loss: 0.1625564992427826


100%|██████████| 254/254 [00:49<00:00,  5.09it/s]


Loss: 0.13566236197948456


100%|██████████| 254/254 [00:49<00:00,  5.18it/s]


Loss: 0.1588263362646103


100%|██████████| 254/254 [00:49<00:00,  5.13it/s]


Loss: 0.15672093629837036


100%|██████████| 254/254 [00:49<00:00,  5.14it/s]


Loss: 0.14521686732769012


100%|██████████| 254/254 [00:51<00:00,  4.98it/s]


Loss: 0.1399264931678772


100%|██████████| 254/254 [00:48<00:00,  5.22it/s]


Loss: 0.13188566267490387


100%|██████████| 254/254 [00:44<00:00,  5.69it/s]


Loss: 0.1885973960161209


100%|██████████| 254/254 [00:44<00:00,  5.75it/s]


Loss: 0.12250088155269623


100%|██████████| 254/254 [00:44<00:00,  5.67it/s]


Loss: 0.08807788789272308


100%|██████████| 254/254 [00:44<00:00,  5.74it/s]


Loss: 0.0928792729973793


100%|██████████| 254/254 [00:44<00:00,  5.74it/s]


Loss: 0.13396723568439484


100%|██████████| 254/254 [00:44<00:00,  5.77it/s]


Loss: 0.1313169151544571


100%|██████████| 254/254 [00:44<00:00,  5.75it/s]


Loss: 0.10199742019176483


100%|██████████| 254/254 [00:44<00:00,  5.69it/s]


Loss: 0.06540792435407639


100%|██████████| 254/254 [00:44<00:00,  5.77it/s]


Loss: 0.06976909935474396


100%|██████████| 254/254 [00:43<00:00,  5.79it/s]


Loss: 0.14090768992900848


100%|██████████| 254/254 [00:43<00:00,  5.79it/s]


Loss: 0.13151879608631134


100%|██████████| 254/254 [00:43<00:00,  5.79it/s]


Loss: 0.09350687265396118


100%|██████████| 254/254 [00:43<00:00,  5.83it/s]


Loss: 0.1081337183713913


100%|██████████| 254/254 [00:43<00:00,  5.81it/s]


Loss: 0.07678601890802383


100%|██████████| 254/254 [00:43<00:00,  5.82it/s]


Loss: 0.11498833447694778


100%|██████████| 254/254 [00:43<00:00,  5.81it/s]


Loss: 0.10333216190338135


100%|██████████| 254/254 [00:43<00:00,  5.79it/s]


Loss: 0.07117589563131332


100%|██████████| 254/254 [00:43<00:00,  5.80it/s]


Loss: 0.04350518807768822


100%|██████████| 254/254 [00:43<00:00,  5.81it/s]


Loss: 0.08667664229869843


100%|██████████| 254/254 [00:43<00:00,  5.79it/s]


Loss: 0.10806074738502502


100%|██████████| 254/254 [00:43<00:00,  5.79it/s]


Loss: 0.11067166179418564


100%|██████████| 254/254 [11:12:08<00:00, 158.77s/it]      


Loss: 0.08212721347808838


100%|██████████| 254/254 [00:44<00:00,  5.76it/s]


Loss: 0.0676676332950592


100%|██████████| 254/254 [00:43<00:00,  5.85it/s]


Loss: 0.05687376484274864


100%|██████████| 254/254 [00:44<00:00,  5.77it/s]


Loss: 0.09348174929618835


100%|██████████| 254/254 [00:49<00:00,  5.10it/s]


Loss: 0.08590392768383026


100%|██████████| 254/254 [00:49<00:00,  5.11it/s]


Loss: 0.10759774595499039


100%|██████████| 254/254 [00:47<00:00,  5.37it/s]


Loss: 0.09520349651575089


100%|██████████| 254/254 [00:46<00:00,  5.41it/s]


Loss: 0.05474971607327461


100%|██████████| 254/254 [00:50<00:00,  5.08it/s]


Loss: 0.05004001408815384


100%|██████████| 254/254 [00:47<00:00,  5.31it/s]


Loss: 0.06291337311267853


100%|██████████| 254/254 [00:47<00:00,  5.31it/s]

Loss: 0.11179642379283905





In [8]:
results_buffer=[]
actual_buffer=[]
with torch.no_grad():
    model.eval()
    for test_seq, test_len, test_sentiment in testloader:
        results=model(test_seq, test_len).argmax(dim=1).numpy()
        results_buffer.append(results)
        actual_buffer.append(test_sentiment)

print(sklearn.metrics.classification_report(
    np.concatenate(actual_buffer),
    np.concatenate(results_buffer)))

              precision    recall  f1-score   support

           0       0.35      0.33      0.34        48
           1       0.44      0.43      0.43       112
           2       0.32      0.31      0.31        91
           3       0.42      0.48      0.45       112
           4       0.51      0.43      0.47        63

    accuracy                           0.41       426
   macro avg       0.41      0.40      0.40       426
weighted avg       0.41      0.41      0.41       426

