In [1]:
import torch
from torch import autograd
from torch import nn
from torch import optim
import torch.nn.functional as F

# Seed PyTorch's global random number generator so the random tensors
# created in later cells are reproducible on a fresh "Restart & Run All".
torch.manual_seed(1)

<torch._C.Generator at 0x7a576a23fe70>

# 1. Introduction to Torch's tensor library

In [2]:
# Creating tensors from nested Python lists.
# torch.tensor is the recommended factory for existing data (the legacy
# torch.Tensor constructor is kept only for backward compatibility); float
# literals are inferred as float32, so the resulting tensors are unchanged.

# 1-D tensor (vector)
V_data = [1., 2., 3.]
V = torch.tensor(V_data)
print(V)

# 2-D tensor (matrix) of shape 2x3
M_data = [[1., 2., 3.], [4., 5., 6.]]
M = torch.tensor(M_data)
print(M)

# 3-D tensor of shape 1x4x2
T_data = [[[1., 2.], [3., 4.],
           [5., 6.], [7., 8.]]]
T = torch.tensor(T_data)
print(T)

tensor([1., 2., 3.])
tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([[[1., 2.],
         [3., 4.],
         [5., 6.],
         [7., 8.]]])


In [3]:
# Shapes follow the nesting of the input lists.
# torch.tensor replaces the legacy torch.Tensor constructor; dtype is given
# explicitly because the integer literals would otherwise infer int64.
tmp_tensor = torch.tensor([[1, 2, 3, 4], [0, 0, 0, 0]], dtype=torch.float32)
print(tmp_tensor.shape)
tmp_tensor = torch.tensor([[[1, 2, 3],
                            [0, 0, 0]]], dtype=torch.float32)
print(tmp_tensor.shape)

torch.Size([2, 4])
torch.Size([1, 2, 3])


In [4]:
# Indexing the first axis removes one dimension:
# vector -> scalar, matrix -> row vector, 3-D tensor -> matrix.
for tensor in (V, M, T):
    print(tensor[0])

tensor(1.)
tensor([1., 2., 3.])
tensor([[1., 2.],
        [3., 4.],
        [5., 6.],
        [7., 8.]])


In [5]:
# Draw a 3x4x5 tensor of standard-normal samples and show it
x = torch.randn((3, 4, 5))
print(x)

tensor([[[-1.5256, -0.7502, -0.6540, -1.6095, -0.1002],
         [-0.6092, -0.9798, -1.6091, -0.7121,  0.3037],
         [-0.7773, -0.2515, -0.2223,  1.6871,  0.2284],
         [ 0.4676, -0.6970, -1.1608,  0.6995,  0.1991]],

        [[ 0.8657,  0.2444, -0.6629,  0.8073,  1.1017],
         [-0.1759, -2.2456, -1.4465,  0.0612, -0.6177],
         [-0.7981, -0.1316,  1.8793, -0.0721,  0.1578],
         [-0.7735,  0.1991,  0.0457,  0.1530, -0.4757]],

        [[-0.1110,  0.2927, -0.1578, -0.0288,  0.4533],
         [ 1.1422,  0.2486, -1.7754, -0.0255, -1.0233],
         [-0.5962, -1.0055,  0.4285,  1.4761, -1.7869],
         [ 1.6103, -0.7040, -0.1853, -0.9962, -0.8313]]])


In [6]:
# Tensor operations: "+" adds two same-shaped tensors element by element
x_values, y_values = [1, 2, 3], [4, 5, 6]
x = torch.tensor(x_values, dtype=torch.float32)
y = torch.tensor(y_values, dtype=torch.float32)
z = x + y
z

tensor([5., 7., 9.])

In [7]:
# Concatenate along the first axis (rows); dim=0 is torch.cat's default
x_1 = torch.randn(2, 5)
y_1 = torch.randn(3, 5)
z_1 = torch.cat((x_1, y_1), dim=0)
print(z_1.shape)

# Concatenate along the second axis (columns)
x_2 = torch.randn(2, 6)
y_2 = torch.randn(2, 8)
z_2 = torch.cat((x_2, y_2), dim=1)
print(z_2.shape)

torch.Size([5, 5])
torch.Size([2, 14])


In [8]:
# Reshaping a tensor with .view
x = torch.randn(2, 3, 4)
print(x.shape)

# Collapse the last two axes into one of length 3 * 4 = 12
flat_explicit = x.view(2, 12)
print(flat_explicit.shape)

# Passing -1 lets view work out that dimension from the element count
flat_inferred = x.view(2, -1)
print(flat_inferred.shape)

torch.Size([2, 3, 4])
torch.Size([2, 12])
torch.Size([2, 12])


# 2. Computation Graphs and Automatic Differentiation

In [9]:
# Tensors created with requires_grad=True are tracked by autograd directly;
# the deprecated autograd.Variable wrapper is no longer needed.
x = torch.tensor([1., 2., 3.], requires_grad=True)
print(x.detach())  # the raw values, without autograd bookkeeping

# Ordinary tensor operations work exactly as before
y = torch.tensor([4., 5., 6.], requires_grad=True)
z = x + y
print(z.detach())

# ... but z also remembers the operation that created it
print(z.grad_fn)

tensor([1., 2., 3.])
tensor([5., 7., 9.])
<AddBackward0 object at 0x7a56a1cdb8e0>


In [10]:
# Let's reduce z to a scalar by summing all of its entries, then show both
# the result and the graph node that produced it.
s = z.sum()
print(s, s.grad_fn, sep="\n")

tensor(21., grad_fn=<SumBackward0>)
<SumBackward0 object at 0x7a56a1cdb490>


In [11]:
# Backpropagate from the scalar s through the recorded graph.
# NOTE: this cell is not idempotent. Gradients accumulate into .grad, and
# PyTorch frees the graph after backward() by default, so re-run the cells
# that build z and s before running this one again.
s.backward()
# ds/dx: s is the sum of x + y, so every entry of x has gradient 1
print(x.grad)

tensor([1., 1., 1.])


In [12]:
# Plain tensors: no graph is recorded for z
x = torch.randn((2, 2))
y = torch.randn((2, 2))
z = x + y

# Gradient-tracking views of the same values. detach().requires_grad_() is the
# modern replacement for the deprecated autograd.Variable(t, requires_grad=True);
# x and y themselves are left untouched.
var_x = x.detach().requires_grad_(True)
var_y = y.detach().requires_grad_(True)
var_z = var_x + var_y
print(var_z.grad_fn)

# Pulling the raw values out of var_z and re-wrapping them breaks the chain:
# the new tensor is a fresh leaf with no memory of how it was computed.
var_z_data = var_z.detach()
new_var_z = var_z_data.detach().requires_grad_(True)

print(new_var_z.grad_fn)

<AddBackward0 object at 0x7a576a261240>
None


# 3. Deep Learning Building Blocks: Affine maps, non-linearities and objectives

In [13]:
# Affine map from R^5 to R^3 with learnable parameters A (weight) and b (bias)
lin = nn.Linear(5, 3)
# data is 2x5: two row vectors in R^5. A plain tensor is enough here; the
# deprecated autograd.Variable wrapper is not required.
data = torch.randn(2, 5)
# A maps from 5 to 3, so each row of data is mapped to a row in R^3 -> 2x3
print(lin(data))

tensor([[ 0.4724,  0.2742,  0.9672],
        [-0.2771, -0.2918,  0.4074]], grad_fn=<AddmmBackward0>)


In [14]:
# ReLU clamps negative entries to zero and leaves positive entries unchanged.
# requires_grad is passed straight to the factory function instead of wrapping
# the tensor in the deprecated autograd.Variable.
data = torch.randn(2, 2, requires_grad=True)
print(data)
print(F.relu(data))

tensor([[-1.4105, -0.3404],
        [-3.0121,  0.5710]], requires_grad=True)
tensor([[0.0000, 0.0000],
        [0.0000, 0.5710]], grad_fn=<ReluBackward0>)


In [15]:
# Softmax turns a vector of scores into a probability distribution.
# The tensor is created with requires_grad=True directly (autograd.Variable is
# deprecated) and the reduction axis is named explicitly with dim=0.
data = torch.randn(5, requires_grad=True)
print(data)

probs = F.softmax(data, dim=0)
print(probs)
print(probs.sum())  # the probabilities sum to 1

# log_softmax is the logarithm of the softmax output
print(F.log_softmax(data, dim=0))

tensor([ 1.4330,  1.6689,  1.8068, -0.6527,  1.0488], requires_grad=True)
tensor([0.2210, 0.2798, 0.3212, 0.0275, 0.1505], grad_fn=<SoftmaxBackward0>)
tensor(1.0000, grad_fn=<SumBackward0>)
tensor([-1.5095, -1.2736, -1.1357, -3.5952, -1.8937],
       grad_fn=<LogSoftmaxBackward0>)


# 4. Optimization and Training

In [16]:
"""
there is no code in this section just reading
"""

'\nthere is no code in this section just reading\n'

# 5. Creating Network Components in Pytorch

In [17]:
# Toy corpus: each example is a (list of tokens, language label) pair
data = [
    ("me gusta comer en la cafeteria".split(), "SPANISH"),
    ("Give it to me".split(), "ENGLISH"),
    ("No creo que sea una buena idea".split(), "SPANISH"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

test_data = [
    ("Yo creo que si".split(), "SPANISH"),
    ("it is lost on me".split(), "ENGLISH"),
]

# word_to_ix assigns every distinct word (training and test sentences alike)
# the next free integer, in order of first appearance. That integer is the
# word's position in the bag-of-words vector.
word_to_ix = {}
for sentence, _ in data + test_data:
    for word in sentence:
        word_to_ix.setdefault(word, len(word_to_ix))

print(word_to_ix)
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [19]:
class BoWClassifier(nn.Module):
    """Bag-of-words classifier: a single affine layer followed by log-softmax.

    Args:
        num_labels: number of output classes.
        vocab_size: length of the bag-of-words input vector.
    """

    def __init__(self, num_labels, vocab_size):
        super().__init__()

        # Affine map from word counts to one score per label
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for ``bow_vec``.

        The last dimension of ``bow_vec`` must have size ``vocab_size``; the
        output has the same leading dimensions with a last dimension of size
        ``num_labels``.
        """
        # Normalise over the label axis explicitly. Omitting ``dim`` relies on
        # a deprecated implicit choice and emits a UserWarning on every call.
        return F.log_softmax(self.linear(bow_vec), dim=-1)

In [23]:
def make_bow_vector(sentence, word_to_ix):
    """Build a bag-of-words count vector for one sentence.

    Args:
        sentence: iterable of word tokens; every token must be a key of
            ``word_to_ix`` (an unknown token raises ``KeyError``).
        word_to_ix: mapping from word to its index in the vector.

    Returns:
        A float tensor of shape ``(1, len(word_to_ix))`` whose entry at a
        word's index is the number of times that word occurs in ``sentence``.
    """
    counts = torch.zeros(len(word_to_ix))
    for token in sentence:
        index = word_to_ix[token]
        counts[index] += 1
    # Add a leading batch dimension of size 1
    return counts.view(1, -1)

def make_target(label, label_to_ix):
    """Encode ``label`` as a one-element int64 tensor holding its class index.

    Args:
        label: a key of ``label_to_ix``.
        label_to_ix: mapping from label to integer class index.
    """
    class_index = label_to_ix[label]
    return torch.LongTensor([class_index])

In [21]:
# Build the classifier for our label set and vocabulary
model = BoWClassifier(num_labels=NUM_LABELS, vocab_size=VOCAB_SIZE)

# Show the randomly initialised learnable parameters of the model
for parameter in model.parameters():
    print(parameter)

Parameter containing:
tensor([[-0.0929,  0.0079, -0.0402,  0.0651,  0.1697,  0.0579, -0.0632, -0.0962,
         -0.1710,  0.1650, -0.0372,  0.0396,  0.0073, -0.1250,  0.1104,  0.1099,
          0.0099, -0.1115, -0.0833,  0.0027, -0.1120, -0.1094, -0.0293, -0.0565,
          0.0481, -0.0515],
        [-0.0260, -0.0749, -0.1792,  0.1710,  0.0374,  0.1754, -0.0316, -0.0493,
         -0.1844, -0.0744,  0.1286, -0.1921, -0.0686,  0.1195,  0.1130,  0.0724,
         -0.0388, -0.0148, -0.0372, -0.0723,  0.0818, -0.0668, -0.1102,  0.0445,
         -0.1418, -0.0419]], requires_grad=True)
Parameter containing:
tensor([0.1002, 0.0733], requires_grad=True)


In [24]:
# To run the model, pass in a BoW vector. A plain tensor is enough: the
# deprecated autograd.Variable wrapper is not needed, and the input itself
# does not have to require gradients (only the model parameters do).
sample = data[0]
bow_vector = make_bow_vector(sample[0], word_to_ix)
log_probs = model(bow_vector)
print(log_probs)

tensor([[-0.6489, -0.7394]], grad_fn=<LogSoftmaxBackward0>)


  return F.log_softmax(self.linear(bow_vec))


In [25]:
# Integer class index assigned to each language label
label_to_ix = {
    "SPANISH": 0,
    "ENGLISH": 1,
}

In [26]:
# Log-probabilities the model assigns to each test sentence.
# The BoW tensor is passed directly; autograd.Variable is deprecated and the
# input does not need requires_grad=True.
for instance, label in test_data:
    bow_vec = make_bow_vector(instance, word_to_ix)
    log_probs = model(bow_vec)
    print(log_probs)

# Column of the model's first parameter that corresponds to the word "creo"
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([[-0.6064, -0.7882]], grad_fn=<LogSoftmaxBackward0>)
tensor([[-0.7394, -0.6489]], grad_fn=<LogSoftmaxBackward0>)
tensor([-0.0372,  0.1286], grad_fn=<SelectBackward0>)


  return F.log_softmax(self.linear(bow_vec))


In [28]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances.  Usually, somewhere between 5 and 30 epochs is reasonable.
for epoch in range(100):
    for instance, label in data:
        # Step 1. Remember that PyTorch accumulates gradients.  We need to clear
        # them out before each instance.
        model.zero_grad()

        # Step 2. Make our BoW vector and the target class index.  For example,
        # if the target is SPANISH, the target tensor holds the integer 0.  The
        # loss function then knows that the 0th element of the log probabilities
        # is the log probability corresponding to SPANISH.  Plain tensors are
        # used: the deprecated autograd.Variable wrapper is not needed.
        bow_vec = make_bow_vector(instance, word_to_ix)
        target = make_target(label, label_to_ix)

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

  return F.log_softmax(self.linear(bow_vec))


In [31]:
# Log-probabilities for the test sentences after training.
# The BoW tensor is passed directly; autograd.Variable is deprecated.
for instance, label in test_data:
    bow_vec = make_bow_vector(instance, word_to_ix)
    log_probs = model(bow_vec)
    print(log_probs)

# Index corresponding to Spanish goes up, English goes down!
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([[-0.1231, -2.1556]], grad_fn=<LogSoftmaxBackward0>)
tensor([[-2.7641, -0.0651]], grad_fn=<LogSoftmaxBackward0>)
tensor([ 0.3847, -0.2932], grad_fn=<SelectBackward0>)


  return F.log_softmax(self.linear(bow_vec))


# 6. Word Embeddings: Encoding Lexical Semantics