# NLP examples with Pytorch 
网址：http://pytorch.org/tutorials/beginner/nlp/pytorch_tutorial.html

An introduction of transfer learning in online tutorial of Pytorch

## 一、introduction to Pytorch

In [69]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1e75e28>

## creating Tensor

In [70]:
# using existing data
v_data = [1, 2, 3]
V=torch.Tensor(v_data)
print(V)


 1
 2
 3
[torch.FloatTensor of size 3]



In [71]:
# crate a matrix
M_data = [[1, 2, 3], [3, 4, 5]]
M=torch.Tensor(M_data)
print(M)


 1  2  3
 3  4  5
[torch.FloatTensor of size 2x3]



In [72]:
# 3D tensor
T_data = [[[1, 2], [3, 4]],
               [[5,6], [7, 8]] ]
T=torch.Tensor(T_data)
print(T)


(0 ,.,.) = 
  1  2
  3  4

(1 ,.,.) = 
  5  6
  7  8
[torch.FloatTensor of size 2x2x2]



In [73]:
# index into V get a scalar
print(V[0])

1.0


In [74]:
# index into M and get a vector
print(M[0])


 1
 2
 3
[torch.FloatTensor of size 3]



In [75]:
# index into T and get a matrix
print(T[0])


 1  2
 3  4
[torch.FloatTensor of size 2x2]



默认的数据类型是Float,当然可以指定别的类型

In [76]:
torch.LongTensor(1)  # 长整型


 1.3996e+14
[torch.LongTensor of size 1]

In [77]:
# using random data
x = torch.randn((3, 4, 5)) # 3D tensor ; 3个第三维度，4*5每个matrix
print(x)


(0 ,.,.) = 
 -2.9718  1.7070 -0.4305 -2.2820  0.5237
  0.0004 -1.2039  3.5283  0.4434  0.5848
  0.8407  0.5510  0.3863  0.9124 -0.8410
  1.2282 -1.8661  1.4146 -1.8781 -0.4674

(1 ,.,.) = 
 -0.7576  0.4215 -0.4827 -1.1198  0.3056
  1.0386  0.5206 -0.5006  1.2182  0.2117
 -1.0613 -1.9441 -0.9596  0.5489 -0.9901
 -0.3826  1.5037  1.8267  0.5561  1.6445

(2 ,.,.) = 
  0.4973 -1.5067  1.7661 -0.3569 -0.1713
  0.4068 -0.4284 -1.1299  1.4274 -1.4027
  1.4825 -1.1559  1.6190  0.9581  0.7747
  0.1940  0.1687  0.3061  1.0743 -1.0327
[torch.FloatTensor of size 3x4x5]



In [78]:
torch.randn(3,4,4,5) # 4维度的


(0 ,0 ,.,.) = 
  1.0930  0.7769 -1.3128  0.7099  0.9944
 -0.2694 -0.6491 -0.1373 -0.2954 -0.7725
 -0.2215  0.5074 -0.6794 -1.6115  0.5230
 -0.8890  0.2620  0.0302  0.0013 -1.3987

(0 ,1 ,.,.) = 
  1.4666 -0.1028 -0.0097 -0.8420 -0.2067
  1.0672  0.1732 -0.6873  0.3111  0.2358
 -1.0658  0.3620  0.3776 -0.2443 -0.5850
  2.0812 -0.1186  0.4903  0.8349  0.8894

(0 ,2 ,.,.) = 
  0.4148  0.0507 -0.9644 -2.0111  0.5245
  2.1332 -0.0822  0.8388 -1.3233  0.0701
  1.2200  0.4251 -1.2328 -0.6195  1.5133
  1.9954 -0.6585 -0.4139 -0.2250 -0.6890

(0 ,3 ,.,.) = 
  0.9882  0.7404 -2.0990  1.2582 -0.3990
 -1.0952 -1.0703  0.6404  1.6199  0.5258
 -0.2969 -0.0681 -0.2831 -0.4705 -1.7655
 -0.1656  0.2312 -0.0839 -1.7731 -1.0721

(1 ,0 ,.,.) = 
  1.0248 -0.7116  0.7081  0.8288  1.3526
  1.6200  0.3436 -0.9112 -0.9952  0.7455
  0.7371  1.2528  0.8503 -0.4165 -0.7499
  1.0632  0.0073 -1.4252 -0.0781 -0.5138

(1 ,1 ,.,.) = 
  1.1375 -1.0246 -1.0300 -1.0129  0.0055
 -0.9347 -0.9882  1.3801 -0.1173  0.9317
  

## Operations and Tensors


In [79]:
x = torch.Tensor([1., 2., 3.])
x


 1
 2
 3
[torch.FloatTensor of size 3]

In [80]:
y = torch.Tensor([4., 5., 6.])
z = x + y
z


 5
 7
 9
[torch.FloatTensor of size 3]

#### concatenation

In [81]:
x_1 = torch.randn(2,5)
y_1 = torch.randn(3,5)
z_1 = torch.cat([x_1, y_1])
z_1


 0.7511  0.3649  0.9262 -0.1932 -0.7291
-0.3325  0.1134  0.3753 -0.0084  0.5745
-0.0230 -0.5933  0.7945  2.2256 -0.8400
-0.4712  0.3147 -0.0600  0.9394 -1.3487
 0.0537  0.8407 -0.6757  0.0265 -2.0634
[torch.FloatTensor of size 5x5]

In [82]:
x_1


 0.7511  0.3649  0.9262 -0.1932 -0.7291
-0.3325  0.1134  0.3753 -0.0084  0.5745
[torch.FloatTensor of size 2x5]

In [83]:
y_1


-0.0230 -0.5933  0.7945  2.2256 -0.8400
-0.4712  0.3147 -0.0600  0.9394 -1.3487
 0.0537  0.8407 -0.6757  0.0265 -2.0634
[torch.FloatTensor of size 3x5]

In [84]:
# 矩阵拼接，默认是第0维度，就是行；
# 连接列
torch.cat([x_1, torch.randn(2,5)], 1)


 0.7511  0.3649  0.9262 -0.1932 -0.7291 -1.9366  1.0067 -1.8593  0.9329  1.4066
-0.3325  0.1134  0.3753 -0.0084  0.5745  1.4414  0.1690  0.2575  0.1212 -1.8270
[torch.FloatTensor of size 2x10]

### reshape Tensor
就是重塑张量的维度

In [85]:
x = torch.randn(2, 3, 5)
x.view(2, 15)



Columns 0 to 9 
 0.1571 -1.3312 -1.0505 -1.0007 -0.4621 -0.5060  1.1233  0.4800 -0.0344 -0.4928
-0.0653 -2.2272 -0.5412 -0.9734 -0.0499  0.5303  1.5544  0.6882 -0.4737  0.5039

Columns 10 to 14 
-0.2699 -0.8699  0.8155 -0.6616 -0.2193
-0.2694 -0.2859  0.1109  0.4339  0.0103
[torch.FloatTensor of size 2x15]

In [86]:
x


(0 ,.,.) = 
  0.1571 -1.3312 -1.0505 -1.0007 -0.4621
 -0.5060  1.1233  0.4800 -0.0344 -0.4928
 -0.2699 -0.8699  0.8155 -0.6616 -0.2193

(1 ,.,.) = 
 -0.0653 -2.2272 -0.5412 -0.9734 -0.0499
  0.5303  1.5544  0.6882 -0.4737  0.5039
 -0.2694 -0.2859  0.1109  0.4339  0.0103
[torch.FloatTensor of size 2x3x5]

In [87]:
# 如果一个维度为-1，它的维度可以被推断出来
x.view((3,-1))


 0.1571 -1.3312 -1.0505 -1.0007 -0.4621 -0.5060  1.1233  0.4800 -0.0344 -0.4928
-0.2699 -0.8699  0.8155 -0.6616 -0.2193 -0.0653 -2.2272 -0.5412 -0.9734 -0.0499
 0.5303  1.5544  0.6882 -0.4737  0.5039 -0.2694 -0.2859  0.1109  0.4339  0.0103
[torch.FloatTensor of size 3x10]

## 计算图与automatic Differentiation
The Variable class keeps track of how it was created

In [88]:
from torch.autograd import Variable

x = Variable(torch.Tensor([1., 2., 3]), requires_grad=True)
x.data


 1
 2
 3
[torch.FloatTensor of size 3]

In [89]:
y = Variable(torch.Tensor([4., 5., 6]), requires_grad=True)
z = x + y
z.data


 5
 7
 9
[torch.FloatTensor of size 3]

In [90]:
z.creator

<torch.autograd._functions.basic_ops.Add at 0x3aba4d8>

可以看出Variables知道他们是怎么是产生的，即它会记录其creator

In [91]:
s = z.sum()
s

Variable containing:
 21
[torch.FloatTensor of size 1]

In [92]:
s.creator

<torch.autograd._functions.reduce.Sum at 0x3aba3f0>

In [93]:
s.backward()
x.grad

Variable containing:
 1
 1
 1
[torch.FloatTensor of size 3]

In [94]:
s.backward()
x.grad     # 梯度会堆积起来

Variable containing:
 2
 2
 2
[torch.FloatTensor of size 3]

+ 该如何理解variable可以追踪其Creator呢？其实variable由别的variable计算而来，自己本身就会存储源头的信息和怎么来的计算；
而其对应的tensor中则没有相关信息，因此单独地将其data中的Tensor提取出来再存进variable中就会丢失其Creator信息。

In [95]:
 x = torch.Tensor((2, 3))
y = torch.Tensor((2,3))
z = x+ y
z


 4
 6
[torch.FloatTensor of size 2]

In [96]:
var_x = Variable(x)
var_y = Variable(y)
var_z = var_x + var_y
var_z

Variable containing:
 4
 6
[torch.FloatTensor of size 2]

In [97]:
var_z.creator

<torch.autograd._functions.basic_ops.Add at 0x3aba220>

In [98]:
# 如果将var_z的数据提取出来再建立一个新的Variable,就会丢失
var_z_data = var_z.data
new = Variable(var_z_data)
print(new.creator)

None


**因此，我们在code中需要注意不能过多地打断链式法则**

## 二、deep learning with Pytorch

### Affine Maps 映射
$$f(x)=Ax+b$$
此处需要进行学习的就是$A$和$b$

**注**：Pytorch中处理映射的时候，使用的是行，而不是使用列。ith row of the output is the map of ith row of input and b;

In [99]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

torch.manual_seed(1)

<torch._C.Generator at 0x1e75e28>

$\mbox{maps from} R^5 \mbox{to} R^3$     从维度是5降到维度是3

In [100]:
lin = nn.Linear(5, 3)

# 由于框架中是对于行进行操作，可知5维的输入，即要求列有5列；
# 即 n * 5 的矩阵映射成 n * 3

data = Variable(torch.randn(10, 5))
lin(data)

Variable containing:
 0.3130  0.2576  1.3546
 1.0007  0.6433  0.4951
-0.3767 -0.1680  0.5511
-0.6396 -1.1769 -0.0147
-0.4327 -0.2869 -0.1978
 0.5218  0.6403  0.6204
-1.4568  0.3256 -0.5250
 0.3919 -0.5869 -0.1451
 0.7158  0.2852 -0.1099
 0.2985 -0.3870  0.1984
[torch.FloatTensor of size 10x3]

### Non-linearity
线性函数堆积起来仍然是线性函数
有一些核心的非线性函数  $tanh(x), \sigma(x), ReLU(x) $,他们的优势是梯度计算起来比较方便

在pytorch中线性函数 在pytorch.functional中，他们没有参数

In [101]:
data = Variable(torch.randn(2, 2))
print(data)
print(F.relu(data))

Variable containing:
-0.2954 -0.7725
-0.2215  0.5074
[torch.FloatTensor of size 2x2]

Variable containing:
 0.0000  0.0000
 0.0000  0.5074
[torch.FloatTensor of size 2x2]



### softmax and probabilities
softmax的output是一个概率分布；

In [102]:
data = Variable(torch.randn(5))
print(data)
print(F.softmax(data))
print(F.softmax(data).sum())
print(F.log_softmax(data))

Variable containing:
-0.6794
-1.6115
 0.5230
-0.8890
 0.2620
[torch.FloatTensor of size 5]

Variable containing:
 0.1235
 0.0486
 0.4111
 0.1002
 0.3166
[torch.FloatTensor of size 5]

Variable containing:
 1
[torch.FloatTensor of size 1]

Variable containing:
-2.0914
-3.0235
-0.8890
-2.3010
-1.1500
[torch.FloatTensor of size 5]



### Objective Funcions
### optimization and Training
optim的选择不同，可能训练效果不同

### Creating Network Components in Pytorch
### Example: logistic regression Bag-of-words classifier

词袋分类器：假如只有 hello 和 world两个词，则 ''hello hello world hello的向量表示是
$$[3, 1]$$,而‘hello hello’的向量表示是:
$$[2,0]$$

定义此向量为$x$， 则神经网络的output为 $log softmax(Ax+b)$

In [103]:
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]
data

[(['me', 'gusta', 'comer', 'en', 'la', 'cafeteria'], 'SPANISH'),
 (['Give', 'it', 'to', 'me'], 'ENGLISH'),
 (['No', 'creo', 'que', 'sea', 'una', 'buena', 'idea'], 'SPANISH'),
 (['No',
   'it',
   'is',
   'not',
   'a',
   'good',
   'idea',
   'to',
   'get',
   'lost',
   'at',
   'sea'],
  'ENGLISH')]

In [104]:
test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

word_to_ix = {}

for sent, _ in data + test_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word]=len(word_to_ix)
print(word_to_ix)

{'en': 3, 'No': 9, 'buena': 14, 'it': 7, 'at': 22, 'sea': 12, 'cafeteria': 5, 'Yo': 23, 'la': 4, 'to': 8, 'creo': 10, 'is': 16, 'a': 18, 'good': 19, 'get': 20, 'idea': 15, 'que': 11, 'not': 17, 'me': 0, 'on': 25, 'gusta': 1, 'lost': 21, 'Give': 6, 'una': 13, 'si': 24, 'comer': 2}


In [106]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2

class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec))


def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)


def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])


model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# the model knows its parameters.  The first output below is A, the second is b.
# Whenever you assign a component to a class variable in the __init__ function
# of a module, which was done with the line
# self.linear = nn.Linear(...)
# Then through some Python magic from the Pytorch devs, your module
# (in this case, BoWClassifier) will store knowledge of the nn.Linear's parameters
for param in model.parameters():
    print(param)

# To run the model, pass in a BoW vector, but wrapped in an autograd.Variable
sample = data[0]
bow_vector = make_bow_vector(sample[0], word_to_ix)
log_probs = model(autograd.Variable(bow_vector))
print(log_probs)

Parameter containing:

Columns 0 to 9 
 0.0984  0.0541  0.0886 -0.1466  0.1503  0.0746  0.0485  0.0580  0.0984 -0.0573
 0.1763 -0.1710 -0.0196 -0.0568  0.0307  0.1733 -0.0360 -0.0471 -0.1031  0.1031

Columns 10 to 19 
-0.0593  0.1032 -0.0902 -0.0563  0.1553  0.0992 -0.0282  0.1496  0.1823 -0.1915
 0.1582  0.1065  0.0289 -0.0779 -0.1950  0.1070  0.0459 -0.1361 -0.0680  0.0308

Columns 20 to 25 
 0.0641 -0.0007  0.0477 -0.1672 -0.1511  0.1126
 0.0106 -0.1926  0.1514  0.0820 -0.0560 -0.0115
[torch.FloatTensor of size 2x26]

Parameter containing:
 0.1602
 0.1038
[torch.FloatTensor of size 2]

Variable containing:
-0.5790 -0.8220
[torch.FloatTensor of size 1x2]



In [108]:
label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Print the matrix column corresponding to "creo"
print(next(model.parameters())[:, word_to_ix["creo"]])

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances.  Usually, somewhere between 5 and 30 epochs is reasonable.
for epoch in range(100):
    for instance, label in data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Make our BOW vector and also we must wrap the target in a
        # Variable as an integer. For example, if the target is SPANISH, then
        # we wrap the integer 0. The loss function then knows that the 0th
        # element of the log probabilities is the log probability
        # corresponding to SPANISH
        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
        target = autograd.Variable(make_target(label, label_to_ix))

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Index corresponding to Spanish goes up, English goes down!
print(next(model.parameters())[:, word_to_ix["creo"]])

Variable containing:
-0.9794 -0.4708
[torch.FloatTensor of size 1x2]

Variable containing:
-0.5436 -0.8690
[torch.FloatTensor of size 1x2]

Variable containing:
-0.0593
 0.1582
[torch.FloatTensor of size 2]

Variable containing:
-0.2212 -1.6174
[torch.FloatTensor of size 1x2]

Variable containing:
-2.6540 -0.0730
[torch.FloatTensor of size 1x2]

Variable containing:
 0.3903
-0.2913
[torch.FloatTensor of size 2]

