In [1]:
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Open a TF1-style interactive session. Later cells call `.eval()` on tensors
# without passing a session, so a default session has to exist before they run.
sess = tf.InteractiveSession()

In [3]:
# Random integer-valued input batch in (n, c_in, h, w) layout, stored as
# float32 so the same array can be fed to both frameworks.
x = np.random.randint(low=0, high=4, size=(2, 2, 6, 6)).astype(np.float32)

In [4]:
# Sanity-check the input layout: (n, c_in, h, w).
x.shape

(2, 2, 6, 6)

In [5]:
# Hand the same NumPy array to both frameworks.
to_x = torch.from_numpy(x)      # PyTorch tensor
tf_x = tf.convert_to_tensor(x)  # TensorFlow tensor

In [6]:
# Both frameworks should hold exactly the same values before any op is applied.
(to_x.numpy() == tf_x.eval()).all()

True

### Conv2d

In [7]:
# Random 0/1 convolution kernels in PyTorch layout (c_out, c_in, h, w).
w = np.random.randint(low=0, high=2, size=(3, 2, 3, 3)).astype(np.float32)
w.shape

(3, 2, 3, 3)

In [8]:
# Stride shared by every convolution in this section.
stride = 2

PyTorch

input=(n, c_in, h, w), weights=(c_out, c_in, h, w)

In [9]:
# PyTorch consumes the kernels in their native (c_out, c_in, h, w) layout.
to_w = torch.from_numpy(w)

In [10]:
# Strided convolution with one pixel of padding on every side, converted back
# to NumPy so it can be compared with the TensorFlow result.
to_out = F.conv2d(input=to_x, weight=to_w, stride=stride, padding=1).numpy()

to_out.shape

(2, 3, 3, 3)

TensorFlow

input=(n, h, w, c_in), weights=(h, w, c_in, c_out)

In [11]:
# TensorFlow expects channels-last data: (n, c_in, h, w) -> (n, h, w, c_in).
tf_x_raw = tf.convert_to_tensor(x)
tf_x = tf.transpose(tf_x_raw, perm=(0, 2, 3, 1))
print(tf_x)

# Kernels: PyTorch (c_out, c_in, h, w) -> TensorFlow (h, w, c_in, c_out).
tf_w_raw = tf.convert_to_tensor(w)
tf_w = tf.transpose(tf_w_raw, perm=(2, 3, 1, 0))
print(tf_w)

Tensor("transpose:0", shape=(2, 6, 6, 2), dtype=float32)
Tensor("transpose_1:0", shape=(3, 3, 2, 3), dtype=float32)


In [12]:
# 'SAME'-padded strided convolution on channels-last data; the channel axis is
# then moved back to position 1 so the result lines up with the PyTorch output.
tf_out = np.transpose(
    tf.nn.conv2d(tf_x, tf_w, strides=[1, stride, stride, 1], padding='SAME').eval(),
    (0, 3, 1, 2),
)

tf_out.shape

(2, 3, 3, 3)

In [13]:
# TensorFlow result for the first sample (all output channels).
tf_out[0]

array([[[ 7.,  4.,  4.],
        [ 1.,  6.,  3.],
        [ 6., 10.,  4.]],

       [[ 4.,  3.,  6.],
        [ 9.,  4.,  7.],
        [ 3.,  0.,  0.]],

       [[14., 18., 13.],
        [11., 11.,  8.],
        [16., 12., 11.]]], dtype=float32)

In [14]:
# PyTorch result for the first sample (all output channels).
to_out[0]

array([[[ 0.,  0.,  2.],
        [ 1.,  5.,  4.],
        [ 2.,  4.,  1.]],

       [[ 6.,  4.,  7.],
        [ 1.,  3.,  3.],
        [ 4.,  5.,  8.]],

       [[ 6.,  8., 10.],
        [ 4., 14., 12.],
        [ 9., 17., 10.]]], dtype=float32)

In [15]:
# Element-wise comparison of the two convolution results on the even-sized input.
(to_out == tf_out).all()

False

The outputs are **not** the same. This is due to the different padding strategies used when an odd-sized kernel is applied (here with stride 2) to an even-sized input, which can require asymmetric padding. PyTorch pads evenly on both sides, so some of the padding on the right/bottom may go unused. TensorFlow pads only as much as is necessary, placing any extra padding on the right/bottom of the tensor.

Example for this case:

##### Pytorch
[oxx]xxxxo ox[xxx]xxo oxxx[xxx]o

##### Tensorflow
[xxx]xxxo xx[xxx]xo xxxx[xxo]

We will use odd-sized input for the remaining comparisons, so that both frameworks pad symmetrically.

In [16]:
# Odd-sized (5x5) input so both frameworks end up with symmetric padding.
x = np.random.randint(low=0, high=4, size=(2, 2, 5, 5)).astype(np.float32)
x.shape

(2, 2, 5, 5)

In [17]:
# Fresh random 0/1 kernels, again in (c_out, c_in, h, w) layout.
w = np.random.randint(low=0, high=2, size=(3, 2, 3, 3)).astype(np.float32)
w.shape

(3, 2, 3, 3)

Pytorch

In [18]:
# Wrap the new input and kernels as PyTorch tensors.
to_x, to_w = map(torch.from_numpy, (x, w))

In [19]:
# Same strided, padded convolution as before, now on the odd-sized input.
to_conv2d = F.conv2d(to_x, to_w, stride=stride, padding=1).numpy()

# Show the shape of the result computed in this cell, not the earlier `to_out`.
to_conv2d.shape

(2, 3, 3, 3)

Tensorflow

In [20]:
# Input: (n, c_in, h, w) -> (n, h, w, c_in) for TensorFlow.
tf_x_raw = tf.convert_to_tensor(x)
tf_x = tf.transpose(tf_x_raw, perm=(0, 2, 3, 1))

# Kernels: (c_out, c_in, h, w) -> (h, w, c_in, c_out) for TensorFlow.
tf_w_raw = tf.convert_to_tensor(w)
tf_w = tf.transpose(tf_w_raw, perm=(2, 3, 1, 0))

In [21]:
# 'SAME'-padded strided convolution on the odd-sized input, transposed back to
# channels-first so it can be compared with the PyTorch result.
tf_conv2d = tf.nn.conv2d(tf_x, tf_w, strides=[1, stride, stride, 1], padding='SAME').eval().transpose([0,3,1,2])

# Show the shape of the result computed in this cell, not the earlier `tf_out`.
tf_conv2d.shape

(2, 3, 3, 3)

In [22]:
# Element-wise comparison of the two convolution results on the odd-sized input.
(to_conv2d == tf_conv2d).all()

True

### (Leaky) ReLU

Pytorch

In [23]:
# Leaky ReLU with a negative-side slope of 0.2, converted to NumPy for comparison.
to_lrelu = F.leaky_relu(to_x, negative_slope=0.2).numpy()

to_lrelu.shape

(2, 2, 5, 5)

In [24]:
# Leaky ReLU written as max(x, 0.2 * x) on the channels-last tensor; the channel
# axis is moved back to position 1 to match the PyTorch layout.
tf_lrelu = np.transpose(tf.maximum(tf_x, tf_x * 0.2).eval(), (0, 3, 1, 2))

tf_lrelu.shape

(2, 2, 5, 5)

In [25]:
# Element-wise comparison of the two leaky-ReLU results.
(to_lrelu == tf_lrelu).all()

True

### Batch normalization

Pytorch

In [26]:
# Fill a 2-element parameter tensor in place with uniform random values.
# NOTE(review): presumably shown to illustrate how the batch-norm scale was
# initialised in the PyTorch version used here — confirm.
nn.Parameter(torch.Tensor(2)).data.uniform_()

tensor([0.1250, 0.3128])

In [27]:
# Batch norm over 2 channels with a learnable scale and shift (affine=True).
# NOTE(review): per the PyTorch docs, momentum=None switches the running
# statistics to a cumulative average; it is not the counterpart of the
# momentum=0.9 passed to the TensorFlow layer — confirm this mismatch is intended.
# NOTE(review): the learnable scale is left at its library default; if that
# default is not 1 in the installed version, the output will not match
# TensorFlow's — TODO confirm and consider setting weight/bias explicitly.
to_batch_norm_layer = nn.BatchNorm2d(2, momentum=None, eps=1e-5, affine=True)

In [28]:
# Normalised values for the first sample, first channel.
to_batch_norm_layer(to_x)[0,0]

tensor([[ 0.4098,  0.1568, -0.3491,  0.1568, -0.0961],
        [-0.3491, -0.3491, -0.0961,  0.4098,  0.4098],
        [-0.3491, -0.3491, -0.0961, -0.0961, -0.3491],
        [-0.0961, -0.0961,  0.4098, -0.0961,  0.4098],
        [-0.0961,  0.4098,  0.4098,  0.1568, -0.3491]],
       grad_fn=<SelectBackward>)

Tensorflow

In [29]:
# Batch norm on the channels-last tensor with training=True.
tf_batchnorm_layer = tf.layers.batch_normalization(tf_x, momentum=0.9, epsilon=1e-5, training=True)

# The layer creates variables, which must be initialised before evaluation.
sess.run(tf.global_variables_initializer())
# Evaluate and move the channel axis back to position 1 to match PyTorch.
tf_batchnorm = tf_batchnorm_layer.eval().transpose([0,3,1,2])

In [30]:
# Normalised values for the first sample, first channel.
# NOTE(review): the recorded values differ from the recorded PyTorch output by
# a constant factor of roughly 0.307, which looks like a non-unit scale
# parameter on the PyTorch side — confirm.
tf_batchnorm[0,0]

array([[ 1.3336109 ,  0.51039433, -1.1360389 ,  0.51039433, -0.3128223 ],
       [-1.1360389 , -1.1360389 , -0.3128223 ,  1.3336109 ,  1.3336109 ],
       [-1.1360389 , -1.1360389 , -0.3128223 , -0.3128223 , -1.1360389 ],
       [-0.3128223 , -0.3128223 ,  1.3336109 , -0.3128223 ,  1.3336109 ],
       [-0.3128223 ,  1.3336109 ,  1.3336109 ,  0.51039433, -1.1360389 ]],
      dtype=float32)

### Cross entropy

In [31]:
# Three single-logit samples arranged as a (3, 1) column.
x0 = np.array([-1, 0, 1], dtype=float).reshape(3, 1)
x0.shape

(3, 1)

In [32]:
# Integer class label 1 for each of the three samples.
y0 = np.ones(3, dtype=int)
y0.shape

(3,)

Pytorch

In [33]:
# Logits keep NumPy's float64 dtype; labels are converted to a LongTensor.
to_x0 = torch.from_numpy(x0)
to_y0 = torch.LongTensor(y0)

In [34]:
# Element-wise sigmoid of the single-logit inputs.
torch.sigmoid(to_x0)

tensor([[0.2689],
        [0.5000],
        [0.7311]], dtype=torch.float64)

The sigmoid of a single logit is identical to a two-class softmax in which the second ("null") class has a fixed logit of 0, as the next cells show.
[link](https://stats.stackexchange.com/questions/87248/is-binary-logistic-regression-a-special-case-of-multinomial-logistic-regression#comment609940_87270)

In [35]:
# Same logits as x0 in the first column, plus a second column of zeros.
x1 = np.column_stack(([-1.0, 0.0, 1.0], [0.0, 0.0, 0.0]))
x1.shape

(3, 2)

In [36]:
# Wrap the two-column logits as a PyTorch tensor.
to_x1 = torch.from_numpy(x1)

In [37]:
# Softmax across the two columns; the first output column matches the sigmoid
# values computed for x0 above.
F.softmax(to_x1, dim=1)

tensor([[0.2689, 0.7311],
        [0.5000, 0.5000],
        [0.7311, 0.2689]], dtype=torch.float64)

In [38]:
# Softmax over a single column always yields 1: there is nothing to normalise against.
F.softmax(to_x0, dim=1)

tensor([[1.],
        [1.],
        [1.]], dtype=torch.float64)

In [39]:
# Display the single-logit inputs for reference.
to_x0

tensor([[-1.],
        [ 0.],
        [ 1.]], dtype=torch.float64)

In [40]:
# Display the integer labels for reference.
to_y0

tensor([1, 1, 1])

Pytorch

In [41]:
# Extreme logits paired with alternating binary targets, so each sign of logit
# is seen once with target 1 and once with target 0.
x2 = np.repeat([-100.0, 100.0], 2)
y2 = np.tile([1.0, 0.0], 2)

In [42]:
# Convert logits and targets to float32 tensors.
to_x2 = torch.from_numpy(x2).float()
to_y2 = torch.from_numpy(y2).float()
# Binary cross entropy computed directly from logits, averaged over all elements.
F.binary_cross_entropy_with_logits(to_x2, to_y2)

tensor(50.)

Tensorflow

In [43]:
# Wrap the NumPy logits and targets as TensorFlow tensors.
tf_x2 = tf.convert_to_tensor(x2)
tf_y2 = tf.convert_to_tensor(y2)

# Mean of the element-wise sigmoid cross entropy. The logits are fed as the
# tensor created above (previously the raw NumPy array was passed and `tf_x2`
# was left unused).
tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y2, logits=tf_x2)).eval()

50.0

### Autodiff

In [44]:
# Build a small computation graph: x4 = (3 * (x + 2)) ** 2, with x as the only
# leaf tensor that tracks gradients.
x = torch.ones(1, requires_grad=True)
x2 = torch.add(x, 2)
x3 = torch.mul(x2, 3)
x4 = torch.pow(x3, 2)

In [45]:
# Show each tensor in the chain, one per line.
print(x, x2, x3, x4, sep='\n')

tensor([1.], requires_grad=True)
tensor([3.], grad_fn=<AddBackward0>)
tensor([9.], grad_fn=<MulBackward0>)
tensor([81.], grad_fn=<PowBackward0>)


In [46]:
# Gradients before backward() has been called, from output back to input.
print(x4.grad, x3.grad, x2.grad, x.grad, sep='\n')

None
None
None
None


In [47]:
# Walk two steps back through the autograd graph from x4's grad_fn, reaching
# the node recorded for x2 (the addition).
x4.grad_fn.next_functions[0][0].next_functions[0][0]

<AddBackward0 at 0x1f84e6da278>

In [48]:
# Backpropagate from x4. With x4 = (3 * (x + 2)) ** 2, dx4/dx = 18 * (x + 2),
# which is 54 at x = 1.
x4.backward()

In [49]:
# Gradients after backward(), from output back to input.
print(x4.grad, x3.grad, x2.grad, x.grad, sep='\n')

None
None
None
tensor([54.])


In [50]:
# Repeat the two-step graph walk after backward() to compare with the node
# shown before the backward pass.
x4.grad_fn.next_functions[0][0].next_functions[0][0]

<AddBackward0 at 0x1f84e6da278>

In [51]:
# Show the tensors again after the backward pass, one per line.
print(x, x2, x3, x4, sep='\n')

tensor([1.], requires_grad=True)
tensor([3.], grad_fn=<AddBackward0>)
tensor([9.], grad_fn=<MulBackward0>)
tensor([81.], grad_fn=<PowBackward0>)


### Deconvolution

In [78]:
# Single-sample, single-channel 19x19 input and a 5x5 random 0/1 kernel.
x = np.random.randint(low=0, high=4, size=(1, 1, 19, 19)).astype(np.float32)
w = np.random.randint(low=0, high=2, size=(1, 1, 5, 5)).astype(np.float32)

# Wrap both arrays as PyTorch tensors.
to_x, to_w = map(torch.from_numpy, (x, w))

In [80]:
# Downsample with a stride-2 convolution, then apply the transposed convolution
# with the same kernel, stride and padding, printing the shape at each stage.
print(to_x.shape)
conv_out = F.conv2d(input=to_x, weight=to_w, stride=2, padding=2)
print(conv_out.shape)
deconv_out = F.conv_transpose2d(
    input=conv_out,
    weight=to_w,
    stride=2,
    padding=2,
    output_padding=0,
)
print(deconv_out.shape)

torch.Size([1, 1, 19, 19])
torch.Size([1, 1, 10, 10])
torch.Size([1, 1, 19, 19])


In [56]:
# Display the input array.
# NOTE(review): this cell's execution count is out of order, and the recorded
# output below is 9x9 while `x` is defined as 19x19 above — the output looks
# stale; re-run the notebook top to bottom to refresh it.
x

array([[[[1., 3., 0., 0., 1., 2., 1., 1., 1.],
         [0., 3., 0., 1., 0., 0., 3., 0., 0.],
         [2., 1., 1., 0., 3., 2., 1., 1., 3.],
         [0., 3., 2., 3., 3., 1., 0., 0., 1.],
         [0., 3., 1., 0., 0., 1., 2., 0., 1.],
         [2., 2., 3., 1., 2., 3., 0., 0., 0.],
         [1., 1., 3., 1., 0., 2., 2., 1., 1.],
         [2., 0., 2., 0., 3., 3., 1., 0., 2.],
         [2., 2., 0., 1., 2., 2., 1., 1., 3.]]]], dtype=float32)

In [57]:
# Display the kernel.
# NOTE(review): this cell's execution count is out of order, so the recorded
# output may come from an earlier run — re-run the notebook to confirm.
w

array([[[[0., 1., 0., 0., 0.],
         [1., 0., 0., 1., 0.],
         [0., 1., 1., 0., 1.],
         [0., 0., 1., 0., 1.],
         [0., 0., 0., 0., 1.]]]], dtype=float32)