In [1]:
from __future__ import print_function
import numpy as np
import torch
import datetime

In [2]:
# Num of multiplications to perform
n = 10

# Create random large matrix
A = np.random.rand(10000, 10000).astype('float32')
B = np.random.rand(10000, 10000).astype('float32')


In [3]:
def matpow(M, n):
    if n < 1: #Abstract cases where n < 1
        return M
    else:
        return torch.matmul(M, matpow(M, n-1))

In [4]:
a = torch.from_numpy(A)
b = torch.from_numpy(B)

In [5]:
# cpu
t1 = datetime.datetime.now()
an = matpow(a, n)
bn = matpow(b, n)
t2 = datetime.datetime.now()
print("CPU computation time: " + str(t2-t1))

CPU computation time: 0:00:01.905853


In [12]:
# single gpu
t1 = datetime.datetime.now()
cuda0 = torch.device('cuda:0')
with torch.no_grad():
    with torch.cuda.device(1):
        an=matpow(a.to(device=cuda0), n)
        bn=matpow(b.to(device=cuda0), n)
t2 = datetime.datetime.now()
print("Single GPU computation time: " + str(t2-t1))

Single GPU computation time: 0:00:05.339965


In [6]:
# multi gpu with cuda stream
# https://stackoverflow.com/questions/44371682/basic-multi-gpu-parallelization-of-matrix-multiplication
# https://stackoverflow.com/questions/52498690/how-to-use-cuda-stream-in-pytorch
# https://towardsdatascience.com/speed-up-your-algorithms-part-1-pytorch-56d8a4ae7051
# https://github.com/tensorflow/tensorflow/issues/36634#issuecomment-608160593
s1 = torch.cuda.Stream()
s2 = torch.cuda.Stream() # these takes 0.05s
t1 = datetime.datetime.now()
A = a.cuda(0)
B = b.cuda(1)
torch.cuda.synchronize()
with torch.no_grad():
    with torch.cuda.stream(s1):
        an=matpow(A, n)
    with torch.cuda.stream(s2):
        bn=matpow(B, n)
torch.cuda.synchronize()
t2 = datetime.datetime.now()
print("Multiple GPU computation time: " + str(t2-t1))

Multiple GPU computation time: 0:00:05.268709


In [13]:

s1 = torch.cuda.Stream()
s2 = torch.cuda.Stream() # these takes 0.05s
t1 = datetime.datetime.now()
A = a.cuda(0)
B = b.cuda(1)
torch.cuda.synchronize()
with torch.no_grad():
    with torch.cuda.stream(s1):
        an=matpow(A, n)
    with torch.cuda.stream(s2):
        bn=matpow(B, n)
torch.cuda.synchronize()
t2 = datetime.datetime.now()
print("Multiple GPU computation time: " + str(t2-t1))

Multiple GPU computation time: 0:00:05.307378


In [25]:
an.device

device(type='cuda', index=0)

In [26]:
bn.device

device(type='cuda', index=1)

In [5]:
# negative example: multi gpu (does not work)
cuda0 = torch.device('cuda:0')
cuda1 = torch.device('cuda:1')
t1 = datetime.datetime.now()
with torch.no_grad():
    with torch.cuda.device(0):
        an=matpow(a.to(device=cuda0), n)
    with torch.cuda.device(1):
        bn=matpow(b.to(device=cuda1), n)
t2 = datetime.datetime.now()
print("Multiple GPU computation time: " + str(t2-t1))


Multiple GPU computation time: 0:00:08.085382


In [15]:
del an, bn, A, B
with torch.no_grad():
    torch.cuda.empty_cache()

In [20]:
import tensorflow as tf
x = tf.constant([1, 4])
y = tf.constant([2, 5])
z = tf.constant([3, 6])

In [21]:
tf.concat([x, y, z], -1)

<tf.Tensor: id=22, shape=(6,), dtype=int32, numpy=array([1, 4, 2, 5, 3, 6], dtype=int32)>

In [34]:
x = torch.tensor([1, 4])
y = torch.tensor([2, 5])
z = torch.tensor([3, 6])

In [41]:
a = torch.randn((5,6))
torch.argmax(a, 1)

tensor([0, 2, 0, 1, 1])

In [44]:
b = tf.constant(a.numpy())

In [46]:
tf.argmax(b, -1)

<tf.Tensor: id=27, shape=(5,), dtype=int64, numpy=array([0, 2, 0, 1, 1])>