In [None]:
import numpy as np
from numpy.random import randn

In [None]:
n_y = 10
n_s = 12
hid = 64
batch = 256
h = randn(batch, hid)
W = randn(batch, hid, n_s)
X = randn(batch, n_s, n_y, hid)

In [None]:
print(np.einsum_path('bsyh,bhs,bh->by', X, W, h, optimize='optimal')[1])

  Complete contraction:  bsyh,bhs,bh->by
         Naive scaling:  4
     Optimized scaling:  4
      Naive FLOP count:  5.898e+06
  Optimized FLOP count:  4.129e+06
   Theoretical speedup:  1.429
  Largest intermediate:  1.966e+05 elements
--------------------------------------------------------------------------
scaling                  current                                remaining
--------------------------------------------------------------------------
   3                 bh,bhs->shb                             bsyh,shb->by
   4                shb,bsyh->by                                   by->by


In [None]:
def opt(X, W, h):
    tmp = np.einsum('bh,bhs->bhs', h, W)
    return np.einsum('bhs,bsyh->by', tmp, X)

In [None]:
%timeit -r 16 -n 100 np.einsum('bsyh,bhs,bh->by', X, W, h, optimize='optimal')
%timeit -r 16 -n 100 opt(X, W, h)

17.9 ms ± 861 µs per loop (mean ± std. dev. of 16 runs, 100 loops each)
17.2 ms ± 889 µs per loop (mean ± std. dev. of 16 runs, 100 loops each)


In [None]:
opt(X, W, h).shape

(256, 10)

In [None]:
import torch
import random

In [None]:
n_seq = 100000
A = torch.arange(n_seq)
idx = [random.randrange(0, n_seq) for i in range(n_seq//100)]
idx_t = torch.LongTensor(idx)

In [None]:
%timeit A

16 ns ± 0.392 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [None]:
%timeit A[idx]

42 µs ± 170 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
%timeit A[idx_t]

9.83 µs ± 150 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [None]:
from torch import linalg as LA

In [None]:
A = torch.rand(9)
A = A.reshape((3, 3))

In [None]:
A

tensor([[0.2525, 0.3562, 0.3648],
        [0.6086, 0.2449, 0.7898],
        [0.2237, 0.0689, 0.3048]])

In [None]:
hx = torch.rand(3*2*4, dtype=torch.float).reshape((3, 2, 4))
hx

tensor([[[0.4991, 0.6107, 0.2169, 0.9104],
         [0.5327, 0.8375, 0.8371, 0.3965]],

        [[0.6531, 0.0169, 0.5115, 0.1136],
         [0.3169, 0.7598, 0.0614, 0.7085]],

        [[0.4336, 0.8189, 0.0227, 0.5080],
         [0.6851, 0.7801, 0.4693, 0.5754]]])

In [None]:
hx = torch.div(hx, LA.norm(hx, dim=2).reshape(3, 2, -1))
hx

tensor([[[0.4078, 0.4990, 0.1772, 0.7438],
         [0.3924, 0.6169, 0.6166, 0.2920]],

        [[0.7798, 0.0202, 0.6108, 0.1357],
         [0.2913, 0.6985, 0.0565, 0.6512]],

        [[0.4102, 0.7748, 0.0215, 0.4806],
         [0.5367, 0.6112, 0.3677, 0.4508]]])

In [None]:
torch.tensordot(hx, hx, dims=([1, 2], [1, 2]))

tensor([[2.0000, 1.3074, 1.8612],
        [1.3074, 2.0000, 1.3115],
        [1.8612, 1.3115, 2.0000]])

In [None]:
dot = torch.tensordot(hx, hx, dims=([1, 2], [1, 2]))
dot / dot[0][0]

tensor([[1.0000, 0.6537, 0.9306],
        [0.6537, 1.0000, 0.6557],
        [0.9306, 0.6557, 1.0000]])

In [65]:
import torch
from time import sleep
from torch import multiprocessing

def _construct_childs(n_seq, parent):
    childs = [[] for i in range(n_seq)]
    for idx in range(n_seq):
        if parent[idx] >= 0:
            childs[parent[idx]].append(idx)
    return childs

def work(i, h):
    sleep(i)
    h[i] += 10

def dfs(x, h, child):
    print(x, child[x])
    pros = [ctx.Process(target=dfs, args=(i, h, child, )) for i in child[x]]
    for p in pros: p.start()
    for p in pros: p.join()
    work(x, h)
    print(x)

def new_work(i, h):
    print(i)
    sleep(i)
    h[i] += 10

if __name__ == '__main__':
    ctx = multiprocessing.get_context("fork")
    parent = [-1, 0, 0, 0, 2, 3]
    child = _construct_childs(len(parent), parent)
    print(child)
    h = torch.linspace(1, len(parent), len(parent), device='cuda')
    c = torch.LongTensor([len(x) for x in child])
    h.share_memory_()
    while True:
        chk = False
        procs = []
        print(c)
        for idx, cnt in enumerate(c):
            if cnt == 0:
                chk = True
                p = ctx.Process(target=new_work, args=(idx, h, ))
                p.start()
                procs.append(p)
                c[idx] -= 1
                if parent[idx] >= 0:
                    c[parent[idx]] -= 1
        if not chk: break
        for p in procs: p.join()
    print(h)

[[1, 2, 3], [], [4], [5], [], []]
tensor([3, 0, 1, 1, 0, 0])
1
4
5


Process ForkProcess-124:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-65-da85e23821cf>", line 27, in new_work
    h[i] += 10
RuntimeError: CUDA error: initialization error
Process ForkProcess-125:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-65-da85e23821cf>", line 27, in new_work
    h[i] += 10
RuntimeError: CUDA error: initialization error
Process ForkProcess-126:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/proces

tensor([ 2, -1,  0,  0, -1, -1])
2
3


Process ForkProcess-127:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-65-da85e23821cf>", line 27, in new_work
    h[i] += 10
RuntimeError: CUDA error: initialization error
Process ForkProcess-128:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-65-da85e23821cf>", line 27, in new_work
    h[i] += 10
RuntimeError: CUDA error: initialization error


tensor([ 0, -1, -1, -1, -1, -1])
0


Process ForkProcess-129:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-65-da85e23821cf>", line 27, in new_work
    h[i] += 10
RuntimeError: CUDA error: initialization error


tensor([-1, -1, -1, -1, -1, -1])
tensor([1., 2., 3., 4., 5., 6.], device='cuda:0')
