In [1]:
import numpy as np
from numpy.random import randn
from itertools import count
import sys

# Seed NumPy's global random generator so the randn() draws below are repeatable
# (given the same cell execution order).
np.random.seed(1)

In [2]:
# Full-size configuration, kept for reference:
#   expert_num = 8          (number of experts)
#   expert_dim = 3072
#   emb_dim    = 768        (embedding size)
#   token_num  = 24576 * 4  (token count, possibly from unrolled batches)

# Toy configuration used for the exported reference data.
expert_num, expert_dim = 5, 7  # number of experts, inner dimension of each expert
emb_dim = 11  # embedding size
token_num = 15  # token count, possibly from unrolled batches

# Bypass weight
b = 0.1

# Input tokens: (token_num, emb_dim), float32, C order.
words = randn(token_num, emb_dim).astype(np.float32, order='C')

# Everything the C++ code needs is collected here. The sizes may stay plain
# Python ints, but the bypass weight is wrapped in a float32 array so that it
# is stored as float32.
exported = {
    "expert_count": expert_num,
    "expert_size": expert_dim,
    "embedding_size": emb_dim,
    "token_count": token_num,
    "b": np.array([b], dtype=np.float32),
    "src": words,
}

In [3]:
# Per-expert feed-forward parameters, all float32 in C order.
# The draw order (w1, b1, w2, b2) is kept so the seeded random stream is unchanged.

# First layer weights: (expert_num, emb_dim, expert_dim)
experts_w1 = randn(expert_num, emb_dim, expert_dim).astype(np.float32, order='C')

# First layer bias: (expert_num, 1, expert_dim)
experts_b1 = randn(expert_num, 1, expert_dim).astype(np.float32, order='C')

# Second layer weights: (expert_num, expert_dim, emb_dim)
experts_w2 = randn(expert_num, expert_dim, emb_dim).astype(np.float32, order='C')

# Second layer bias: (expert_num, 1, emb_dim)
experts_b2 = randn(expert_num, 1, emb_dim).astype(np.float32, order='C')

# Register all four tensors for export in one go.
exported.update(
    experts_w1=experts_w1,
    experts_b1=experts_b1,
    experts_w2=experts_w2,
    experts_b2=experts_b2,
)

In [6]:
# Generate overloaded top-1 router
# Expert 0 receives a `ratio` share of all tokens in a batch;
# the other experts split the rest equally among themselves.

router = np.zeros((expert_num, token_num), dtype=np.float32)

# Share of all tokens that go to the overloaded expert
ratio = 0.25

# Block sizes are rounded up, so together the blocks cover every token.
tokens_per_expert = np.int32(np.ceil((1 - ratio) * token_num / (expert_num - 1)))
overloaded_expert = np.int32(np.ceil(ratio * token_num))

print(tokens_per_expert, overloaded_expert)

# Lay the experts out as consecutive runs of ones along the token axis.
shift = 0
for e in range(expert_num):
    run_length = overloaded_expert if e == 0 else tokens_per_expert
    # The slice is clamped at token_num, so a run that would overshoot the
    # last token is simply cut short (or left empty).
    router[e, shift:shift + run_length] = 1
    shift += run_length

# Optional debugging aids, disabled by default:
# np.set_printoptions(threshold=sys.maxsize)
# np.random.shuffle(np.transpose(router))

print(router.shape)
print(router)
print(np.sum(router, axis=0))

exported["router"] = router

3 4
(5, 15)
[[1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [7]:
# Generate random binary (one-hot, top-1) router with a uniform load.

# Consecutive runs of tokens are assigned to the same expert. Integer
# bucketing is used instead of comparing a counter against the float
# token_num / expert_num: that comparison never fires when the division is
# not exact, which used to send every token to expert 0.
# For divisible sizes this yields exactly token_num // expert_num tokens per
# expert; otherwise the per-expert counts differ by at most one.
token_ids = np.arange(token_num)
expert_of_token = token_ids * expert_num // token_num

# (token_num, expert_num): one row per token with a single 1 at its expert.
router = np.zeros((token_num, expert_num), dtype=np.float32)
router[token_ids, expert_of_token] = 1

#np.set_printoptions(threshold=sys.maxsize)
#print(router)

# Shuffle the rows (tokens) in place, then switch to the
# (expert_num, token_num) layout used by the rest of the notebook.
np.random.shuffle(router)
# np.transpose only returns a Fortran-ordered view; make it C-contiguous so
# the router is stored in the same memory order as the other exported arrays.
router = np.ascontiguousarray(np.transpose(router))
print(router.shape)
print(router)
print(np.sum(router, axis=0))

exported["router"] = router

(5, 15)
[[0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1.]]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [5]:
# Sanity check: the exported source tokens should be float32.
words.dtype

dtype('float32')

In [8]:
# Reference forward pass of the top-1 mixture-of-experts layer.
#
# The bypass weight `b` is deliberately NOT re-declared here: it is taken from
# the configuration cell, so the value used for "dst" is always the one that
# was stored in exported["b"].

# (token_num, emb_dim): every token starts as a scaled copy of itself, so that
# a word that doesn't go through any expert still has some value.
total_output = b * words

# Repeat for every expert, looking at the router to determine which words go to said expert
for n, (expert_w1, expert_b1, expert_w2, expert_b2, mask) in enumerate(
        zip(experts_w1, experts_b1, experts_w2, experts_b2, router)):
    print(f"Expert {n}:")

    # Row indices of all words whose mask entry for this expert is non-zero
    routed = mask.nonzero()
    expert_input = words[routed]

    # Two dense layers, each followed by ReLU; the (1, dim) biases broadcast over the rows
    expert_output1 = np.maximum(0.0, np.add(np.matmul(expert_input, expert_w1), expert_b1))
    expert_output2 = np.maximum(0.0, np.add(np.matmul(expert_output1, expert_w2), expert_b2))

    # Assignment (not addition): the expert result replaces the bypass value of the routed words
    total_output[routed] = expert_output2

    print(f"  {expert_input.shape} * {expert_w1.shape} + {expert_b1.shape} = {expert_output1.shape}\n")
    print(f"  {expert_output1.shape} * {expert_w2.shape} + {expert_b2.shape} = {expert_output2.shape}\n")

    # Per-expert input/output are exported as well
    exported[f"expert_{n}_src"] = expert_input
    exported[f"expert_{n}_dst"] = expert_output2

print(f"Total output: {total_output.shape}")
exported["dst"] = total_output

Expert 0:
  (3, 11) * (11, 7) + (1, 7) = (3, 7)

  (3, 7) * (7, 11) + (1, 11) = (3, 11)

Expert 1:
  (3, 11) * (11, 7) + (1, 7) = (3, 7)

  (3, 7) * (7, 11) + (1, 11) = (3, 11)

Expert 2:
  (3, 11) * (11, 7) + (1, 7) = (3, 7)

  (3, 7) * (7, 11) + (1, 11) = (3, 11)

Expert 3:
  (3, 11) * (11, 7) + (1, 7) = (3, 7)

  (3, 7) * (7, 11) + (1, 11) = (3, 11)

Expert 4:
  (3, 11) * (11, 7) + (1, 7) = (3, 7)

  (3, 7) * (7, 11) + (1, 11) = (3, 11)

Total output: (15, 11)


In [9]:
# Write every entry of `exported` into one .npz archive; each dict key becomes an array name.
np.savez("data-toy-top-1-uniform.npz", **exported)

In [10]:
# Inspect the shape of a single expert's second-layer weight matrix.
experts_w2[0].shape

(512, 256)

In [None]:
# Do grid search generation for various variable values. Save them to separate files.
# Do not run automatically LOL
#
# NOTE: this cell rebinds the notebook-level names (expert_num, expert_dim,
# emb_dim, token_num, words, router, exported, ...), so re-run the cells above
# before regenerating the toy data set.

expert_num_list = [8, 16, 32, 64, 128, 256] # number of experts
expert_dim_list = [256, 512, 1024, 2048, 4096]
emb_dim_list = [256, 512, 1024, 2048, 4096] # embedding size
token_num_list = [128, 256, 512, 1024, 2048, 4096, 8192] # think token count, from unrolled batches possibly

# Bypass weight, identical for every configuration
b = 0.1

for expert_num in expert_num_list:
    for expert_dim in expert_dim_list:
        for emb_dim in emb_dim_list:
            for token_num in token_num_list:

                # Export sizes and bypass weight as well, mirroring the toy export
                exported = dict(
                    expert_count=expert_num,
                    expert_size=expert_dim,
                    embedding_size=emb_dim,
                    token_count=token_num,
                    b=np.array([b], dtype=np.float32),
                )

                # Generate input: (token_num, emb_dim)
                words = randn(token_num, emb_dim).astype(np.float32)
                exported["src"] = words

                # Generate parameters
                # (expert_num, emb_dim, expert_dim)
                experts_w1 = randn(expert_num, emb_dim, expert_dim).astype(np.float32)
                exported["experts_w1"] = experts_w1

                # (expert_num, 1, expert_dim)
                experts_b1 = randn(expert_num, 1, expert_dim).astype(np.float32)
                exported["experts_b1"] = experts_b1

                # (expert_num, expert_dim, emb_dim)
                experts_w2 = randn(expert_num, expert_dim, emb_dim).astype(np.float32)
                exported["experts_w2"] = experts_w2

                # (expert_num, 1, emb_dim)
                experts_b2 = randn(expert_num, 1, emb_dim).astype(np.float32)
                exported["experts_b2"] = experts_b2

                # Build a uniform one-hot top-1 router for THIS configuration.
                # Previously the loop reused whatever `router` was left over from
                # an earlier cell, whose shape does not match these sizes, and
                # the router was never saved alongside the data.
                token_ids = np.arange(token_num)
                router = np.zeros((token_num, expert_num), dtype=np.float32)
                router[token_ids, token_ids * expert_num // token_num] = 1
                np.random.shuffle(router)  # shuffles rows, i.e. tokens
                # (expert_num, token_num), made C-contiguous like the other arrays
                router = np.ascontiguousarray(np.transpose(router))
                exported["router"] = router

                # (token_num, emb_dim): if a word doesn't go through any expert it will still have some value
                total_output = b * words

                # Repeat for every expert, looking at the router to determine which words go to said expert
                for n, (expert_w1, expert_b1, expert_w2, expert_b2, mask) in enumerate(
                        zip(experts_w1, experts_b1, experts_w2, experts_b2, router)):

                    # Row indices of all words whose mask entry for this expert is non-zero
                    routed = mask.nonzero()
                    expert_input = words[routed]

                    # Two dense layers, each followed by ReLU
                    expert_output1 = np.maximum(0.0, np.add(np.matmul(expert_input, expert_w1), expert_b1))
                    expert_output2 = np.maximum(0.0, np.add(np.matmul(expert_output1, expert_w2), expert_b2))

                    # Assignment (not addition): the expert result replaces the bypass value
                    total_output[routed] = expert_output2

                    exported[f"expert_{n}_src"] = expert_input
                    exported[f"expert_{n}_dst"] = expert_output2

                exported["dst"] = total_output

                file_name = "exp-num_" + str(expert_num) + "_exp-dim_" + str(expert_dim) + "_emb-dim_" + str(emb_dim) + "_token-num_" + str(token_num)
                np.savez_compressed(file_name + ".npz", **exported)
                print("Finished ", file_name)

Finished  exp-num_8_exp-dim_256_emb-dim_256_token-num_128
Finished  exp-num_8_exp-dim_256_emb-dim_256_token-num_256
Finished  exp-num_8_exp-dim_256_emb-dim_256_token-num_512
Finished  exp-num_8_exp-dim_256_emb-dim_256_token-num_1024
Finished  exp-num_8_exp-dim_256_emb-dim_256_token-num_2048
Finished  exp-num_8_exp-dim_256_emb-dim_256_token-num_4096
Finished  exp-num_8_exp-dim_256_emb-dim_256_token-num_8192
Finished  exp-num_8_exp-dim_256_emb-dim_512_token-num_128
Finished  exp-num_8_exp-dim_256_emb-dim_512_token-num_256
Finished  exp-num_8_exp-dim_256_emb-dim_512_token-num_512
Finished  exp-num_8_exp-dim_256_emb-dim_512_token-num_1024
Finished  exp-num_8_exp-dim_256_emb-dim_512_token-num_2048
Finished  exp-num_8_exp-dim_256_emb-dim_512_token-num_4096
Finished  exp-num_8_exp-dim_256_emb-dim_512_token-num_8192
Finished  exp-num_8_exp-dim_256_emb-dim_1024_token-num_128
Finished  exp-num_8_exp-dim_256_emb-dim_1024_token-num_256
Finished  exp-num_8_exp-dim_256_emb-dim_1024_token-num_512
Fin

Finished  exp-num_8_exp-dim_2048_emb-dim_4096_token-num_4096
Finished  exp-num_8_exp-dim_2048_emb-dim_4096_token-num_8192
Finished  exp-num_8_exp-dim_4096_emb-dim_256_token-num_128
Finished  exp-num_8_exp-dim_4096_emb-dim_256_token-num_256
Finished  exp-num_8_exp-dim_4096_emb-dim_256_token-num_512
Finished  exp-num_8_exp-dim_4096_emb-dim_256_token-num_1024
Finished  exp-num_8_exp-dim_4096_emb-dim_256_token-num_2048
Finished  exp-num_8_exp-dim_4096_emb-dim_256_token-num_4096
Finished  exp-num_8_exp-dim_4096_emb-dim_256_token-num_8192
Finished  exp-num_8_exp-dim_4096_emb-dim_512_token-num_128
Finished  exp-num_8_exp-dim_4096_emb-dim_512_token-num_256
Finished  exp-num_8_exp-dim_4096_emb-dim_512_token-num_512
Finished  exp-num_8_exp-dim_4096_emb-dim_512_token-num_1024
Finished  exp-num_8_exp-dim_4096_emb-dim_512_token-num_2048
Finished  exp-num_8_exp-dim_4096_emb-dim_512_token-num_4096
Finished  exp-num_8_exp-dim_4096_emb-dim_512_token-num_8192
Finished  exp-num_8_exp-dim_4096_emb-dim_102

Finished  exp-num_16_exp-dim_1024_emb-dim_4096_token-num_256
Finished  exp-num_16_exp-dim_1024_emb-dim_4096_token-num_512
Finished  exp-num_16_exp-dim_1024_emb-dim_4096_token-num_1024
Finished  exp-num_16_exp-dim_1024_emb-dim_4096_token-num_2048
Finished  exp-num_16_exp-dim_1024_emb-dim_4096_token-num_4096
Finished  exp-num_16_exp-dim_1024_emb-dim_4096_token-num_8192
Finished  exp-num_16_exp-dim_2048_emb-dim_256_token-num_128
Finished  exp-num_16_exp-dim_2048_emb-dim_256_token-num_256
Finished  exp-num_16_exp-dim_2048_emb-dim_256_token-num_512
Finished  exp-num_16_exp-dim_2048_emb-dim_256_token-num_1024
Finished  exp-num_16_exp-dim_2048_emb-dim_256_token-num_2048
Finished  exp-num_16_exp-dim_2048_emb-dim_256_token-num_4096
Finished  exp-num_16_exp-dim_2048_emb-dim_256_token-num_8192
Finished  exp-num_16_exp-dim_2048_emb-dim_512_token-num_128
Finished  exp-num_16_exp-dim_2048_emb-dim_512_token-num_256
Finished  exp-num_16_exp-dim_2048_emb-dim_512_token-num_512
Finished  exp-num_16_exp-d