In [3]:
# quantize
# convert fp32 to int8
# ref：https://wiki.sophgo.com/pages/viewpage.action?pageId=59598822

import math
import copy
import numpy as np

data = np.random.randn(224)
fp32_data = copy.deepcopy(data)

mode = "int8"

if mode == "int16":
    upper = 32767
    lower = -32768
if mode == "int8":
    upper = 127-
    lower = -128
elif mode == "int4":
    upper = 7
    lower = -8

threshold = 1

inv_scale = upper / threshold

for i, d in enumerate(data):
    data[i] = math.floor(data[i] * inv_scale + 0.5)
    data[i] = data[i] if data[i] < upper else upper
    data[i] = data[i] if data[i] > lower else lower

In [21]:
inv_scale, max(data), min(data)

(127.0, 127.0, -128.0)

In [22]:
# dequantize
# convert int8 to fp32
mode = "int8"

threshold = 1
if mode == "int16":
    scale = threshold/32767
if mode == "int8":
    scale = threshold/127
elif mode == "int4":
    scale = threshold/7

for i, d in enumerate(data):
    data[i] = data[i] * scale

In [23]:
def cos_sim(a, b):
  """计算两个向量a和b的余弦相似度"""
  
  a = np.array(a) 
  b = np.array(b)

  inner_product = np.dot(a, b)
  # 内积
  norm_a = np.linalg.norm(a)  
  norm_b = np.linalg.norm(b)
  # 模长
  cos_sim = inner_product / (norm_a * norm_b)

  return cos_sim

cos_sim(data, fp32_data)

0.9584787901668528

In [4]:
# matrix matmul
# [1,768] x [768,1000] = [1,1000]
# int8_matrix_A = floor(127 / threshold1 * matrix_A)
# int8_matrix_B = floor(127 / threshold2 * matrix_B)
# int32_matrix_C = matmul(int8_matrix_A, int8_matrix_B)
# float32_matrix_C = (threshold1 * threshold2) / (127 * 127) * int32_matrix_C
import math
import copy
import numpy as np

def symmetrical_quantize(inputs, threshold, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    if mode == "int16":
        upper = 32767
        lower = -32768
    if mode == "int8":
        upper = 127
        lower = -128
    elif mode == "int4":
        upper = 7
        lower = -8
    inv_scale = upper / threshold
    for i, d in enumerate(data):
        data[i] = math.floor(data[i] * inv_scale + 0.5)
        data[i] = data[i] if data[i] < upper else upper
        data[i] = data[i] if data[i] > lower else lower
    return data.reshape(shape)

def symmetrical_dequantize(inputs, threshold, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    if mode == "int16":
        scale = threshold/32767
    if mode == "int8":
        scale = threshold/127
    elif mode == "int4":
        scale = threshold/7
    for i, d in enumerate(data):
        data[i] = data[i] * scale
    return data.reshape(shape)

def symmetrical_dequantize_for_matmul(inputs, threshold1, threshold2, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    if mode == "int16":
        scale = threshold1/32767 * threshold2/32767
    if mode == "int8":
        scale = threshold1/127 * threshold2/127
    elif mode == "int4":
        scale = threshold1/7 * threshold2/7
    for i, d in enumerate(data):
        data[i] = data[i] * scale
    return data.reshape(shape)

def cos_sim(a, b):
    a = np.array(a)
    b = np.array(b)
    inner_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)  
    norm_b = np.linalg.norm(b)
    cos_sim = inner_product / (norm_a * norm_b)
    return cos_sim

In [18]:
## matmul example 1
tensor = np.array([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9]])
weight = np.array([[11,12,13],[14,15,16],[17,18,19]])
result = np.matmul(tensor, weight)

print(np.max(tensor))
print(np.max(weight))
print(np.min(tensor))
print(np.min(weight))

# when the threshold = 1, the error between int8_result and result is too large
threshold1 = 1
threshold2 = 20

int8_tensor = symmetrical_quantize(tensor, threshold=threshold1)
int8_weight = symmetrical_quantize(weight, threshold=threshold2)
intermediate_result = np.matmul(int8_tensor, int8_weight)
int8_result = symmetrical_dequantize_for_matmul(intermediate_result, threshold1=threshold1, threshold2=threshold2)

print(cos_sim(result.flatten(), int8_result.flatten()))
result[:,:10], int8_result[:,:10]

0.9
19
0.1
11
0.9999955822144798


(array([[ 9. ,  9.6, 10.2],
        [21.6, 23.1, 24.6],
        [34.2, 36.6, 39. ]]),
 array([[ 8.97637795,  9.54181908, 10.2015004 ],
        [21.66780334, 23.08884618, 24.74672949],
        [34.2488685 , 36.51807304, 39.16547833]]))

In [5]:
## mul example 1
tensor = np.array([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9]])
weight = np.array([[11,12,13],[14,15,16],[17,18,19]])
result = tensor * weight

print(np.max(tensor))
print(np.max(weight))
print(np.min(tensor))
print(np.min(weight))

# when the threshold = 1, the error between int8_result and result is too large
threshold1 = 1
threshold2 = 20

int8_tensor = symmetrical_quantize(tensor, threshold=threshold1)
int8_weight = symmetrical_quantize(weight, threshold=threshold2)
intermediate_result = int8_tensor * int8_weight
int8_result = symmetrical_dequantize_for_matmul(intermediate_result, threshold1=threshold1, threshold2=threshold2)

print(cos_sim(result.flatten(), int8_result.flatten()))
result[:,:10], int8_result[:,:10]

0.9
19
0.1
11
0.9999972695330366


(array([[ 1.1,  2.4,  3.9],
        [ 5.6,  7.5,  9.6],
        [11.9, 14.4, 17.1]]),
 array([[ 1.12840226,  2.35600471,  3.91096782],
        [ 5.62837126,  7.53921508,  9.61249922],
        [11.91890384, 14.41874884, 17.10459421]]))

In [19]:
int8_tensor

array([[ 13.,  25.,  38.],
       [ 51.,  64.,  76.],
       [ 89., 102., 114.]])

In [80]:
tensor = np.random.randn(1,768)
weight = np.random.randn(768,1000)
result = np.matmul(tensor, weight)

print(np.max(tensor))
print(np.max(weight))
print(np.min(tensor))
print(np.min(weight))

# when the threshold = 1, the error between int8_result and result is too large
threshold1 = 5
threshold2 = 5

int8_tensor = symmetrical_quantize(tensor, threshold=threshold1)
int8_weight = symmetrical_quantize(weight, threshold=threshold2)
intermediate_result = np.matmul(int8_tensor, int8_weight)
int8_result = symmetrical_dequantize_for_matmul(intermediate_result, threshold1=threshold1, threshold2=threshold2)

print(cos_sim(result.flatten(), int8_result.flatten()))
result[:,:10], int8_result[:,:10]

2.988993216483128
4.458080497811677
-2.9289277392894566
-5.147599178197374
0.9998726066226508


(array([[-73.75150982,  -4.31813058,  12.57974629, -10.68935737,
          10.77199455, -13.19095254, -17.71379844,  -8.98059361,
           8.31790299, -46.41506526]]),
 array([[-74.75664951,  -4.48105896,  11.92417385, -10.75547151,
          10.57722115, -12.57827516, -17.96298593,  -9.61311923,
           7.98561597, -46.09399219]]))

In [13]:
# matrix add
# [1,197,768] + [1,197,768] = [1,197,768]
# int8_matrix_A = floor(127 / threshold1 * matrix_A)
# int8_matrix_B = floor(127 / threshold2 * matrix_B)
# int32_matrix_C = int8_matrix_A + int8_matrix_B
# float32_matrix_C = (threshold1/127 + threshold2/127)/2 * int32_matrix_C
import math
import copy
import numpy as np

def symmetrical_dequantize_for_add(inputs, threshold1, threshold2, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    if mode == "int16":
        scale = 1/(32767/threshold1 + 32767/threshold2)
    if mode == "int8":
        scale = (threshold1/127 + threshold2/127)/2
    elif mode == "int4":
        scale = 1/(7/threshold1 + 7/threshold2)
    for i, d in enumerate(data):
        data[i] = data[i] * scale
    return data.reshape(shape)

tensor = np.random.randn(1,197,768)
weight = np.random.randn(1,197,768) - 2
result = tensor + weight # add

print(np.max(tensor))
print(np.max(weight))
print(np.min(tensor))
print(np.min(weight))

threshold1 = 5
threshold2 = 6

int8_tensor = symmetrical_quantize(tensor, threshold=threshold1)
int8_weight = symmetrical_quantize(weight, threshold=threshold2)
intermediate_result = int8_tensor + int8_weight # add
int8_result = symmetrical_dequantize_for_add(intermediate_result, threshold1=threshold1, threshold2=threshold2)

print(cos_sim(result.flatten(), int8_result.flatten()))
result[0,0,:10], int8_result[0,0,:10]

4.317824502061027
2.6988657048137092
-4.623954706232497
-6.359733633120778
0.997362705519197


(array([-1.02138361, -2.85443906, -0.91101856, -1.36218677, -3.45801758,
        -2.88252643, -1.05356366, -1.41004089,  1.6032474 , -1.81443685]),
 array([-0.38654259, -1.35289907, -0.40801718, -0.68718683, -1.63206872,
        -1.26700072, -0.32211883, -0.60128848,  0.88045812, -0.77308518]))

In [22]:
## add example 1
tensor = np.array([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9]])
weight = np.array([[11,12,13],[14,15,16],[17,18,19]])
result = tensor + weight

print(np.max(tensor))
print(np.max(weight))
print(np.min(tensor))
print(np.min(weight))

# when the threshold = 1, the error between int8_result and result is too large
threshold1 = 1
threshold2 = 20

int8_tensor = symmetrical_quantize(tensor, threshold=threshold1)
int8_weight = symmetrical_quantize(weight, threshold=threshold2)
intermediate_result = int8_tensor + int8_weight
int8_result = symmetrical_dequantize_for_add(intermediate_result, threshold1=threshold1, threshold2=threshold2)

print(cos_sim(result.flatten(), int8_result.flatten()))
result[:,:10], int8_result[:,:10]

0.9
19
0.1
11
0.9929282921942728


(array([[11.1, 12.2, 13.3],
        [14.4, 15.5, 16.6],
        [17.7, 18.8, 19.9]]),
 array([[0.6224222 , 0.75740532, 0.90738658],
        [1.04986877, 1.19235096, 1.33483315],
        [1.47731534, 1.61979753, 1.76227972]]))

In [103]:
# matrix sqrt
# sqrt([1,197,768]) = [1,197,768]
# int8_matrix_A = floor(255 / threshold1 * matrix_A)
# int32_matrix_C = sqrt(int8_matrix_A)
# float32_matrix_C = sqrt(threshold1/127) * int32_matrix_C
import math
import copy
import numpy as np

def symmetrical_quantize_for_sqrt(inputs, threshold, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    upper_coeff_dict = {"int16": 65535, "int8": 255, "int4": 15}
    upper = upper_coeff_dict[mode]
    lower = 0
    
    inv_scale = upper / threshold
    for i, d in enumerate(data):
        data[i] = math.floor(data[i] * inv_scale + 0.5)
        data[i] = data[i] if data[i] < upper else upper
        data[i] = data[i] if data[i] > lower else lower
    return data.reshape(shape)

def symmetrical_dequantize_for_sqrt(inputs, threshold1, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    coeff_dict = {"int16": 65535, "int8": 255, "int4": 15}
    scale = math.sqrt(threshold1 / coeff_dict[mode])
    data = data * scale
    return data.reshape(shape)

tensor = np.clip(np.random.randn(1,197,768),0.2,1)
result = np.sqrt(tensor) # sqrt

print(np.max(tensor))
print(np.max(weight))
print(np.min(tensor))
print(np.min(weight))

threshold1 = 5

int8_tensor = symmetrical_quantize_for_sqrt(tensor, threshold=threshold1)
intermediate_result = np.sqrt(int8_tensor) # sqrt
int8_result = symmetrical_dequantize_for_sqrt(intermediate_result, threshold1=threshold1)

print(cos_sim(result.flatten(), int8_result.flatten()))
result[0,0,:10], int8_result[0,0,:10]

# TODO: use select from table to sqrt ( replace SqrtOp and DequantizeOp with TableOP)

1.0
4.542552793674177
0.2
-4.644645792432321
0.999984987341572


(array([0.4472136 , 1.        , 0.57172395, 0.4472136 , 0.4472136 ,
        0.4472136 , 0.66295482, 0.4472136 , 0.4472136 , 1.        ]),
 array([0.44280744, 1.        , 0.57735027, 0.44280744, 0.44280744,
        0.44280744, 0.65678958, 0.44280744, 0.44280744, 1.        ]))

### SQRT table

In [18]:
# matrix sqrt
# sqrt([1,197,768]) = [1,197,768]
# int8_matrix_A = floor(255 / threshold1 * matrix_A)
# int32_matrix_C = sqrt(int8_matrix_A)
# float32_matrix_C = sqrt(threshold1/127) * int32_matrix_C
import math
import copy
import numpy as np

def symmetrical_quantize_for_sqrt(inputs, threshold, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    upper_coeff_dict = {"int16": 65535, "int8": 255, "int4": 15}
    upper = upper_coeff_dict[mode]
    lower = 0
    
    inv_scale = upper / threshold
    for i, d in enumerate(data):
        data[i] = math.floor(data[i] * inv_scale + 0.5)
        data[i] = data[i] if data[i] < upper else upper
        data[i] = data[i] if data[i] > lower else lower
    return data.reshape(shape)

def symmetrical_dequantize_for_sqrt(inputs, threshold1, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    coeff_dict = {"int16": 65535, "int8": 255, "int4": 15}
    # import pdb;pdb.set_trace()
    scale = math.sqrt(threshold1 / coeff_dict[mode])
    data = data * scale
    return data.reshape(shape)

def symmetrical_dequantize_for_sqrt_table(inputs, threshold1, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    coeff_dict = {"int16": 65535, "int8": 255, "int4": 15}
    
    scale = math.sqrt(threshold1 / coeff_dict[mode])
    table = [np.sqrt(d) * scale for d in range(0,256)]
    data = np.array([table[i] for i in data.astype(np.int8)])
    return data.reshape(shape)

tensor = np.array([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9]])
result = np.sqrt(tensor) # sqrt


print(np.max(tensor))
print(np.max(weight))
print(np.min(tensor))
print(np.min(weight))

threshold1 = 5

int8_tensor = symmetrical_quantize_for_sqrt(tensor, threshold=threshold1)
int8_result = symmetrical_dequantize_for_sqrt_table(int8_tensor, threshold1=threshold1)

print(cos_sim(result.flatten(), int8_result.flatten()))
result[:], int8_result[:]

# TODO: use select from table to sqrt ( replace SqrtOp and DequantizeOp with TableOP)

0.9
19
0.1
11
0.9999796929976292


(array([[0.31622777, 0.4472136 , 0.54772256],
        [0.63245553, 0.70710678, 0.77459667],
        [0.83666003, 0.89442719, 0.9486833 ]]),
 array([[0.31311215, 0.44280744, 0.54232614],
        [0.62622429, 0.71400555, 0.77964295],
        [0.84016805, 0.89661673, 0.94971616]]))

In [16]:
int8_tensor

array([[ 5., 10., 15.],
       [20., 26., 31.],
       [36., 41., 46.]])

In [11]:
int8_tensor

array([[ 5., 10., 15.],
       [20., 26., 31.],
       [36., 41., 46.]])

### GELU table

原理：用gelu (x大于1时逼近y=x这个线性函数，用线性去取代非线性)

In [84]:
# matrix sqrt
# sqrt([1,197,768]) = [1,197,768]
# int8_matrix_A = floor(255 / threshold1 * matrix_A)
# int32_matrix_C = sqrt(int8_matrix_A)
# float32_matrix_C = sqrt(threshold1/127) * int32_matrix_C
import math
import copy
import numpy as np

def symmetrical_quantize_for_gelu_table(inputs, threshold, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    upper_coeff_dict = {"int16": 65535, "int8": 127, "int4": 15}
    upper = upper_coeff_dict[mode]
    lower = -128
    
    inv_scale = upper / threshold
    for i, d in enumerate(data):
        data[i] = math.floor((data[i]) * inv_scale  + 0.5)
        data[i] = data[i] if data[i] < upper else upper
        data[i] = data[i] if data[i] > lower else lower
    return data.reshape(shape)

def gelu(x):
    return 0.5 * x * (1 + np.tanh(np.sqrt(2/np.pi) * (x + 0.044715 * np.power(x, 3))))

def symmetrical_dequantize_for_gelu_table(inputs, threshold, mode="int8"):
    shape = inputs.shape
    data = inputs.flatten()
    coeff_dict = {"int16": 65535, "int8": 127, "int4": 15}
    
    scale = threshold1 / coeff_dict[mode]
    inv_scale = 1 / scale
    table = [gelu(d * threshold/128 - threshold) for d in range(0,256)]
    data = np.array([(table[i + 128]) for i in data.astype(np.int8)])
    return data.reshape(shape)

tensor = np.array([[-9.0,-8.0,-7.0],[-1.9,-1.8,-1.7],[0.4,0.5,0.6],[-0.7,-0.8,-0.9]])
result = gelu(tensor) # gelu


print(np.max(tensor))
print(np.min(tensor))

threshold1 = 10

int8_tensor = symmetrical_quantize_for_gelu_table(tensor, threshold=threshold1)
int8_result = symmetrical_dequantize_for_gelu_table(int8_tensor, threshold=threshold1)

result[:], int8_result[:]

# TODO: use select from table to sqrt ( replace SqrtOp and DequantizeOp with TableOP)

0.6
-9.0


(array([[-0.00000000e+00, -0.00000000e+00, -2.33146835e-15],
        [-5.45487655e-02, -6.47404732e-02, -7.58941044e-02],
        [ 2.62161169e-01,  3.45714010e-01,  4.35415199e-01],
        [-1.69429865e-01, -1.69568309e-01, -1.65771511e-01]]),
 array([[-0.00000000e+00, -0.00000000e+00, -3.85975973e-15],
        [-5.70009644e-02, -6.50750546e-02, -7.37353375e-02],
        [ 2.54666078e-01,  3.18912571e-01,  4.58721150e-01],
        [-1.69501454e-01, -1.69865481e-01, -1.67738965e-01]]))

In [86]:
  for (auto i = min_th; i <= max_th; i++) {
    float step = func(i * threshold/127. - threshold);
    float tmp = std::clamp(floor(step * 255./threshold - 128.), -128., 127.);
    int8_t data = static_cast<int8_t>(tmp);
    table.push_back(data);
  }

array([[-114., -102.,  -89.],
       [ -24.,  -23.,  -22.],
       [   5.,    6.,    8.],
       [  -9.,  -10.,  -11.]])

In [17]:
import math
import numpy as np

def lowering_asymetric(data, in_scale, in_zp, upper, lower):
    for i, d in enumerate(data):
        data[i] = math.floor(data[i] / in_scale + in_zp)
        data[i] = data[i] if data[i] < upper else upper
        data[i] = data[i] if data[i] > lower else lower
    return data

def gelu(x):
    return 0.5 * x * (1 + np.tanh(np.sqrt(2/np.pi) * (x + 0.044715 * np.power(x, 3))))



def getScaleAndZeropoint(fmax, fmin, qmax, qmin):
    scale = (fmax - fmin) / (qmax - qmin)
    zp = -fmin/scale + qmin
    return scale, zp

def cos_sim(a, b):
    a = np.array(a)
    b = np.array(b)
    inner_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)  
    norm_b = np.linalg.norm(b)
    cos_sim = inner_product / (norm_a * norm_b)
    return cos_sim


def table_op(tensor, in_scale, in_zp, out_scale, out_zp, qmax, qmin):
    int8_tensor = lowering_asymetric(tensor.flatten(), in_scale, in_zp, qmax, qmin)
    print(int8_tensor.astype(np.int8))
    table = []
    for i in range(qmin,qmax+1):
        step = (i - in_zp) * in_scale
        table.append(np.clip(np.floor(gelu(step) / out_scale + out_zp), -128, 127))
    # lut
    print(len(table))
    lookup_value  = np.array([table[i+128] for i in int8_tensor.astype(np.int8)])
    return (lookup_value - out_zp) * out_scale


tensor = np.array([[-9.0,-8.0,-7.0],[-1.9,-1.8,-1.7],[49.4,49.5,49.6],[-0.7,-0.8,-0.9],[100.7,100.8,100.9]])

large_posi = np.clip(tensor, 100, 10000)

in_fmin = 0
in_fmax = 50
out_fmin = -1
out_fmax = 50
qmax = 127
qmin = -128
in_scale, in_zp = getScaleAndZeropoint(in_fmax, in_fmin, qmax, qmin)
out_scale, out_zp = getScaleAndZeropoint(out_fmax, out_fmin, qmax, qmin)
posi = table_op(np.clip(tensor, 0, 100), in_scale, in_zp, out_scale, out_zp, qmax, qmin)

in_fmin = -9
in_fmax = 0
out_fmin = -1
out_fmax = 50
qmax = 127
qmin = -128
in_scale, in_zp = getScaleAndZeropoint(in_fmax, in_fmin, qmax, qmin)
out_scale, out_zp = getScaleAndZeropoint(out_fmax, out_fmin, qmax, qmin)
neg = table_op(np.clip(tensor, -100, 0), in_scale, in_zp, out_scale, out_zp, qmax, qmin)

[-128 -128 -128 -128 -128 -128  123  124  124 -128 -128 -128  127  127
  127]
256
[-128 -100  -72   73   76   78  127  127  127  107  104  101  127  127
  127]
256


In [18]:
cos_sim(posi + neg + large_posi, np.array([gelu(d) for d in tensor.flatten()]))

ValueError: operands could not be broadcast together with shapes (15,) (5,3) 

In [75]:
cos_sim(posi + neg, np.array([gelu(d) for d in tensor.flatten()]))

0.9999973385304844

In [11]:
gelu(3.4)

3.399094149016546

In [84]:
gelu(0.5)

0.34571400982514394

In [85]:
gelu(1)

0.8411919906082768

In [58]:
in_fmin = -9
in_fmax = 1
out_fmin = -1
out_fmax = 1
qmax = 127
qmin = -128
in_scale, in_zp = getScaleAndZeropoint(in_fmax, in_fmin, qmax, qmin)
out_scale, out_zp = getScaleAndZeropoint(out_fmax, out_fmin, qmax, qmin)
posi = table_op(np.clip(tensor, 0, 100), in_scale, in_zp, out_scale, out_zp, qmax, qmin)

[101 101 101 101 101 101 111 114 116 101 101 101]
256


In [59]:
cos_sim(posi, np.array([gelu(d) for d in tensor.flatten()]))

0.9100494021727936

In [44]:
posi

[-0.0,
 -0.0,
 -2.3314683517128287e-15,
 -0.05454876554466734,
 -0.06474047315407029,
 -0.07589410444884379,
 0.2621611694273562,
 0.34571400982514394,
 0.43541519923081473,
 -0.16942986529488321,
 -0.1695683085635519,
 -0.16577151129027945]

array([-0.00784314, -0.00784314, -0.00784314, -0.09411765, -0.09411765,
       -0.09411765,  0.20784314,  0.29411765,  0.38039216, -0.18039216,
       -0.18039216, -0.18039216])

In [10]:
gelu(0)

0.0

In [14]:
gelu(10)

10.0

In [13]:
gelu(2550)

2550.0

In [47]:
gelu(1*8)

8.0

In [46]:
gelu(1) * gelu(8)

6.729535924866214

In [36]:
int8_tensor

array([[0.0140028 , 0.0280056 , 0.0420084 ],
       [0.0560112 , 0.070014  , 0.08401681],
       [0.09801961, 0.11202241, 0.12602521]])

In [24]:
tensor = np.array([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9]])
result = 0.5 * tensor * (1 + np.tanh(np.sqrt(2/np.pi) * (tensor + 0.044715 * np.power(tensor, 3))))

In [25]:
result

array([[0.05398275, 0.11585143, 0.18537092],
       [0.26216117, 0.34571401, 0.4354152 ],
       [0.53057013, 0.63043169, 0.73422849]])