# Set-up: Connect to Drive & Import

In [1]:
!git clone https://github.com/eden-chung/RNEAPytorch.git

Cloning into 'RNEAPytorch'...
remote: Enumerating objects: 106, done.[K
remote: Counting objects: 100% (106/106), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 106 (delta 26), reused 91 (delta 14), pack-reused 0[K
Receiving objects: 100% (106/106), 389.28 KiB | 13.90 MiB/s, done.
Resolving deltas: 100% (26/26), done.


In [2]:
%cd RNEAPytorch

/content/RNEAPytorch


# Set-up: Install CuPy and Relevant Libraries

In [3]:
import numpy as np
from timeit import default_timer as timer
import torch

# Set-up: Extract Vectors/Matrices from Robot Object

In [4]:
from URDFParser import URDFParser
from URDFParser import Robot
from util import parseInputs, printUsage, validateRobot, initializeValues, printErr
from RBDReference import RBDReference
from GRiDCodeGenerator import GRiDCodeGenerator

In [5]:
parser = URDFParser()
robot = parser.parse('/content/RNEAPytorch/URDFParser/iiwa.urdf')

validateRobot(robot)

reference = RBDReference(robot)
#n is the number of joints
q, qd, u, n = initializeValues(robot, MATCH_CPP_RANDOM = True)

print("q", q)
print("qd", qd)
print("u", u)
print("n", n)

Link [base] does not have an origin. Assuming this is the fixed world base frame. Else there is an error with your URDF file.
Link [base] does not have inertial properties. Assuming this is the fixed world base frame. Else there is an error with your URDF file.
------------------------------------------
Assumed Input Joint Configuration Ordering
------------------------------------------
iiwa_joint_1
iiwa_joint_2
iiwa_joint_3
iiwa_joint_4
iiwa_joint_5
iiwa_joint_6
iiwa_joint_7
----------------------------
Total of n = 7 joints
----------------------------
q [-0.3369  1.2966 -0.6775 -1.4218 -0.7067 -0.135  -1.1495]
qd [ 0.433  -0.4216 -0.6454 -1.8605 -0.0131 -0.4583  0.7412]
u [ 0.7418  1.9284 -0.9039  0.0334  1.1799 -1.946   0.3287]
n 7


In [6]:
#initialize robot matrices & vectors
parent_id_arr = []
S_arr = []
Imat_arr = []
for ind in range(n):
  parent_id_arr.append(robot.get_parent_id(ind))
  S_arr.append(robot.get_S_by_id(ind).astype(np.float64))
  Imat_arr.append(robot.get_Imat_by_id(ind))

parent_id_arr = np.array(parent_id_arr)
print("parent_id_arr shape:", parent_id_arr.shape)
S_arr = np.array(S_arr)
print("S_arr shape:", S_arr.shape)
Imat_arr = np.array(Imat_arr)
print("Imat_arr shape:", Imat_arr.shape)

parent_id_arr shape: (7,)
S_arr shape: (7, 6)
Imat_arr shape: (7, 6, 6)


In [7]:
#write xmat functions to file
import array
import sys
import os
import inspect

for ind in range(n):
  with open(f'/content/xmat{ind}.py', 'w') as f:
      original_stdout = sys.stdout
      sys.stdout = f
      try:
          print("from numpy import array, sin, cos")
          print()
          content = robot.get_Xmat_Func_by_id(ind)
          source_code = inspect.getsource(content)
          print(source_code)
          # print(content)
      finally:
          sys.stdout = original_stdout
          f.close()

In [8]:
#store xmat functions into xmat array
py_file_location = "/content"
sys.path.append(os.path.abspath(py_file_location))
import xmat0, xmat1, xmat2, xmat3, xmat4, xmat5, xmat6

xmat_func_arr = []
xmat_func_arr.append(xmat0._lambdifygenerated(q[ind]))
xmat_func_arr.append(xmat1._lambdifygenerated(q[ind]))
xmat_func_arr.append(xmat2._lambdifygenerated(q[ind]))
xmat_func_arr.append(xmat3._lambdifygenerated(q[ind]))
xmat_func_arr.append(xmat4._lambdifygenerated(q[ind]))
xmat_func_arr.append(xmat5._lambdifygenerated(q[ind]))
xmat_func_arr.append(xmat6._lambdifygenerated(q[ind]))

xmat_func_arr = np.array(xmat_func_arr)
print("xmat_func_arr shape:", xmat_func_arr.shape)

xmat_func_arr shape: (7, 6, 6)


# Helper functions for RNEA: cross_operator, mxS, vxIv

# **Cross Operator**

Batched Cross Operator function

In [9]:
def cross_operator_batched(d_vec, d_output):

      d_output[0, 1, :] = -d_vec[2, :]
      d_output[0, 2, :] = d_vec[1, :]
      d_output[1, 0, :] = d_vec[2, :]
      d_output[1, 2, :] = -d_vec[0, :]
      d_output[2, 0, :] = -d_vec[1, :]
      d_output[2, 1, :] = d_vec[0, :]

      d_output[3, 1, :] = -d_vec[5, :]
      d_output[3, 2, :] = d_vec[4, :]
      d_output[3, 4, :] = -d_vec[2, :]
      d_output[3, 5, :] = d_vec[1, :]
      d_output[4, 0, :] = d_vec[5, :]
      d_output[4, 2, :] = -d_vec[3, :]
      d_output[4, 3, :] = d_vec[2, :]
      d_output[4, 5, :] = -d_vec[0, :]
      d_output[5, 0, :] = -d_vec[4, :]
      d_output[5, 1, :] = d_vec[3, :]
      d_output[5, 3, :] = -d_vec[1, :]
      d_output[5, 4, :] = d_vec[0, :]

In [10]:
# COMPARING to batched
batch_size = 100
h_vec_batched = np.ones((6,batch_size),  dtype=np.float64)
h_output_batched = np.zeros((6, 6, batch_size), dtype=np.float64)

#on CPU
cross_operator_batched(h_vec_batched, h_output_batched) #warm-up once
print("cross operator output shape: ", h_output_batched.shape)
startnext = timer()
for i in range(100):
  cross_operator_batched(h_vec_batched, h_output_batched)
print("CPU Batched No JIT: " + str(timer() - startnext))

cross operator output shape:  (6, 6, 100)
CPU Batched No JIT: 0.0035563180000082184


# **mxS**

In [20]:
def mxS_numpy(S, vec, vec_output, mxS_output, alpha):
        # returns the spatial cross product between vectors S and vec. vec=[v0, v1 ... vn] and S = [s0, s1, s2, s3, s4, s5]
        # derivative of spatial motion vector = v x m

        if alpha is None:
          alpha = 1
        cross_operator_batched(vec, vec_output)
        mxS_output = alpha * np.sum(vec_output * S, axis=1)


In [21]:
def mxS_pytorch(S, vec, vec_output, mxS_output=None, alpha=None):
    # returns the spatial cross product between vectors S and vec. vec=[v0, v1 ... vn] and S = [s0, s1, s2, s3, s4, s5]
    # derivative of spatial motion vector = v x m

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if alpha is None:
        alpha = 1.0
    else:
        alpha = torch.tensor(alpha, device=device)

    S = torch.as_tensor(S, device=device)
    vec = torch.as_tensor(vec, device=device)
    vec_output = torch.as_tensor(vec_output, device=device)
    if mxS_output is not None:
        mxS_output = torch.as_tensor(mxS_output, device=device)

    cross_operator_batched(vec, vec_output)

    mxS_output = alpha * torch.sum(vec_output * S, dim=1)

    return mxS_output

**Testing the Batched mxS Method on CPU vs. GPU with CuPy**

In [22]:
# COMPARING to batched
batch_size = 100
# vec is a 6 by 1 matrix
h_vec_batched = np.ones((6,batch_size),  dtype=np.float64)

#S should be a 6 by 1 matrix
h_s_vec_batched = np.ones((6, 1, batch_size),  dtype=np.float64)

#vec output is a 6 by 6 matrix
h_output_batched = np.zeros((6, 6, batch_size), dtype=np.float64)

#mxS output is a 6 by 6 matrix
#TODO: mxS output should be (6, ) not (6, 6), need to change in CPU batched
h_mxS_output_batched = np.zeros((6, batch_size), dtype=np.float64)

alpha = 0.1

#CPU/with np.tensordot
mxS_numpy(h_s_vec_batched, h_vec_batched, h_output_batched, h_mxS_output_batched, alpha) #warm-up once
print("Shape of CPU mxS output: ", h_mxS_output_batched.shape)
startnext = timer()
for i in range(100):
  mxS_numpy(h_s_vec_batched, h_vec_batched, h_output_batched, h_mxS_output_batched, alpha)
print("CPU time: " + str(timer() - startnext))

mxS_pytorch(h_s_vec_batched, h_vec_batched, h_output_batched, h_mxS_output_batched, alpha) #warm-up once
print("Shape of GPU mxS output: ", h_mxS_output_batched.shape)
startnext = timer()
for i in range(100):
  mxS_pytorch(h_s_vec_batched, h_vec_batched, h_output_batched, h_mxS_output_batched, alpha)
print("GPU time: " + str(timer() - startnext))


Shape of CPU mxS output:  (6, 100)
CPU time: 0.0036302350000028127
Shape of GPU mxS output:  (6, 100)
GPU time: 0.06895751199999722


# vxIv

**Batched vxIv Method**

In [25]:
def vxIv_numpy(vec, Imat, res, batch_size):
        temp = np.sum(Imat * vec[:, np.newaxis, :], axis=1)

        temp = np.asarray(temp).reshape(-1)


        vecXIvec = np.zeros((6, batch_size), dtype=np.float64)
        vec = np.asarray(vec)

        vecXIvec[0] = -vec[2]*temp[1]   +  vec[1]*temp[2] + -vec[2+3]*temp[1+3] +  vec[1+3]*temp[2+3]
        vecXIvec[1] =  vec[2]*temp[0]   + -vec[0]*temp[2] +  vec[2+3]*temp[0+3] + -vec[0+3]*temp[2+3]
        vecXIvec[2] = -vec[1]*temp[0]   +  vec[0]*temp[1] + -vec[1+3]*temp[0+3] + vec[0+3]*temp[1+3]
        vecXIvec[3] = -vec[2]*temp[1+3] +  vec[1]*temp[2+3]
        vecXIvec[4] =  vec[2]*temp[0+3] + -vec[0]*temp[2+3]
        vecXIvec[5] = -vec[1]*temp[0+3] +  vec[0]*temp[1+3]
        res = vecXIvec

In [26]:
def vxIv_pytorch(vec, Imat, res, batch_size):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    vec = torch.from_numpy(vec).to(device)
    Imat = torch.from_numpy(Imat).to(device)
    res = torch.from_numpy(res).to(device) if isinstance(res, np.ndarray) else res

    temp = torch.sum(Imat * vec[:, None, :], dim=1)

    temp = temp.view(-1)

    vecXIvec = torch.zeros((6, batch_size), dtype=torch.float64, device=device)

    vecXIvec[0] = -vec[2]*temp[1]   +  vec[1]*temp[2] + -vec[2+3]*temp[1+3] +  vec[1+3]*temp[2+3]
    vecXIvec[1] =  vec[2]*temp[0]   + -vec[0]*temp[2] +  vec[2+3]*temp[0+3] + -vec[0+3]*temp[2+3]
    vecXIvec[2] = -vec[1]*temp[0]   +  vec[0]*temp[1] + -vec[1+3]*temp[0+3] +  vec[0+3]*temp[1+3]
    vecXIvec[3] = -vec[2]*temp[1+3] +  vec[1]*temp[2+3]
    vecXIvec[4] =  vec[2]*temp[0+3] + -vec[0]*temp[2+3]
    vecXIvec[5] = -vec[1]*temp[0+3] +  vec[0]*temp[1+3]
    res = vecXIvec

    return res.cpu().numpy()

**vxIv Batch Testing, Comparing to CPU and GPU with CuPy**

In [27]:
# COMPARING to batched
batch_size = 100


h_vec_batched = np.ones((6, batch_size),  dtype=np.float64)
h_I_batched = np.ones((6, 6, batch_size),  dtype=np.float64)
h_output_batched = np.zeros((6, batch_size), dtype=np.float64)
h_mxS_output_batched = np.zeros((6, batch_size), dtype=np.float64)

alpha = 0.1

#CPU
vxIv_numpy(h_vec_batched, h_I_batched, h_output_batched, batch_size) #warm-up once
print("vxIV shape: ", h_output_batched.shape)
#testing in loop of 100
startnext = timer()
for i in range(100):
 vxIv_numpy(h_vec_batched, h_I_batched, h_output_batched, batch_size)
print("CPU time: " + str(timer() - startnext))


#GPU
vxIv_pytorch(h_vec_batched, h_I_batched, h_output_batched, batch_size) #warm-up once
print("vxIV shape: ", h_output_batched.shape)
#testing in loop of 100
startnext = timer()
for i in range(100):
 vxIv_pytorch(h_vec_batched, h_I_batched, h_output_batched, batch_size)
print("GPU time: " + str(timer() - startnext))

vxIV shape:  (6, 100)
CPU time: 0.007457088999899497
vxIV shape:  (6, 100)
GPU time: 0.07697395200000301


# RNEA: putting everything together

## Forward Pass

### numpy

In [29]:
def rnea_fpass_numpy(num_joints, parent_id_arr, xmat_func_arr, S_arr, Imat_arr, crOp_output, mxS_output, vxIv_output, batch_size, q, qd, qdd = None, GRAVITY = -9.81):
        """
        Forward Pass for RNEA algorithm. Computes the velocity and acceleration of each body in the tree necessary to produce a certain trajectory

        OUTPUT:
        v : input qd is specifying value within configuration space with assumption of one degree of freedom.
        Output velocity is in general body coordinates and specifies motion in full 6 degrees of freedom
        """

        n = num_joints

        v = np.zeros((6,n, batch_size))
        a = np.zeros((6,n, batch_size))
        f = np.zeros((6,n, batch_size))

        gravity_vec = np.zeros((6, batch_size))
        gravity_vec[5] = -GRAVITY


        for ind in range(n):
            parent_ind = parent_id_arr[ind]


            Xmat = xmat_func_arr[ind, :, :, :]
            S = S_arr[ind]

            if parent_ind == -1:
                a[:, ind, :] = np.sum(Xmat*gravity_vec[:, np.newaxis, :], axis=1)
            else:
                v[:, ind, :] = np.sum(Xmat*v[:, ind, :], axis=1)
                a[:, ind, :] = np.sum(Xmat*a[:, ind, :], axis=1)

            v[:,ind, :] += S*qd[ind]

            mxS_numpy(S,v[:,ind, :], crOp_output, mxS_output, qd[ind])
            a[:, ind, :] += mxS_output
            if qdd is not None:
                a[:,ind, :] += S*qdd[ind]

            Imat = Imat_arr[ind, :, :]

            temp = np.sum(Imat*a[:, ind, :], axis=1)
            vxIv_numpy(v[:,ind, :],Imat, vxIv_output, batch_size)
            f[:, ind, :] = temp + vxIv_output

        return (v,a,f)

In [44]:
def rnea_fpass_pytorch(num_joints, parent_id_arr, xmat_func_arr, S_arr, Imat_arr, crOp_output, mxS_output, vxIv_output, batch_size, q, qd, qdd=None, GRAVITY=-9.81):
    """
    Forward Pass for RNEA algorithm. Computes the velocity and acceleration of each body in the tree necessary to produce a certain trajector

    OUTPUT:
    v : input qd is specifying value within configuration space with assumption of one degree of freedom.
    Output velocity is in general body coordinates and specifies motion in full 6 degrees of freedom
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    n = num_joints

    v = torch.zeros((6, n, batch_size), device=device)
    a = torch.zeros((6, n, batch_size), device=device)
    f = torch.zeros((6, n, batch_size), device=device)

    gravity_vec = torch.zeros((6, batch_size), device=device)
    gravity_vec[5, :] = GRAVITY

    parent_id_arr = torch.from_numpy(parent_id_arr).to(device)
    xmat_func_arr = torch.from_numpy(xmat_func_arr).to(device)
    S_arr = torch.from_numpy(S_arr).to(device)
    Imat_arr = torch.from_numpy(Imat_arr).to(device)
    q = torch.from_numpy(q).to(device)
    qd = torch.from_numpy(qd).to(device)
    if qdd is not None:
        qdd = torch.from_numpy(qdd).to(device)


    for ind in range(n):
        parent_ind = parent_id_arr[ind]

        Xmat = xmat_func_arr[ind, :, :, :]
        S = S_arr[ind]

        if parent_ind == -1:
            a[:, ind, :] = torch.sum(Xmat * gravity_vec[:, None, :], dim=1)
        else:
            v[:, ind, :] = torch.sum(Xmat * v[:, parent_ind, :], dim=1)
            a[:, ind, :] = torch.sum(Xmat * a[:, parent_ind, :], dim=1)

        v[:,ind, :] += S*qd[ind]

        mxS_numpy(S,v[:,ind, :], crOp_output, mxS_output, qd[ind])
        a[:, ind, :] += mxS_output
        if qdd is not None:
            a[:,ind, :] += S*qdd[ind]

        Imat = Imat_arr[ind, :, :]

        temp = np.sum(Imat*a[:, ind, :], axis=1)
        vxIv_numpy(v[:,ind, :],Imat, vxIv_output, batch_size)
        f[:, ind, :] = temp + vxIv_output

        return (v,a,f)

### Testing Forward Pass

In [31]:
batch_size = 10000


h_xmat_func_arr_batched = np.repeat(xmat_func_arr[:, :, :, np.newaxis], batch_size, axis=3)
h_S_arr_batched = np.repeat(S_arr[:, :, np.newaxis], batch_size, axis=2)

h_Imat_arr_batched = np.repeat(Imat_arr[:, :, :, np.newaxis], batch_size, axis=3)
h_q_batched = np.repeat(q[:, np.newaxis], batch_size, axis=1)
h_qd_batched = np.repeat(qd[:, np.newaxis], batch_size, axis=1)

h_crOp_output_batched = np.zeros((6, 6, batch_size), dtype=np.float64)
h_mxS_output_batched = np.zeros((6, batch_size), dtype=np.float64)
h_vxIv_output_batched = np.zeros((6, batch_size), dtype=np.float64)




In [45]:
itr = 100

v,a,f = rnea_fpass_numpy(n, parent_id_arr, h_xmat_func_arr_batched, h_S_arr_batched,
                         h_Imat_arr_batched, h_crOp_output_batched,
                         h_mxS_output_batched, h_vxIv_output_batched, batch_size,
                         h_q_batched, h_qd_batched, qdd = None, GRAVITY = -9.81)

startnext = timer()
for i in range(itr):
  rnea_fpass_numpy(n, parent_id_arr, h_xmat_func_arr_batched, h_S_arr_batched,
                         h_Imat_arr_batched, h_crOp_output_batched,
                         h_mxS_output_batched, h_vxIv_output_batched, batch_size,
                         h_q_batched, h_qd_batched, qdd = None, GRAVITY = -9.81)
print("CPU batched fpass with numpy: " + str((timer() - startnext)))

v,a,f = rnea_fpass_pytorch(n, parent_id_arr, h_xmat_func_arr_batched, h_S_arr_batched,
                         h_Imat_arr_batched, h_crOp_output_batched,
                         h_mxS_output_batched, h_vxIv_output_batched, batch_size,
                         h_q_batched, h_qd_batched, qdd = None, GRAVITY = -9.81)

startnext = timer()
for i in range(itr):
  rnea_fpass_pytorch(n, parent_id_arr, h_xmat_func_arr_batched, h_S_arr_batched,
                         h_Imat_arr_batched, h_crOp_output_batched,
                         h_mxS_output_batched, h_vxIv_output_batched, batch_size,
                         h_q_batched, h_qd_batched, qdd = None, GRAVITY = -9.81)
print("GPU batched fpass with numpy: " + str((timer() - startnext)))

CPU batched fpass with numpy: 3.217702969999891


TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

### Numpy



In [46]:
def rnea_bpass_numpy(S_arr, parent_id_arr, xmat_func_arr, q, qd, f, USE_VELOCITY_DAMPING = False):
        # allocate memory
        n = len(q)
        c = np.zeros((n, batch_size))

        for ind in range(n-1,-1,-1):
            S = S_arr[ind]

            c[ind, :] = np.sum(S*f[:, ind, :], axis=0)

            parent_ind = parent_id_arr[ind]
            if parent_ind != -1:
                Xmat = xmat_func_arr[ind, :, :]
                temp = np.sum(Xmat*f[:, ind, np.newaxis, :], axis=1)
                f[:,parent_ind, :] = f[:,parent_ind, :] + temp



        return (c,f)

In [52]:
def rnea_bpass_pytorch(S_arr, parent_id_arr, xmat_func_arr, q, qd, f, USE_VELOCITY_DAMPING = False):
    S_arr = [torch.tensor(S, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu') for S in S_arr]
    parent_id_arr = torch.tensor(parent_id_arr, dtype=torch.int64).to('cuda' if torch.cuda.is_available() else 'cpu')
    xmat_func_arr = torch.tensor(xmat_func_arr, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')
    q = torch.tensor(q, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')
    qd = torch.tensor(qd, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')
    f = torch.tensor(f, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')

    n = len(q)
    c = torch.zeros((n, batch_size), device=f.device)

    for ind in range(n-1, -1, -1):
        S = S_arr[ind]
        c[ind, :] = torch.sum(S * f[:, ind, :], dim=0)

        parent_ind = parent_id_arr[ind]
        if parent_ind != -1:
            Xmat = xmat_func_arr[ind, :, :]
            temp = torch.sum(Xmat * f[:, ind, :].unsqueeze(2), dim=1)

            # Ensure temp is correctly shaped for addition
            temp = temp.transpose(0, 1)  # Swap the dimensions if necessary
            f[:, parent_ind, :] += temp

    return (c, f)


### Testing Backward Pass

In [53]:
itr = 100
#for reference, batch_size is 10000 above

#first run Numpy fpass
v,a,f = rnea_fpass_numpy(n, parent_id_arr, h_xmat_func_arr_batched, h_S_arr_batched,
                         h_Imat_arr_batched, h_crOp_output_batched,
                         h_mxS_output_batched, h_vxIv_output_batched, batch_size,
                         h_q_batched, h_qd_batched, qdd = None, GRAVITY = -9.81)

#wardm-up once
c, f = rnea_bpass_numpy(h_S_arr_batched, parent_id_arr, h_xmat_func_arr_batched, h_q_batched, h_qd_batched, f, USE_VELOCITY_DAMPING = False)

startnext = timer()
for i in range(itr):
  rnea_bpass_numpy(h_S_arr_batched, parent_id_arr, h_xmat_func_arr_batched, h_q_batched, h_qd_batched, f, USE_VELOCITY_DAMPING = False)
print("CPU Batched bpass with numpy: " + str((timer() - startnext)))


c, f = rnea_bpass_pytorch(h_S_arr_batched, parent_id_arr, h_xmat_func_arr_batched, h_q_batched, h_qd_batched, f, USE_VELOCITY_DAMPING = False)

startnext = timer()
for i in range(itr):
  rnea_bpass_pytorch(h_S_arr_batched, parent_id_arr, h_xmat_func_arr_batched, h_q_batched, h_qd_batched, f, USE_VELOCITY_DAMPING = False)
print("GPU Batched bpass with numpy: " + str((timer() - startnext)))



CPU Batched bpass with numpy: 0.4896213300000909


RuntimeError: The size of tensor a (6) must match the size of tensor b (10000) at non-singleton dimension 1

## Full RNEA (numpy)

In [None]:
def rnea_numpy(q, qd, qdd = None, GRAVITY = -9.81, USE_VELOCITY_DAMPING = False):

      # first do the forward pass
      v,a,f = rnea_fpass_numpy(n, parent_id_arr, h_xmat_func_arr_batched, h_S_arr_batched,
                         h_Imat_arr_batched, h_crOp_output_batched,
                         h_mxS_output_batched, h_vxIv_output_batched, batch_size,
                         h_q_batched, h_qd_batched, qdd = None, GRAVITY = -9.81)

      # then do the backward pass
      (c,f) = rnea_bpass_numpy(h_S_arr_batched, parent_id_arr,h_xmat_func_arr_batched,
                               h_q_batched, h_qd_batched,
                               f, USE_VELOCITY_DAMPING = False)
      return (c,v,a,f)

# Testing Full RNEA: Numpy

In [None]:
#startnext = timer()
for i in range(itr):
    print(rnea_numpy(q, qd, qdd = None, GRAVITY = -9.81, USE_VELOCITY_DAMPING = False))
#print("CPU Batched RNEA: " + str((timer() - startnext)))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

       [[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        ...,
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]],

       [[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        ...,
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]],

       [[9.81, 9.81, 9.81, ..., 9.81, 9.81, 9.81],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        ...,
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
        [0.  , 0.  , 0. 