#### 1. Multi-QPU (nvidia-mqpu)

The `nvidia-mqpu` target is useful for distributing separate quantum circuits to individual GPUs on a single host machine. 

![img](./circuit-mqpu.png)

#### Example with the `sample` algorithmic primitive

#### Quantum Restricted Boltzmann Machine (Q-RBM)

![img](./RBM.png)

![img](./gibbs-dist.png)



In [1]:
import cudaq

# Select the multi-QPU target; this must happen before the target is queried.
cudaq.set_target("nvidia-mqpu")

# Ask the active target how many QPUs it exposes. Asynchronous jobs below are
# addressed by a QPU index, so this count bounds the valid indices.
target = cudaq.get_target()
qpu_count = target.num_qpus()
print("Number of QPUs:", qpu_count)

# Quantum Restricted Boltzmann Machine (Q-RBM) circuit.
#
# Arguments:
#   v_nodes  -- number of qubits in the first node group (indices
#               0 .. v_nodes-1); presumably the visible layer of the RBM.
#   h_nodes  -- number of qubits in the second node group (indices
#               v_nodes .. v_nodes+h_nodes-1); presumably the hidden layer.
#   ancilla  -- number of extra qubits allocated after the node qubits. The
#               coupling loop uses one per (v, h) pair, so it must be at
#               least v_nodes * h_nodes.
#   theta    -- RY angles for the node qubits; v_nodes + h_nodes entries
#               are read.
#   coupling -- angles for the controlled rotations; two entries are read
#               per (v, h) pair, i.e. 2 * v_nodes * h_nodes in total.
#
# The kernel ends by measuring every allocated qubit.
@cudaq.kernel
def qrbm(v_nodes:int, h_nodes:int, ancilla:int, theta: list[float], coupling: list[float]):

    # One register holds the node qubits followed by the ancilla qubits.
    qubits_num=v_nodes+h_nodes+ancilla
    qubits=cudaq.qvector(qubits_num)

    # Encode the node parameters
    for i in range(v_nodes+h_nodes):
        ry(theta[i],qubits[i])

    # Encode the coupling between nodes
    # a_target walks over the ancilla qubits (one per pair); count walks over
    # the coupling list (two angles per pair).
    a_target=v_nodes+h_nodes
    count=0
    for v in range(v_nodes):
        for h in range(v_nodes,v_nodes+h_nodes):
            # The X gates below flip the control qubits between the doubly
            # controlled rotations so that each of the four (v, h) basis
            # states triggers exactly one rotation of the ancilla.

            # Fires for v=1, h=1.
            ry.ctrl(coupling[count],qubits[v],qubits[h],qubits[a_target])
            # Flip v: the next rotation fires for v=0, h=1.
            x(qubits[v])
            ry.ctrl(coupling[count+1],qubits[v],qubits[h],qubits[a_target])
            # Restore v and flip h: the next rotation fires for v=1, h=0.
            x(qubits[v])
            x(qubits[h])
            ry.ctrl(coupling[count+1],qubits[v],qubits[h],qubits[a_target])
            # Flip v again: the next rotation fires for v=0, h=0.
            x(qubits[v])
            ry.ctrl(coupling[count],qubits[v],qubits[h],qubits[a_target])
            # Undo the remaining flips so v and h are back in their original
            # orientation before the next pair is processed.
            x(qubits[v])
            x(qubits[h])

            # Advance to the next pair's angles and ancilla.
            count+=2
            a_target+=1

    mz(qubits)
    
# Problem size: two qubits per node group and one ancilla for each of the
# v_nodes * h_nodes couplings.
v_nodes = 2
h_nodes = 2
ancilla = 4

# Initialize the parameters for the RBM
# theta holds one angle per node qubit; coupling holds two angles per
# (v, h) pair.
theta = [2.0482, 1.4329, 2.1774, 2.7122]
coupling = [1.8256, 3.1415, 1.8257, 3.1415, 3.1415, 0.4152, 3.1415, 0.9654]

# Launch up to three independent sampling jobs, one per QPU. The cap at
# qpu_count keeps every qpu_id valid on hosts that expose fewer than three
# QPUs, where a hard-coded range(3) would request a QPU that does not exist.
job_count = min(3, qpu_count)

# sample_async returns immediately with a future, so the jobs can run
# concurrently on their respective QPUs.
count_futures = [
    cudaq.sample_async(qrbm, v_nodes, h_nodes, ancilla, theta, coupling,
                       shots_count=10000, qpu_id=qpu)
    for qpu in range(job_count)
]

# get() waits for the corresponding job and yields its measurement counts.
for counts in count_futures:
    print(counts.get())

    

Number of QPUs: 5
{ 10110111:9 00010111:26 11100111:8 00000111:3 10011111:106 10111011:3 11011111:13 10011011:62 01011101:150 11110111:597 10111100:913 10111010:20 10110001:95 11101110:55 11101111:15 10110011:5 01111111:906 00111110:44 10110110:29 11111111:935 01101110:23 11001110:2 00111111:10 00000011:4 00011110:157 01000000:1 01101010:18 01001001:2 00010110:91 11011011:11 11111011:526 01010101:76 10100111:2 01101111:6 11110011:321 11011101:374 00101001:25 10101111:3 11100110:25 01000100:1 10111001:139 11011001:202 01001000:2 00001111:5 10100101:48 10110000:329 01011111:2 10111111:13 01000001:1 10111101:242 10001111:45 10111000:561 10011010:247 01101011:8 10011110:412 00011111:34 10110010:17 10110100:541 01001100:3 10111110:40 11001100:30 11001101:8 00101011:1 00111100:869 00111101:260 10110101:141 00101111:1 00101101:39 01010111:4 10101101:84 }

{ 10110111:3 00010111:26 11100111:4 00000111:2 10011111:128 10111011:3 11011111:16 10011011:68 01011101:175 11110111:565 00001011:3 1101101

#### Example with the `observe` algorithmic primitive

In [2]:
import cudaq
from cudaq import spin
import numpy as np

# NOTE(review): this seeds NumPy's legacy global random state, but the
# parameters below are drawn from a separate generator created with
# np.random.default_rng(13), so this call does not influence them.
np.random.seed(1)

# Select the multi-QPU target and find out how many QPUs it exposes.
cudaq.set_target("nvidia-mqpu")
target = cudaq.get_target()
qpu_count = target.num_qpus()
print("Number of QPUs:", qpu_count)

# Width of the circuit and number of parameter sets to evaluate.
qubit_count = 10
sample_count = 500

# Observable whose expectation value is measured: Pauli-Z on qubit 0.
ham = spin.z(0)

# The kernel applies one rotation per qubit, so each parameter set needs one
# angle per qubit.
parameter_count = qubit_count

# Below we run a circuit for 500 different input parameters.
# Each row of `parameters` is one parameter set drawn uniformly from [0, 1).
parameters = np.random.default_rng(13).uniform(low=0,high=1,size=(sample_count,parameter_count))

print('Parameter shape: ', parameters.shape)

# Applies an RX rotation by theta[i] to qubit i of a freshly allocated
# register. The register size comes from the module-level `qubit_count`, so
# `theta` must provide at least that many angles. The kernel performs no
# measurement itself.
@cudaq.kernel
def kernel_rx(theta:list[float]):
    qubits = cudaq.qvector(qubit_count)

    for i in range(qubit_count):
        rx(theta[i], qubits[i])

# Multi-GPU

# We split our parameters into 4 equally sized batches. np.split requires the
# number of rows to be divisible by the number of batches.
xi = np.split(parameters,4)

print('We have', parameters.shape[0],
      'parameters which we would like to execute')

print('We split this into', len(xi), 'batches of', xi[0].shape[0], ',',
      xi[1].shape[0], ',', xi[2].shape[0], ',', xi[3].shape[0])

print('Shape after splitting', xi[0].shape)
asyncresults = []


for batch_index, batch in enumerate(xi):
    # Map the batch index proportionally onto the QPUs that actually exist.
    # Because batch_index < len(xi), the result is always < qpu_count, so the
    # ID stays valid no matter how many QPUs the host exposes.
    qpu_id = batch_index * qpu_count // len(xi)
    # Each row of the batch is one parameter set; submit it as its own
    # asynchronous observe job on the QPU chosen for this batch.
    for row in batch:
        asyncresults.append(
            cudaq.observe_async(kernel_rx, ham, row, qpu_id=qpu_id))

print('Energies from multi-GPUs')
# get() waits for the job to finish; results are printed in submission order.
for result in asyncresults:
    observe_result = result.get()
    got_expectation = observe_result.expectation()
    print(got_expectation)
    


Number of QPUs: 5
Parameter shape:  (500, 10)
We have 500 parameters which we would like to execute
We split this into 4 batches of 125 , 125 , 125 , 125
Shape after splitting (125, 10)
Energies from multi-GPUs
0.6487942264394656
0.9592965035114541
0.9693367711158675
0.9752157540898212
0.7901937174208915
0.5851651225539602
0.9135656129518316
0.9490337029063387
0.8509625392277542
0.7454635553957428
0.9305526875050824
0.959998285031342
0.8932189110754846
0.9155879814111777
0.9597179527710608
0.9935125945096381
0.8646527734148779
0.6439004804322485
0.5830942872137947
0.6834044937811433
0.9637192885208135
0.9989258683207686
0.8779315873648825
0.8767961513094958
0.6468637421465269
0.9239566569275575
0.7861546173171964
0.9656727214945597
0.9778974511307598
0.5407711059133851
0.8048163873900496
0.9950455390681953
0.8951334235851304
0.6489412249260302
0.9690827240813413
0.9999714740669639
0.9928629044231196
0.9969285794959803
0.7175125779993621
0.7025735612926575
0.5680724042480292
0.990386642

## Batch the Spin Hamiltonian terms:

![img](./ham-batch.png)

In [3]:
import cudaq
from cudaq import spin

import timeit

# Select the multi-QPU target for the timing comparison in this cell.
cudaq.set_target("nvidia-mqpu")

# Left disabled: MPI initialization goes with the commented-out multi-node
# variant further down in this cell.
#cudaq.mpi.initialize()

# Register width used by the kernel and number of terms requested for the
# random Hamiltonian.
qubit_count = 22
term_count = 100000

# State-preparation kernel for the Hamiltonian batching example: applies a
# Hadamard to qubit 0 and then a controlled-X from qubit 0 onto every other
# qubit. The register size comes from the module-level `qubit_count`; the
# kernel takes no arguments and performs no measurement.
@cudaq.kernel
def batch_ham():
    qubits=cudaq.qvector(qubit_count)
    h(qubits[0])
    for i in range(1, qubit_count):
        x.ctrl(qubits[0], qubits[i])

# Build a random Hamiltonian from the configured qubit and term counts.
hamiltonian = cudaq.SpinOperator.random(qubit_count, term_count)

# cudaq.observe evaluates the expectation value of the Hamiltonian with
# respect to the state prepared by the given kernel.

# --- Single node, single GPU ------------------------------------------------
start_time = timeit.default_timer()
single_gpu_observation = cudaq.observe(batch_ham, hamiltonian)
result = single_gpu_observation.expectation()
end_time = timeit.default_timer()
single_gpu_seconds = end_time - start_time
print('Elapsed time (s) for single-GPU: ', single_gpu_seconds)

# --- Single node, multi-GPU -------------------------------------------------
# With several GPUs/QPUs available, the only change needed to parallelize the
# workflow is the extra `execution` argument on the observe call.
start_time = timeit.default_timer()
multi_gpu_observation = cudaq.observe(
    batch_ham, hamiltonian, execution=cudaq.parallel.thread)
result = multi_gpu_observation.expectation()
end_time = timeit.default_timer()
multi_gpu_seconds = end_time - start_time
print('Elapsed time (s) for multi-GPU: ', multi_gpu_seconds)


# Multi-node, multi-GPU. (if included use mpirun -np n filename.py)
#result = cudaq.observe(batch_ham, hamiltonian, execution=cudaq.parallel.mpi).expectation()

#cudaq.mpi.finalize()

Elapsed time (s) for single-GPU:  2.3008233257569373
Elapsed time (s) for multi-GPU:  5.22126400610432


#### 2. Multi-GPU (nvidia-mgpu)

The `nvidia-mgpu` backend is useful for running a large single quantum circuit spread across multiple GPUs.
- An $n$-qubit quantum state has $2^n$ complex amplitudes, each of which requires 8 bytes of memory to store (two 32-bit floats). Hence the total memory required to store an $n$-qubit quantum state is $8$ bytes $\times 2^n$. For $n=30$ qubits, this is roughly $8.6$ GB, but for $n=40$ it grows exponentially to roughly $8800$ GB.

#### Example: GHZ

```python
# mpirun -np 4 python <fname> --target nvidia-mgpu

import cudaq

# Start CUDA-Q's MPI layer; the script is meant to be launched with mpirun
# and is paired with cudaq.mpi.finalize() at the end.
cudaq.mpi.initialize()

# Number of qubits passed to the kernel below.
qubit_count = 30

# GHZ-state kernel: Hadamard on the first qubit, then a chain of controlled-X
# gates from each qubit to its successor, followed by measurement of the whole
# register. `qubit_num` is the number of qubits to allocate.
@cudaq.kernel
def kernel(qubit_num: int):
    # Allocate our qubits.
    qvector = cudaq.qvector(qubit_num)
    # Place the first qubit in the superposition state.
    h(qvector[0])
    # Loop through the allocated qubits and apply controlled-X,
    # or CNOT, operations between them.
    for qubit in range(qubit_num - 1):
        x.ctrl(qvector[qubit], qvector[qubit + 1])
    # Measure the qubits.
    mz(qvector)

#print("Preparing GHZ state for", qubit_count, "qubits.")
# Sample the kernel with qubit_count qubits.
counts = cudaq.sample(kernel, qubit_count)

# Under mpirun every rank executes this script; print from rank 0 only so the
# result appears once.
if cudaq.mpi.rank() == 0:
    print(counts)

# Shut down the MPI layer started by cudaq.mpi.initialize().
cudaq.mpi.finalize()
```

In [None]:
!mpirun -np 4 python ghz.py --target nvidia-mgpu