In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import matplotlib.pyplot as plt
import numpy as np
import cupy as cp
from my_cuda import calculateKDEs_cuda, calculateDist_cuda

import time

In [7]:
# datanum1 = 2000
# datanum2 = 10000
datanum1 = 2000
datanum2 = 100000
# datanum1 = 2000
# datanum2 = 1000000
dim = 10
# dim = 50
mean1 = np.concatenate([np.array([-1]), np.zeros((dim-1,))])
Cov1 = np.diag(np.concatenate([np.array([2.]), np.ones((dim-1,))]))
mean2 = np.concatenate([np.array([1]), np.zeros((dim-1,))])
Cov2 = np.diag(np.concatenate([np.array([1.]), np.ones((dim-1,))]))

L = cp.linalg.cholesky(cp.array(Cov1))
data1_cp = cp.matmul(cp.random.randn(datanum1, dim), L.T) + cp.array(mean1)  # data in gpu
data1 = data1_cp.get()   # data in cpu
L = cp.linalg.cholesky(cp.array(Cov2))
data2_cp = cp.matmul(cp.random.randn(datanum2, dim), L.T) + cp.array(mean2)   # data in gpu
data2 = data2_cp.get()   # data in cpu


### Kernel Density Estimation

In [8]:
# pyCUDA

h_bandwidth = 1.0
s = time.time()
K = calculateKDEs_cuda(data1, data2, h_bandwidth)
print('K.shape:', K.shape)
e = time.time()
p_kde = np.mean(K, axis=1)
print('densities:', p_kde)
print('Kernel calculation time:', e - s)

K.shape: (2000, 100000)
densities: [9.7342354e-07 2.7968634e-08 5.3161664e-08 ... 5.8065456e-08 2.0244570e-07
 3.0563712e-09]
Kernel calculation time: 1.5837373733520508


In [9]:
# CPU
def getKernels(data1, data2, h_bandwidth):
    datanum1, dim = data1.shape
    datanum2 = len(data2)
    lnps = (np.matmul(data1, data2.T) - \
        0.5*np.array([np.diag(np.matmul(data1, data1.T))]).T - \
        0.5*np.array([np.diag(np.matmul(data2, data2.T))]))/h_bandwidth - \
        0.5*dim*np.log(2*np.pi) - 0.5*dim*np.log(h_bandwidth)
    return np.exp(lnps)

h_bandwidth = 1.0
s = time.time()
K = getKernels(data1, data2, h_bandwidth)
print('K.shape:', K.shape)
e = time.time()
p_kde = np.mean(K, axis=1)
print('densities:', p_kde)
print('Kernel calculation time:', e - s)

K.shape: (2000, 100000)
densities: [9.73423679e-07 2.79686399e-08 5.31616635e-08 ... 5.80654621e-08
 2.02445697e-07 3.05637162e-09]
Kernel calculation time: 330.522901058197


In [10]:
# CuPy
def getCuPyKernels(data1_cp, data2_cp, h_bandwidth):
    datanum1, dim = data1_cp.shape
    datanum2 = len(data2_cp)
    lnps = (cp.matmul(data1_cp, data2_cp.T) - \
        0.5*cp.array([cp.diag(cp.matmul(data1_cp, data1_cp.T))]).T - \
        0.5*cp.array([cp.diag(cp.matmul(data2_cp, data2_cp.T))]))/h_bandwidth - \
        0.5*dim*cp.log(2*cp.pi) - 0.5*dim*cp.log(h_bandwidth)
    return cp.exp(lnps)

h_bandwidth = 1.0
s = time.time()
## generated data are still in gpu memory
K = getCuPyKernels(data1_cp, data2_cp, h_bandwidth)
# ============================================
# ## transfer data intp gpu memory first
# data1_cp2 = cp.array(data1)
# data2_cp2 = cp.array(data2)
# K = getCuPyKernels(data1_cp2, data2_cp2, h_bandwidth)
# ============================================
e = time.time()
p_kde = cp.mean(K, axis=1)
print('densities:', p_kde)
print('Kernel calculation time:', e - s)

OutOfMemoryError: Out of memory allocating 80,000,000,000 bytes (allocated so far: 4,353,924,096 bytes).

In [6]:
!nvidia-smi

Fri May 13 06:27:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-DGXS...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   43C    P0    51W / 300W |  18507MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-DGXS...  On   | 00000000:08:00.0 Off |                    0 |
| N/A   41C    P0    40W / 300W |      4MiB / 32768MiB |      0%      Default |
|       

### Distance Calculation

In [11]:
s = time.time()
DistSqMat = calculateDist_cuda(data1, data2)
e = time.time()

print(DistSqMat)
print(e - s)

[[27.329659  10.323825  27.219032  ...  4.9197016 19.157457   7.5071216]
 [18.909302  21.776083  29.575815  ... 33.115864  35.68752   41.647614 ]
 [29.663132  17.583088  31.917288  ... 30.42972    9.37299   27.697157 ]
 ...
 [30.94524   18.845654  45.036934  ... 31.628178  28.01748   36.294556 ]
 [31.645567  13.740578  33.56538   ... 19.403912  23.37553   28.95898  ]
 [39.54281   36.505802  57.928974  ... 30.48542   40.183876  47.062836 ]]
1.2581040859222412


In [12]:
# CuPy
def getCuPyDist(data1_cp, data2_cp):
    distSqs = -2.0*cp.matmul(data1_cp, data2_cp.T) + \
        cp.array([cp.diag(cp.matmul(data1_cp, data1_cp.T))]).T + \
        cp.array([cp.diag(cp.matmul(data2_cp, data2_cp.T))])
    return distSqs

s = time.time()
DistSqs = getCuPyDist(data1_cp, data2_cp)
e = time.time()
print(DistSqMat)
print(e - s)

OutOfMemoryError: Out of memory allocating 80,000,000,000 bytes (allocated so far: 3,232,801,280 bytes).