In [13]:
# imports, setup, and load mapd, torch

import sys
import os.path
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')

# Working directory of the notebook. Kept as a one-element list so existing
# `PWD[0]` lookups keep working; obtained with os.getcwd() rather than the
# IPython-only `!pwd` shell escape, so the cell no longer depends on a shell.
PWD = [os.getcwd()]

# Add import path to MapD Thrift binding and Arrow schema
thirdparty_path = os.path.join(PWD[0], '..', 'thirdparty')
# Guarded so that re-running this cell does not pile up duplicate entries.
if thirdparty_path not in sys.path:
    sys.path.append(thirdparty_path)

# Also expose the parent directory so that `pygdf` can be imported from there.
pygdf_path = os.path.join(PWD[0], '..')
if pygdf_path not in sys.path:
    sys.path.append(pygdf_path)

import pygdf
from numba import cuda
from numba.cuda.cudadrv import drvapi
import numpy as np
from pygdf.gpuarrow import GpuArrowReader
from pygdf.dataframe import DataFrame

from thrift.protocol import TBinaryProtocol
from thrift.protocol import TJSONProtocol
from thrift.transport import TSocket
from thrift.transport import THttpClient
from thrift.transport import TTransport

from mapd import MapD
from mapd import ttypes

# MapD connection

def get_client(host_or_uri, port, http):
    """Build a MapD Thrift client and open its transport.

    Parameters
    ----------
    host_or_uri : hostname for the socket connection, or the URI handed to
        the HTTP client when ``http`` is truthy.
    port : port for the socket connection; not used when ``http`` is truthy.
    http : truthy selects the HTTP transport with the JSON protocol,
        falsy selects a buffered socket transport with the binary protocol.

    Returns
    -------
    The ``MapD.Client`` bound to the chosen protocol. The transport has
    already been opened when this function returns.
    """
    if not http:
        # Plain socket wrapped in a buffered transport, binary wire format.
        transport = TTransport.TBufferedTransport(
            TSocket.TSocket(host_or_uri, port))
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
    else:
        # HTTP transport, JSON wire format.
        transport = THttpClient.THttpClient(host_or_uri)
        protocol = TJSONProtocol.TJSONProtocol(transport)

    mapd_client = MapD.Client(protocol)
    transport.open()
    return mapd_client

# Connection settings. Each value can be overridden through an environment
# variable so credentials and endpoints do not have to be edited into the
# notebook; the fallbacks are the values this notebook has always used.
db_name = os.environ.get('MAPD_DB', 'mapd')
user_name = os.environ.get('MAPD_USER', 'mapd')
passwd = os.environ.get('MAPD_PASSWORD', 'HyperInteractive')
hostname = os.environ.get('MAPD_HOST', 'localhost')
portno = int(os.environ.get('MAPD_PORT', '9091'))

# Open a socket (non-HTTP) client, then log in; `session` is what the
# subsequent queries pass back to the server.
client = get_client(hostname, portno, False)
session = client.connect(user_name, passwd, db_name)
print('Connection complete')

import torch
from torch.autograd import Variable
import torch.nn as nn
import torchvision.transforms as transforms
import torch.utils.data as data_utils

Connection complete


In [14]:
# execute query and get gdf

# Column holding the value to predict.
LABEL_COLUMN = "income_bracket_numeric"

# Columns to select, in the order they appear in the SELECT list.
columns = ", ".join([
    "age",
    "workclass_numeric",
    "education_numeric",
    "education_num",
    "marital_status_numeric",
    "occupation_numeric",
    "relationship_numeric",
    "race_numeric",
    "gender_numeric",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country_numeric",
    "income_bracket_numeric",
])

# Categorical column name -> number of distinct values it takes.
cat_uniq_dict = {
    "education_numeric": 16,
    "gender_numeric": 2,
    "marital_status_numeric": 7,
    "native_country_numeric": 42,
    "occupation_numeric": 15,
    "race_numeric": 5,
    "relationship_numeric": 6,
    "workclass_numeric": 9,
}

# Source table and the window of rows to fetch.
mapd_table = "adult_data_numeric"
limit = 50
offset = 0

query = "SELECT {} FROM {} LIMIT {} OFFSET {};".format(
    columns, mapd_table, limit, offset
)
#print("Getting records {}-{} from {}...".format(offset, offset + limit, mapd_table))
# Run the query through the Thrift client, asking for a GPU dataframe result
# on device 0. NOTE(review): first_n=-1 presumably means "no row cap" - confirm
# against the MapD API.
results = client.sql_execute_gpudf(session, query, device_id=0, first_n=-1)

# Convert the results into a gpu df.
# Rebuild a CUDA IPC memory handle from the handle data returned by the server.
ipc_handle = drvapi.cu_ipc_mem_handle(*results.df_handle)
# Wrap it in numba's IpcHandle, sized by the byte count the server reported.
ipch = cuda.driver.IpcHandle(None, ipc_handle, size=results.df_size)
ctx = cuda.current_context()
# Open the handle in the current context; `ipch` is closed in the last cell.
dptr = ipch.open(ctx)
# View the opened memory as a flat array of bytes (one byte per element).
dtype = np.dtype(np.byte)
darr = cuda.devicearray.DeviceNDArray(shape=dptr.size, strides=dtype.itemsize, dtype=dtype, gpu_data=dptr)
# Hand the byte buffer to the Arrow reader and build the dataframe from the
# (name, column) pairs it yields.
reader = GpuArrowReader(darr)
df = DataFrame(reader.to_dict().items())

In [15]:
# Show the dataframe type, turn the dataframe into a single GPU matrix `m`,
# then show the matrix shape and type.
print(type(df))
m = df.as_gpu_matrix()
print(m.shape, type(m), sep="\n")

<class 'pygdf.dataframe.DataFrame'>
(50, 14)
<class 'numba.cuda.cudadrv.devicearray.DeviceNDArray'>


In [16]:
# create a torch tensor of shape m and get the pointer to it
from numba import cuda
import numpy
import math
import ctypes

# simple cuda copy kernel
@cuda.jit
def test(in_array, out_ptr):
    """Copy ``in_array`` into ``out_ptr`` element by element.

    Each thread handles the single (row, col) position given by its 2D grid
    index; threads whose index falls outside ``in_array`` do nothing.
    """
    row, col = cuda.grid(2)
    n_rows = in_array.shape[0]
    n_cols = in_array.shape[1]
    if row < n_rows and col < n_cols:
        out_ptr[row, col] = in_array[row, col]
    
# setting up grid and block sizes
# The block counts are rounded up so the grid covers the whole matrix; the
# kernel bounds-checks the threads that fall past the edge.
threadsperblock = (16, 16)
blockspergrid_x = math.ceil(m.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(m.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

# Allocate a float32 CUDA tensor with the same shape as `m`. Unpacking the
# shape avoids hardcoding the number of dimensions in the allocation
# (the copy kernel and launch grid are still 2D-only).
t = torch.cuda.FloatTensor(*m.shape)
print(t.size())
ptr = t.data_ptr()
ctx = cuda.current_context()
# memory pointer object for tensor data
memptr = cuda.cudadrv.driver.MemoryPointer(ctx, ctypes.c_uint64(ptr), t.numel() * t.element_size())
# ndarray representation of our tensor, sharing the tensor's device memory.
# torch strides are scaled by the tensor's element size instead of a literal 4
# so the byte strides stay in step with the tensor's element type.
arr = cuda.devicearray.DeviceNDArray(shape=m.shape, strides=tuple(t.element_size() * s for s in t.stride()), dtype=np.dtype(np.float32), gpu_data=memptr)
# launch cuda kernel: copies `m` into the tensor's memory through `arr`
test[blockspergrid, threadsperblock](m, arr)

# check output values
# Compare on the host with one bulk copy per side instead of reading the
# CUDA tensor one element at a time from Python.
m_host = m.copy_to_host()
assert np.array_equal(m_host, t.cpu().numpy())

torch.Size([50, 14])


In [None]:
# Replace each categorical column by its one-hot expansion.
# NOTE: `df` is rebound and modified in this cell and the IPC handle is closed
# at the end, so the query cell has to be re-run before running this cell again.
for cat_col, n_levels in cat_uniq_dict.items():
    levels = np.arange(n_levels)
    df = df.one_hot_encoding(cat_col, prefix=cat_col, cats=levels, dtype='int')
    del df[cat_col]

# Labels: pull the label column out as an array and wrap it in a tensor.
labels = torch.from_numpy(df[LABEL_COLUMN].to_array())

# Features: everything that is left once the label column is dropped.
df.drop_column(LABEL_COLUMN)
features = torch.from_numpy(df.as_matrix())

# Release the IPC handle opened when the query result was mapped.
ipch.close()