In [None]:
# Notebook setup: plotting namespace, core libraries, and a snapshot of the
# GPU / Python environment the benchmarks below run in.
# NOTE(review): %pylab pulls numpy/matplotlib names into the global namespace;
# later cells re-import the modules they need explicitly.
%pylab inline
import pandas as pd
import numpy as np
# Show the GPUs reported by the NVIDIA driver before any benchmark runs.
!nvidia-smi
import sys
# Print the module search path of the running kernel.
print(sys.path)
import py3nvml ## pip install -e git+https://github.com/fbcotter/py3nvml#egg=py3nvml
import subprocess

In [None]:
import h2o4gpu as h2o4gpu
# Count the GPUs by counting the lines printed by `nvidia-smi -L`.
# The pipeline runs through the shell, so the output is the `wc -l` result.
gpu_line_count = subprocess.check_output("nvidia-smi -L | wc -l", shell=True)
maxNGPUS = int(gpu_line_count)
print("\nNumber of GPUS:", maxNGPUS)

In [None]:
# Record the CPU configuration of the host the benchmarks run on.
!lscpu

In [None]:
from os.path import expanduser

# Locations of the Homesite train/test CSV files, resolved relative to the
# current user's home directory.
home = expanduser("~")
train_file = "".join([home, "/h2o4gpu-benchmarks/Data/Homesite/train.csv"])
test_file = "".join([home, "/h2o4gpu-benchmarks/Data/Homesite/test.csv"])

In [None]:
# Load both CSV files into pandas DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

In [None]:
# Remove the QuoteConversion_Flag column from the training frame before it is
# stacked with the test frame.
# Reassigning instead of dropping in place, together with errors='ignore',
# keeps this cell idempotent: running it a second time is a no-op rather than
# a KeyError on the already-removed column.
train = train.drop(['QuoteConversion_Flag'], axis=1, errors='ignore')

In [None]:
# Frequency-encode the categorical columns of train + test.
#
# Steps:
#   1. Stack train and test so both receive exactly the same encoding.
#   2. Treat every object-dtype column as categorical.
#   3. Fill missing values: a dedicated "__NA__" level for categoricals and 0
#      for every other column.
#   4. Pool categorical levels seen fewer than MIN_CATEGORY_COUNT times into
#      a single "___UNFREQ___" level.
#   5. Replace each level by its relative frequency (count / number of rows).
#   6. Split the encoded frame back into its train and test parts.
#
# The code deliberately avoids `value_counts().reset_index()` (its column
# names differ between pandas versions) and avoids writing through `.values`
# (which may be a read-only view).

# Levels observed fewer times than this are pooled together.
MIN_CATEGORY_COUNT = 100

dataset = pd.concat([train, test], ignore_index=True)

# Decide which columns are categorical *before* filling, so that numeric
# columns which merely contain NaN are not mistaken for categoricals.
categoricals = [
    col for col, dtype in zip(dataset.columns, dataset.dtypes) if dtype == "object"
]
categorical_set = set(categoricals)

# Missing values: new "__NA__" category for categoricals, 0 elsewhere.
for col in dataset.columns:
    fill_value = "__NA__" if col in categorical_set else 0
    dataset[col] = dataset[col].fillna(fill_value)

n_rows = float(dataset.shape[0])
for col in categoricals:
    # Pool infrequent levels. `where` keeps values whose mask is True and
    # replaces the rest, returning a new Series (no in-place array writes).
    level_counts = dataset[col].value_counts()
    rare_levels = level_counts.index[(level_counts < MIN_CATEGORY_COUNT).values]
    pooled = dataset[col].where(~dataset[col].isin(rare_levels), "___UNFREQ___")

    # Encode every level as its share of all rows (after pooling).
    level_frequency = pooled.value_counts() / n_rows
    dataset[col] = pooled.map(level_frequency)

# The first len(train) rows came from `train`, the remainder from `test`.
n_train = train.shape[0]
trainenc = dataset.iloc[:n_train, :].reset_index(drop=True)
testenc = dataset.iloc[n_train:, :].reset_index(drop=True)

In [None]:
import random
import numpy as np

# Dense float32 matrices consumed by every benchmark below.
trainencflt = trainenc.values.astype(np.float32)
testencflt = testenc.values.astype(np.float32)

k = 1000  # number of clusters used throughout the notebook
rows = np.shape(trainencflt)[0]
print(rows)

# Seed BOTH generators: the labels below are drawn with the standard-library
# `random` module, so seeding numpy alone leaves them different on every run.
np.random.seed(1234)
random.seed(1234)

# Build one label per row in consecutive chunks of at most k rows. Each chunk
# is a sample without replacement from range(k), so no label repeats within a
# chunk. The final chunk covers the remaining rows % k rows (possibly zero).
full_chunks, remainder = divmod(rows, k)
chunk_sizes = [k] * full_chunks + [remainder]
labels = np.concatenate([
    # float64 matches the dtype produced by appending to an empty numpy array.
    np.asarray(random.sample(range(k), chunk_size), dtype=np.float64)
    for chunk_size in chunk_sizes
])
print(labels.shape)

## H2O4GPU K-Means (multi-GPU)

In [None]:
n_gpus = 1
model = h2o4gpu.KMeans(n_gpus=n_gpus, n_clusters=k, tol=1e-7, max_iter=1000)
%time model.fit(trainencflt, labels)
centroids = model.cluster_centers_
#%time train_centroid_distance = model.transform(trainencflt)
#%time train_labels     = model.predict(trainencflt)

## H2O4GPU K-Means (multi-GPU with restrictions)

In [None]:
device=1 # all gpus=0 and pick device=1,2,3...
model = h2o4gpu.KMeans(n_gpus=2, gpu_id=device, n_clusters=k, init="random", tol=1e-10, max_iter=1000, verbosity=1)
%time model.fit(trainencflt)
#%time train_centroid_distance2 = model.transform(trainencflt)
#%time train_labels = model.predict(trainencflt)

#%time test_centroid_distance  = model.transform(testencflt)
#%time test_labels     = model.predict(testencflt)

## 60s for 224 iters (stopped early) ==> 268s for 1000 iters
## 51s for 216 iters (stopped early) on mr-dl10
## 49s for 165 iters (stopped early) on physics-179.umd.edu

## H2O

In [None]:
import h2o

# Start (or connect to) an H2O instance and load the CSV files into H2O frames.
# Note: these frames are read straight from the raw CSV files, so the
# pandas-side steps above (dropping QuoteConversion_Flag, frequency encoding)
# are not applied to them.
h2o.init(strict_version_check=False)
h2otrain, h2otest = (h2o.import_file(csv_path) for csv_path in (train_file, test_file))

In [None]:
from h2o.estimators.kmeans import H2OKMeansEstimator
h2omodel = H2OKMeansEstimator(k=k, standardize=False, max_iterations=50, init="Random")
%time h2omodel.train(training_frame=h2otrain)

## 600 s for 50 iters -> 12000 secs for 1000 iters

In [None]:
#%time trainassignments=h2omodel.predict(h2otrain)
#hist(trainassignments.as_data_frame().values)

In [None]:
#%time testassignments=h2omodel.predict(h2otest)
#hist(testassignments.as_data_frame().values)

## Scikit-Learn

In [None]:
from sklearn.cluster import KMeans
model = KMeans(algorithm='full',n_clusters=k, init='random', n_init=1, n_jobs=-1, max_iter=50, tol=1e-8,verbose=1)
%time model.fit(trainencflt)
#train_assignments = model.predict(trainencflt)
#test_assignments = model.predict(testencflt)

## 58s for 50 iters -> 1160s for 1000 iters
## 111s for 50 iterations (physics-179.umd.edu with defualt pip installed scikit-learn)
## 123s for 50 iterations (physics-179.umd.edu with MKL in scikit-learn)

## TensorFlow

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
import time
from tensorflow.contrib.learn import KMeansClustering
from tensorflow.contrib.learn import SKCompat

In [None]:
def train_input_fn():
    """Return the TensorFlow input pair for K-Means training.

    Returns:
        A ``(features, labels)`` tuple where ``features`` is a constant tensor
        built from the global ``trainencflt`` matrix with its own
        (rows, columns) shape, and ``labels`` is ``None``.
    """
    n_rows, n_cols = trainencflt.shape[0], trainencflt.shape[1]
    features = tf.constant(trainencflt, shape=[n_rows, n_cols])
    return (features, None)

In [None]:
# TensorFlow (contrib.learn) K-Means with random initialisation, run for 100
# steps and timed with wall-clock time.
seed = 12345
# Use the shared cluster count `k` (defined with the training matrix) instead
# of a hard-coded 1000, so this benchmark stays comparable with the h2o4gpu,
# H2O and scikit-learn runs if `k` is ever changed.
model = KMeansClustering(num_clusters=k,
                         initial_clusters=KMeansClustering.RANDOM_INIT,
                         relative_tolerance=1e-8,
                         random_seed=seed)
t0 = time.time()
model.fit(input_fn=train_input_fn, steps=100)
t1 = time.time()
print("TF time taken: %r" % (t1 - t0))

# Recorded timing:
## 132s for 100 iters ==> 1320s for 1000 iters

### Note: TensorFlow detects the GPUs, but the GPUs aren't doing much compute
```
2017-06-18 17:01:26.394132: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties: 
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:01:00.0
Total memory: 11.90GiB
Free memory: 10.48GiB
2017-06-18 17:01:26.639874: W tensorflow/stream_executor/cuda/cuda_driver.cc:523] A non-primary context 0x19507e00 exists before initializing the StreamExecutor. We haven't verified StreamExecutor works with that.
2017-06-18 17:01:26.640842: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties: 
name: GeForce GTX 1080 Ti
major: 6 minor: 1 memoryClockRate (GHz) 1.582
pciBusID 0000:02:00.0
Total memory: 10.91GiB
Free memory: 10.75GiB
2017-06-18 17:01:26.899156: W tensorflow/stream_executor/cuda/cuda_driver.cc:523] A non-primary context 0x19503f60 exists before initializing the StreamExecutor. We haven't verified StreamExecutor works with that.
2017-06-18 17:01:26.900102: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 2 with properties: 
name: GeForce GTX 1080 Ti
major: 6 minor: 1 memoryClockRate (GHz) 1.582
pciBusID 0000:03:00.0
Total memory: 10.91GiB
Free memory: 10.75GiB
2017-06-18 17:01:26.902241: I tensorflow/core/common_runtime/gpu/gpu_device.cc:961] DMA: 0 1 2 
2017-06-18 17:01:26.902254: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0:   Y Y Y 
2017-06-18 17:01:26.902259: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 1:   Y Y Y 
2017-06-18 17:01:26.902263: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 2:   Y Y Y 

```

```
Sun Jun 18 17:12:59 2017       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 381.22                 Driver Version: 381.22                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  TITAN X (Pascal)    On   | 0000:01:00.0      On |                  N/A |
|100%   40C    P2    56W / 250W |  11694MiB / 12188MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 108...  On   | 0000:02:00.0     Off |                  N/A |
| 90%   33C    P8    20W / 250W |  10622MiB / 11172MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce GTX 108...  On   | 0000:03:00.0     Off |                  N/A |
| 80%   32C    P8    19W / 250W |  10622MiB / 11172MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage      |
|=============================================================================|
|    0      2309    G   /usr/lib/xorg/Xorg                             738MiB |
|    0      2747    G   compiz                                         467MiB |
|    0     25319    C   ...yenv/versions/3.6.0/envs/h2o4gpu/bin/python 10335MiB |
|    0     30919    G   ...el-token=798A69B792D7683DCFF8AA9AA8F4DFD9   141MiB |
|    1     25319    C   ...yenv/versions/3.6.0/envs/h2o4gpu/bin/python 10611MiB |
|    2     25319    C   ...yenv/versions/3.6.0/envs/h2o4gpu/bin/python 10611MiB |
+-----------------------------------------------------------------------------+

```