In [None]:
# Notebook setup: plotting mode, core libraries and a quick look at the GPUs.
# `%pylab inline` star-imports numpy/matplotlib names into the namespace and
# renders figures inline.
%pylab inline
import pandas as pd
import numpy as np
# Shell escape: print the NVIDIA driver's view of the GPUs on this machine.
!nvidia-smi
import sys
# Show the module search path of the running kernel.
print(sys.path)
# NOTE(review): py3nvml and subprocess are not referenced in the cells visible
# here - confirm they are still needed.
import py3nvml ## pip install -e git+https://github.com/fbcotter/py3nvml#egg=py3nvml
import subprocess
import h2o4gpu

In [None]:
# Read the training and testing datasets from disk.
# The Homesite CSVs are expected under ~/h2o4gpu-benchmarks/Data/Homesite/.
from os.path import expanduser, join

home = expanduser("~")
train_file = join(home, "h2o4gpu-benchmarks", "Data", "Homesite", "train.csv")
test_file = join(home, "h2o4gpu-benchmarks", "Data", "Homesite", "test.csv")

try:
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
except FileNotFoundError:
    # Only a missing file warrants the download hint. Re-raise so the notebook
    # stops here instead of failing later with a NameError (or, worse, silently
    # reusing `train`/`test` left over from an earlier run of the kernel).
    print("""
          Data is not located in specified path. \n
          Data can be downloaded from below link:\n
          https://www.kaggle.com/c/homesite-quote-conversion/data
          """)
    raise

# Choose number of clusters
k = 10

In [None]:
# Data munging step - KMeans takes only numerical values.
#
# Train and test are stacked so both get the same encoding:
#   * object-dtype (categorical) columns: missing values become their own
#     level, levels seen fewer than MIN_CATEGORY_COUNT times are pooled into a
#     single "rare" level, and every level is then replaced by its relative
#     frequency in the stacked data;
#   * all remaining columns: missing values become 0.
# The result is split back into train/test and cast to float32 matrices.

NA_LABEL = "__NA__"
RARE_LABEL = "___UNFREQ___"
MIN_CATEGORY_COUNT = 100  # levels rarer than this are pooled into RARE_LABEL

# errors="ignore" keeps the cell re-runnable: on a second run the target
# column has already been removed from `train`.
train = train.drop(columns=["QuoteConversion_Flag"], errors="ignore")
n_train = train.shape[0]

dataset = pd.concat([train, test], ignore_index=True)
n_rows = dataset.shape[0]

# Decide which columns are categorical *before* any filling changes dtypes.
categoricals = dataset.select_dtypes(include=["object"]).columns.tolist()

for col in categoricals:
    # Missing values form their own category.
    column = dataset[col].fillna(NA_LABEL)

    # Pool infrequent levels. Indexing the counts Series directly avoids
    # relying on the column names produced by value_counts().reset_index(),
    # which changed in pandas 2.0.
    counts = column.value_counts()
    rare_levels = counts.index[counts < MIN_CATEGORY_COUNT]
    column = column.where(~column.isin(rare_levels), RARE_LABEL)

    # Frequency-encode: each level -> share of rows carrying that level.
    freqs = column.value_counts() / float(n_rows)
    dataset[col] = column.map(freqs)

# Categorical columns are fully populated by now; what is left are gaps in the
# non-categorical columns, which are encoded as 0.
dataset = dataset.fillna(0)

trainenc = dataset.iloc[:n_train, :].reset_index(drop=True)
testenc = dataset.iloc[n_train:, :].reset_index(drop=True)

trainencflt = trainenc.values.astype(np.float32)
testencflt = testenc.values.astype(np.float32)

## H2O4GPU K-Means (single-GPU)

In [None]:
# Fit an H2O4GPU KMeans model with 1 GPU. 300 iterations by default.
model = h2o4gpu.KMeans(n_gpus=1, n_clusters=k)
%time model.fit(trainencflt)

#%time train_centroid_distance = model.transform(trainencflt)
#%time train_labels     = model.predict(trainencflt)

## H2O4GPU K-Means (multi-GPU)

In [None]:
# Fit an H2O4GPU KMeans model with 2 GPUs. 300 iterations by default.
model = h2o4gpu.KMeans(n_gpus=2, n_clusters=k)
%time model.fit(trainencflt)

#%time train_centroid_distance = model.transform(trainencflt)
#%time train_labels     = model.predict(trainencflt)

## Scikit-Learn

In [None]:
# Fit a SciKit Learn KMeans model with all available cores.
from sklearn.cluster import KMeans
model = KMeans(n_clusters=k, n_init=1, n_jobs=-1)
%time model.fit(trainencflt)
#train_assignments = model.predict(trainencflt)
#test_assignments = model.predict(testencflt)