# HIGGS example using IBM PowerAI Snap ML

In this example we will train a Logistic Regression model on the HIGGS dataset, using both scikit-learn and snap-ml-local.

The HIGGS dataset is avaliable in the UCI machine learning repository.

Update device_ids list in LogisticRegression of snap_ml based on the number of GPUs available for you.

To avoid 'kernel restart' problem increase CPU and memory for the jupyter environment (e.g. memory 10GB, CPU 100) and restart it

### Download input data

Two wget commands are given below for downloading input HIGGS dataset - one for reduced dataset and another for bigger/full dataset. Many times better perfomance of snapML training is seen with bigger dataset.

You can comment the line containing the wget command to avoid downloading dataset again if running the same wget command more than once. Similarly preprocessing code can be commented out second time onwards if using the same dataset.

In [3]:
# About 3 times better training time with snapML compared to sklearn with this full HIGGS dataset
#!mkdir -p data; cd data; wget -nc https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/HIGGS.bz2; bunzip2 HIGGS.bz2; cd ../


#!mkdir -p data; cd data; wget -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz; gunzip HIGGS.csv.gz; cd ../
    

# Download reduced dataset
#!mkdir -p data; cd data; wget -O HIGGS -nc https://ibm.box.com/shared/static/v684mqemrrz9o9gsko4ox5l30t6ncqag; cd ../

### Preprocess the data

In [2]:
# Preprocess the data
from sklearn.datasets import load_svmlight_file

defaultPath = "."

X,y = load_svmlight_file(defaultPath + "/data/HIGGS.csv")

# Make the train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert to numpy ararys
import numpy as np
X_train = np.array(X_train.todense())
X_test  = np.array(X_test.todense())

# Normalize the training data
from sklearn.preprocessing import normalize
X_train = normalize(X_train, axis=1, norm='l1')
X_test  = normalize(X_test,  axis=1, norm='l1')

# Save the dense matrices
np.save(defaultPath + "/data/HIGGS.X_train", X_train)
np.save(defaultPath + "/data/HIGGS.X_test",  X_test)

# Save the labels
np.save(defaultPath + "/data/HIGGS.y_train", y_train)
np.save(defaultPath + "/data/HIGGS.y_test", y_test)

ValueError: could not convert string to float: b'1.000000000000000000e+00,8.692932128906250000e-01,-6.350818276405334473e-01,2.256902605295181274e-01,3.274700641632080078e-01,-6.899932026863098145e-01,7.542022466659545898e-01,-2.485731393098831177e-01,-1.092063903808593750e+00,0.000000000000000000e+00,1.374992132186889648e+00,-6.536741852760314941e-01,9.303491115570068359e-01,1.107436060905456543e+00,1.138904333114624023e+00,-1.578198313713073730e+00,-1.046985387802124023e+00,0.000000000000000000e+00,6.579295396804809570e-01,-1.045456994324922562e-02,-4.576716944575309753e-02,3.101961374282836914e+00,1.353760004043579102e+00,9.795631170272827148e-01,9.780761599540710449e-01,9.200048446655273438e-01,7.216574549674987793e-01,9.887509346008300781e-01,8.766783475875854492e-01'

### Training and Evaluating a Logistic Regression Model using GPU

In [None]:
# Training and Evaluating a Logistic Regression Model using GPU
from scipy import sparse

# Load the data
import time
import numpy as np
import argparse

defaultPath = "."

t0 = time.time()
X_train = np.load(defaultPath + "/data/HIGGS.X_train.npy")
X_test  = np.load(defaultPath + "/data/HIGGS.X_test.npy")
y_train = np.load(defaultPath + "/data/HIGGS.y_train.npy")
y_test  = np.load(defaultPath + "/data/HIGGS.y_test.npy")
print("Data load time (s):  {0:.2f}".format(time.time()-t0))

# Import the LogisticRegression from snap.ml
from snap_ml import LogisticRegression
lr = LogisticRegression(use_gpu=True, max_iter=15, dual=True, num_threads=32, device_ids=[0,1])

# Training
t0 = time.time()
lr.fit(X_train, y_train)
print("[snap.ml] Training time (s):  {0:.2f}".format(time.time()-t0))

# Inference
proba_test = lr.predict_proba(X_test)

# Evaluate log-loss on test set
from sklearn.metrics import log_loss
logloss_snap = log_loss(y_test, proba_test)
print("[snap.ml] Logarithmic loss:   {0:.4f}".format(logloss_snap))

# Import the LogisticRegression from sklearn
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(fit_intercept=False, dual=True)

# Training time
t0 = time.time()
lr.fit(X_train, y_train)
print("[sklearn] Training time (s):  {0:.2f}".format(time.time()-t0))

# Inference
proba_test = lr.predict_proba(X_test)

# Evaluate log-loss on test set
logloss_sklearn = log_loss(y_test, proba_test)
print("[sklearn] Logarithmic loss:   {0:.4f}".format(logloss_sklearn))

### Training and Evaluating a Logistic Regression Model using CPU

In [None]:
# Training and Evaluating a Logistic Regression Model using CPU
from scipy import sparse

# Load the data
import time
import numpy as np
import argparse

defaultPath = "."

t0 = time.time()
X_train = np.load(defaultPath + "/data/HIGGS.X_train.npy")
X_test  = np.load(defaultPath + "/data/HIGGS.X_test.npy")
y_train = np.load(defaultPath + "/data/HIGGS.y_train.npy")
y_test  = np.load(defaultPath + "/data/HIGGS.y_test.npy")
print("Data load time (s):  {0:.2f}".format(time.time()-t0))

# Import the LogisticRegression from snap.ml
from pai4sk import LogisticRegression
lr = LogisticRegression(use_gpu=False, max_iter=15, dual=True, num_threads=32, device_ids=[])

# Training
t0 = time.time()
lr.fit(X_train, y_train)
print("[snap.ml] Training time (s):  {0:.2f}".format(time.time()-t0))

# Inference
proba_test = lr.predict_proba(X_test)

# Evaluate log-loss on test set
from sklearn.metrics import log_loss
logloss_snap = log_loss(y_test, proba_test)
print("[snap.ml] Logarithmic loss:   {0:.4f}".format(logloss_snap))

# Import the LogisticRegression from sklearn
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(fit_intercept=False, dual=True)

# Training time
t0 = time.time()
lr.fit(X_train, y_train)
print("[sklearn] Training time (s):  {0:.2f}".format(time.time()-t0))

# Inference
proba_test = lr.predict_proba(X_test)

# Evaluate log-loss on test set
logloss_sklearn = log_loss(y_test, proba_test)
print("[sklearn] Logarithmic loss:   {0:.4f}".format(logloss_sklearn))

&copy; Copyright IBM Corporation 2018