#### If you haven't installed Hummingbird, do that first, by uncommenting the line below.


In [1]:
# %pip install hummingbird_ml

#### Import Hummingbird's convert function

In [3]:
from hummingbird.ml import convert

#### Download and unzip the Year data

In [4]:
import zipfile
import urllib.request as urllib

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip'

# urlretrieve() without a target path downloads into a temporary file and
# returns (local_path, headers). Despite its name, `filehandle` is that local
# path, not an open file object.
filehandle = urllib.urlretrieve(url)[0]
try:
    # The context manager closes the archive even if reading a member fails.
    with zipfile.ZipFile(filehandle, 'r') as zip_file_object:
        # Only the first member of the archive is read.
        filename = zip_file_object.namelist()[0]
        # Raw bytes of that member; parsed into a DataFrame in a later cell.
        bytes_data = zip_file_object.read(filename)
finally:
    # Remove the temporary download that urlretrieve() left on disk.
    urllib.urlcleanup()

#### Convert the bytes to pandas and split it up

In [5]:
import pandas as pd
from io import BytesIO
from sklearn.model_selection import train_test_split

# The file has no header row, so columns are numbered 0..N.
year = pd.read_csv(BytesIO(bytes_data), header=None)

# Column 0 is used as the target; every remaining column is a feature.
y = year.iloc[:, 0]
X = year.iloc[:, 1:]

# Using the full training portion (train_size = 463715) works as well, but
# it extends the training time, so a smaller slice is used here.
train_size = 200000

# shuffle=False keeps the original row order: the first `train_size` rows are
# the training set and the rows that follow them form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=train_size,
    test_size=51630,
    shuffle=False,
)

# Keep the test features as a plain numpy array for the prediction benchmarks.
data = X_test.to_numpy()

In [6]:
# Peek at the first rows of the test features, if desired.
X_test.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
200000,45.0954,-57.29138,20.05392,0.10414,12.00346,-14.31259,2.54547,-3.33986,-3.60175,-8.99414,...,7.15503,-81.48297,34.91722,15.11917,-6.09056,-78.52893,-46.87559,2.89571,64.25949,0.78578
200001,45.11673,-18.39958,-1.61552,-3.67929,-13.24027,-6.84375,-10.33838,-11.12891,16.56924,6.70243,...,8.26246,-138.26547,23.59451,60.99156,4.18891,105.75496,-126.39851,-3.80727,55.00532,-3.42354
200002,46.85191,9.44824,0.31518,-16.85413,15.42389,-15.82587,-21.21385,-10.16067,14.45113,2.22865,...,13.24783,-91.25475,41.123,55.22389,8.33048,7.57355,17.34516,0.21543,-58.3452,3.9276
200003,45.79644,-36.8623,21.9932,-10.4236,-2.8941,-8.8401,-23.73864,-9.82956,13.08399,0.57577,...,8.13403,-194.21155,55.55883,21.65387,6.41164,-47.18867,-212.1427,-4.6755,-86.99988,-6.12034
200004,40.92442,-43.26026,-18.721,-11.72495,-19.69395,-10.54229,-9.91945,-14.85633,9.37409,-0.93093,...,17.63167,-203.56276,27.10714,36.90795,0.62431,-40.23377,-83.22141,4.44391,73.15568,3.25023


#### Train the model (note: this may take a while for larger values of _num_est_)

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Number of trees in the forest. 100 was used originally, but training took too long.
num_est = 20

# A fixed random_state makes the fitted forest (and therefore the converted
# models benchmarked below) the same on every Restart & Run All.
skl_model = RandomForestRegressor(n_estimators=num_est, max_depth=8, random_state=0)
skl_model.fit(X_train, y_train)

#### scikit-learn (CPU only)

In [None]:
# Baseline: time scikit-learn's own predict on the test array.
# -o makes %timeit return its result object so it can be plotted later.
skl_time = %timeit -o skl_model.predict(data)

#### Convert scikit-learn model to PyTorch

In [None]:
# Convert the fitted scikit-learn model with Hummingbird, targeting the 'torch' backend.
model = convert(skl_model, 'torch')

#### Time PyTorch - CPU

In [None]:
# Time prediction with the converted PyTorch model on the same test array.
pred_cpu_hb = %timeit -o model.predict(data)

#### Switch PyTorch from CPU to GPU

In [None]:
%%capture 
# Switch the converted model to the 'cuda' device; %%capture hides this cell's output.
# NOTE(review): the return value is discarded and the next timing cell reuses
# `model`, so this presumably moves the model in place — confirm.
model.to('cuda')

#### Time PyTorch - GPU

In [None]:
# Same call as the CPU timing; `model` was switched to 'cuda' in the previous cell.
pred_gpu_hb = %timeit -o model.predict(data)

## Note: Continuing in this notebook requires TVM built with LLVM support. See the [TVM install instructions](https://tvm.apache.org/docs/install/index.html).

#### Convert scikit-learn model to TVM (CPU)

In [None]:
# Convert the same scikit-learn model to the 'tvm' backend. The test array is
# passed as the third argument (presumably as sample input for compilation — TODO confirm).
model_tvm = convert(skl_model, 'tvm', data)

#### Time TVM - CPU

In [None]:
# Time prediction with the TVM-converted model on the same test array.
pred_cpu_tvm = %timeit -o model_tvm.predict(data)

#### Convert scikit-learn model to TVM (GPU)

In [None]:
# Convert again, this time passing 'cuda' as the fourth argument.
# This rebinds `model_tvm`, so the earlier TVM conversion is no longer reachable by name.
model_tvm = convert(skl_model, 'tvm', data, 'cuda')

#### Time TVM - GPU

In [None]:
# `model_tvm` now refers to the 'cuda' conversion, so this times that model.
pred_gpu_tvm = %timeit -o model_tvm.predict(data)

#### Plot the results

In [None]:
def plot(title, skl_time, pred_cpu_hb, pred_gpu_hb, pred_cpu_tvm, pred_gpu_tvm):
    """Draw a bar chart comparing the average prediction time of each backend.

    Parameters
    ----------
    title : str
        Caption for the chart; it is rendered as the x-axis label.
    skl_time, pred_cpu_hb, pred_gpu_hb, pred_cpu_tvm, pred_gpu_tvm
        Timing results for the 'skl', 'pyt-cpu', 'pyt-gpu', 'tvm-cpu' and
        'tvm-gpu' bars, in that order. Only the ``average`` attribute of each
        object is read (e.g. the objects returned by ``%timeit -o``).

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``; nothing is returned, so
        calling this as the last expression of a cell shows the figure once.
    """
    # Imported locally so the function works regardless of which cells ran before.
    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.pyplot import cm

    labels = ['skl', 'pyt-cpu', 'pyt-gpu', 'tvm-cpu', 'tvm-gpu']
    timings = (skl_time, pred_cpu_hb, pred_gpu_hb, pred_cpu_tvm, pred_gpu_tvm)
    averages = [timing.average for timing in timings]
    bar_width = 1.0

    # Explicit axes instead of the implicit pyplot "current figure" state.
    _fig, ax = plt.subplots()
    ax.set_ylabel('time in seconds')
    ax.set_xlabel(title)

    # One colour per bar, derived from the number of bars rather than a literal 5.
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    bars = ax.bar(labels, averages, bar_width, color=colors)

    # Annotate each bar with its value, centred horizontally and placed
    # slightly (5%) above the top of the bar.
    for bar in bars:
        bar_height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., 1.05 * bar_height,
                '%.4f' % bar_height,
                ha='center', va='bottom')

    plt.show()

In [None]:
# The caption records how many trees the benchmarked forest uses.
chartname = f"SKL - {num_est} - Year Dataset"

plot(
    title=chartname,
    skl_time=skl_time,
    pred_cpu_hb=pred_cpu_hb,
    pred_gpu_hb=pred_gpu_hb,
    pred_cpu_tvm=pred_cpu_tvm,
    pred_gpu_tvm=pred_gpu_tvm,
)