In [1]:
# Example to show: 
#  - Insert Iris dataset as a table in the OmniSci Cloud instance using Python API.
#  - Download the table into a DataFrame 
#  - Train an XGBoost model on the DataFrame and use it to make predictions

import getpass
import os
from pprint import pprint

import numpy as np
import pandas as pd
import pymapd
from sklearn import datasets
# Run the shell command `pwd` through IPython and keep its captured output
# (IPython returns the output of a `!cmd` assignment as a list of lines).
PWD = !pwd

## Load dataset into MapD database

In [2]:
# Build a single DataFrame holding the iris feature columns followed by the
# class label in a trailing 'target' column.
iris = datasets.load_iris()
column_names = iris['feature_names'] + ['target']
values = np.column_stack((iris['data'], iris['target']))
df = pd.DataFrame(data=values, columns=column_names)
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0


In [3]:
# Connect to the OmniSci Cloud (MapD) instance.
#
# API keys must never be stored in the notebook itself. They are read from the
# environment variables OMNISCI_API_KEY_NAME / OMNISCI_API_KEY_SECRET, and the
# user is prompted (without echo) for any value that is not set.
dbname    = 'mapd'
username  = os.environ.get('OMNISCI_API_KEY_NAME') or getpass.getpass('OmniSci API key name: ')
password  = os.environ.get('OMNISCI_API_KEY_SECRET') or getpass.getpass('OmniSci API key secret: ')
hostname  = 'use2-api.omnisci.cloud'
mport     = 443
protocol = 'https'

con = pymapd.connect(user=username,
                password=password,
                host=hostname,
                dbname=dbname,
                port=mport,
                protocol=protocol)
print(con)

Connection(mapd://[REDACTED]:***@https://use2-api.omnisci.cloud:443/mapd?protocol=https)


In [4]:
table = 'iris'

# Drop any table left over from a previous run so the cell can be re-executed.
_drop = 'DROP TABLE IF EXISTS {}'.format(table)
con.execute(_drop)

# Recreate the table with one column per measurement plus the class label.
_create = (
    'CREATE TABLE {}('
    'sepal_length double, sepal_width double, '
    'petal_length double, petal_width double, target int)'
).format(table)
con.execute(_create)

# Upload the DataFrame contents as plain row tuples (index excluded).
con.load_table(table, df.itertuples(index=False))

## MapD to pandas DataFrame

In [5]:
# Read the whole table back from the database into a pandas DataFrame.
query = '''Select * from {}'''.format(table)
print(query)
cursor = con.execute(query)
if cursor.rowcount <= 0:
    # Fail loudly: without this check `df` would silently keep the frame
    # built earlier in the notebook and the output below would be stale.
    raise RuntimeError('Query returned no rows: {}'.format(query))

rows = list(cursor)
df = pd.DataFrame(rows, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'])

print(df.head(5))
df.describe()

Select * from iris
   sepal_length  sepal_width  petal_length  petal_width  target
0           5.2          4.1           1.5          0.1       0
1           5.5          4.2           1.4          0.2       0
2           4.9          3.1           1.5          0.2       0
3           5.0          3.2           1.2          0.2       0
4           5.3          3.7           1.5          0.2       0


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [6]:
# Split the label column off: `df_y` receives the 'target' values and the
# column is removed from `df`, leaving only the features there.
df_y = df.pop('target')

## XGBoost

### Training

In [7]:
import xgboost as xgb

In [8]:
# Wrap the feature DataFrame and its labels in an XGBoost DMatrix
dtrain = xgb.DMatrix(data=df, label=df_y)

In [9]:
# Booster hyper-parameters and run settings used by the cross-validation and
# training cell.
params = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # learning rate (step-size shrinkage) for each iteration
    'verbosity': 0,  # logging mode - quiet (replaces the deprecated 'silent' flag)
    'objective': 'multi:softprob',  # multiclass objective producing per-class probabilities
    'num_class': 3}  # the number of classes that exist in this dataset
num_round = 20  # the number of training iterations
# Rounds without improvement before cross-validation stops early.
# NOTE(review): this equals num_round, so early stopping presumably can never
# trigger within the run - confirm the intended value.
ear_stop = 20
cv_folds = 5  # the number of cross-validation folds

In [None]:
# Cross-validate the booster settings, report the error, then fit the model.
#
# The metric is requested explicitly: the result columns read below are named
# after the metric ('train-merror-*' / 'test-merror-*'), so relying on the
# library's default metric would make those lookups fragile.
cvresult = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    nfold=cv_folds,
    metrics='merror',
    early_stopping_rounds=ear_stop,
    show_stdv=True)

# Report all four statistics from the same boosting round (the one with the
# lowest mean test error). Taking an independent min() of every column would
# combine means and standard deviations that belong to different rounds.
best_round = cvresult['test-merror-mean'].idxmin()
cvtest = cvresult.loc[best_round, 'test-merror-mean']
cvtrain = cvresult.loc[best_round, 'train-merror-mean']
cvtestsd = cvresult.loc[best_round, 'test-merror-std']
cvtrainsd = cvresult.loc[best_round, 'train-merror-std']

print('CV train loss mean: {}'.format(cvtrain))
print('CV train loss std: {}'.format(cvtrainsd))
print('CV test loss mean: {}'.format(cvtest))
print('CV test loss std: {}'.format(cvtestsd))

# Train the final model on the full training matrix
mod = xgb.train(params, dtrain, num_round)

CV train loss mean: 0.0
CV train loss std: 0.0
CV test loss mean: 0.0333332
CV test loss std: 0.021081956478467553


In [None]:
from xgboost import plot_importance
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Set the figure size and resolution, then draw the feature-importance chart
# of the trained model.
matplotlib.rcParams.update({'figure.figsize': [15, 12], 'figure.dpi': 55})
plot_importance(mod)
plt.show()

### Prediction

In [None]:
# Score the same matrix the model was trained on, then pick for every row the
# index of the largest value in that row as the predicted class.
pred = mod.predict(dtrain)
pred_val = np.argmax(pred, axis=1)
pred_val