### Data Mining and Machine Learning
### Ensembles of classifiers
#### Datasets:  Diabetes and Landsat
#### Edgar Acuna
#### April 2019

In [169]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
h2o.init(ip="localhost", port=54323)
#h2o.no_progress()

Checking whether there is an H2O instance running at http://localhost:54323. connected.


0,1
H2O cluster uptime:,1 hour 9 mins
H2O cluster timezone:,America/La_Paz
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.2
H2O cluster version age:,2 months and 15 days
H2O cluster name:,H2O_from_python_edgar2017_frvj4w
H2O cluster total nodes:,1
H2O cluster free memory:,1.637 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [170]:
x=[5,3,12,13,21,31,8,9,15,17,24,32] 

In [171]:
boot1=np.random.choice(x,12)
print(boot1)

[32 13 17  9 32 24  5  8 12  3 12  5]


In [172]:
np.unique(boot1)

array([ 3,  5,  8,  9, 12, 13, 17, 24, 32])

In [173]:
boot2=np.random.choice(x,12)
print(boot2)

[12 12 21 12  3 32 31  5 24 32  9  3]


In [174]:
np.unique(boot2)

array([ 3,  5,  9, 12, 21, 24, 31, 32])

Note: Approximately 37% of the  instances of the training sample  L DO NOT appear in any bootstrap sample. In the above examples 16.67% and 41.67% of instances do not appear in each  of the bootstrap samples.

### I. Bagging for Diabetes using trees and scikit learn

In [175]:
url= "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names,header=None)
#The response varaiable must be binary  (0,1)
y=data['class']-1
X=data.iloc[:,0:8]
modeltree = tree.DecisionTreeClassifier()
bagging = BaggingClassifier(modeltree,n_estimators=100)

In [177]:
# Accuracy rate by resubstitution
bagging.fit(X, y)
predictions = bagging.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       268

   micro avg       1.00      1.00      1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768



In [178]:
#Estimating the accuracy by cross validation
kfold = model_selection.KFold(n_splits=10, random_state=99)
results = model_selection.cross_val_score(bagging, X, y, cv=kfold)
print(results.mean())

0.7603896103896104


#### Out-of-Bag accuracy

In [179]:
bagging1 = BaggingClassifier(modeltree,n_estimators=50, oob_score=True)
bagging1.fit(X, y)
bagging1.oob_score_

0.7643229166666666

### II. AdaBoosting para Diabetes usando scikit-learn

In [180]:
adaboost = AdaBoostClassifier(modeltree,n_estimators=100,learning_rate=1)
adaboost.fit(X, y)
predictions = adaboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       268

   micro avg       1.00      1.00      1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768



In [181]:
#Estimating the accuracy by cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(adaboost, X, y, cv=kfold)
print(results.mean())

0.6938311688311688


### III. Gradient Boosting para Diabetes usando scikit-learn

In [182]:
gboost = GradientBoostingClassifier(n_estimators=100)
#X_train, X_train_lr, y_train, y_train_lr = train_test_split(X,y,test_size=0.5)
gboost.fit(X, y)
predictions = gboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       500
           1       0.91      0.81      0.86       268

   micro avg       0.91      0.91      0.91       768
   macro avg       0.91      0.88      0.89       768
weighted avg       0.91      0.91      0.90       768



In [183]:
#Estimating the accueacy bt cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(gboost, X, y, cv=kfold)
print(results.mean())

0.7669002050580999


### IV  Gradient Boosting for diabetes using h2o

In [184]:
diabetes = h2o.import_file("https://academic.uprm.edu/eacuna/diabetes.dat")
myx=['C1','C2','C3','C4','C5','C6','C7','C8']
diabetes['C9']=diabetes['C9'].asfactor()
myy="C9"
gbm1 = H2OGradientBoostingEstimator(model_id="gbm_covType_v1",ntrees = 100, max_depth=4,nfolds=10, sample_rate = 1,col_sample_rate = 1,seed=20000)
gbm1.train(myx, myy, training_frame=diabetes)
y_pred=gbm1.predict(diabetes)
print((y_pred['predict']==diabetes['C9']).mean())

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
[0.9388020833333334]


In [185]:
#Accuracy ny resubstitution
gbm1.model_performance(diabetes)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.06189488461506235
RMSE: 0.24878682564609877
LogLoss: 0.2278218904572922
Mean Per-Class Error: 0.06844776119402984
AUC: 0.9836082089552238
pr_auc: 0.9663482460483861
Gini: 0.9672164179104477
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4582730505624667: 


0,1,2,3,4
,1.0,2.0,Error,Rate
1,482.0,18.0,0.036,(18.0/500.0)
2,28.0,240.0,0.1045,(28.0/268.0)
Total,510.0,258.0,0.0599,(46.0/768.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4582731,0.9125475,171.0
max f2,0.3331835,0.9291396,206.0
max f0point5,0.5382824,0.9364407,148.0
max accuracy,0.4582731,0.9401042,171.0
max precision,0.9859219,1.0,0.0
max recall,0.1045607,1.0,309.0
max specificity,0.9859219,1.0,0.0
max absolute_mcc,0.4582731,0.8673905,171.0
max min_per_class_accuracy,0.4126285,0.9291045,185.0


Gains/Lift Table: Avg response rate: 34.90 %, avg score: 34.90 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0104167,0.9743136,2.8656716,2.8656716,1.0,0.9792077,1.0,0.9792077,0.0298507,0.0298507,186.5671642,186.5671642
,2,0.0208333,0.9648206,2.8656716,2.8656716,1.0,0.9699281,1.0,0.9745679,0.0298507,0.0597015,186.5671642,186.5671642
,3,0.03125,0.9559245,2.8656716,2.8656716,1.0,0.9600379,1.0,0.9697246,0.0298507,0.0895522,186.5671642,186.5671642
,4,0.0403646,0.9527817,2.8656716,2.8656716,1.0,0.9540077,1.0,0.9661756,0.0261194,0.1156716,186.5671642,186.5671642
,5,0.0507812,0.9411744,2.8656716,2.8656716,1.0,0.9461405,1.0,0.9620659,0.0298507,0.1455224,186.5671642,186.5671642
,6,0.1002604,0.8979212,2.8656716,2.8656716,1.0,0.9198390,1.0,0.9412266,0.1417910,0.2873134,186.5671642,186.5671642
,7,0.1510417,0.8357249,2.8656716,2.8656716,1.0,0.8691100,1.0,0.9169805,0.1455224,0.4328358,186.5671642,186.5671642
,8,0.2005208,0.7529427,2.8656716,2.8656716,1.0,0.7897352,1.0,0.8855823,0.1417910,0.5746269,186.5671642,186.5671642
,9,0.3007812,0.5329792,2.5679395,2.7664276,0.8961039,0.6480360,0.9653680,0.8064002,0.2574627,0.8320896,156.7939523,176.6427602







In [186]:
#Mostrando la matrix de confusion para estimar la precision out-of-bag y por validacion crizada
gbm1.confusion_matrix

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_covType_v1


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.06189488480582112
RMSE: 0.24878682602947674
LogLoss: 0.2278218932658851
Mean Per-Class Error: 0.06844776119402984
AUC: 0.9836082089552238
pr_auc: 0.9663482460483861
Gini: 0.9672164179104477
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4582730550290878: 


0,1,2,3,4
,1.0,2.0,Error,Rate
1,482.0,18.0,0.036,(18.0/500.0)
2,28.0,240.0,0.1045,(28.0/268.0)
Total,510.0,258.0,0.0599,(46.0/768.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4582731,0.9125475,171.0
max f2,0.3331834,0.9291396,206.0
max f0point5,0.5382824,0.9364407,148.0
max accuracy,0.4582731,0.9401042,171.0
max precision,0.9859219,1.0,0.0
max recall,0.1045607,1.0,309.0
max specificity,0.9859219,1.0,0.0
max absolute_mcc,0.4582731,0.8673905,171.0
max min_per_class_accuracy,0.4126285,0.9291045,185.0


Gains/Lift Table: Avg response rate: 34.90 %, avg score: 34.90 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0104167,0.9743136,2.8656716,2.8656716,1.0,0.9792077,1.0,0.9792077,0.0298507,0.0298507,186.5671642,186.5671642
,2,0.0208333,0.9648206,2.8656716,2.8656716,1.0,0.9699281,1.0,0.9745679,0.0298507,0.0597015,186.5671642,186.5671642
,3,0.03125,0.9559246,2.8656716,2.8656716,1.0,0.9600379,1.0,0.9697246,0.0298507,0.0895522,186.5671642,186.5671642
,4,0.0403646,0.9527818,2.8656716,2.8656716,1.0,0.9540077,1.0,0.9661756,0.0261194,0.1156716,186.5671642,186.5671642
,5,0.0507812,0.9411744,2.8656716,2.8656716,1.0,0.9461406,1.0,0.9620659,0.0298507,0.1455224,186.5671642,186.5671642
,6,0.1002604,0.8979212,2.8656716,2.8656716,1.0,0.9198390,1.0,0.9412267,0.1417910,0.2873134,186.5671642,186.5671642
,7,0.1510417,0.8357249,2.8656716,2.8656716,1.0,0.8691100,1.0,0.9169805,0.1455224,0.4328358,186.5671642,186.5671642
,8,0.2005208,0.7529426,2.8656716,2.8656716,1.0,0.7897353,1.0,0.8855823,0.1417910,0.5746269,186.5671642,186.5671642
,9,0.3007812,0.5329792,2.5679395,2.7664276,0.8961039,0.6480360,0.9653680,0.8064002,0.2574627,0.8320896,156.7939523,176.6427602




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.17092323346297955
RMSE: 0.4134286316439387
LogLoss: 0.5301717811615181
Mean Per-Class Error: 0.25068656716417914
AUC: 0.8136343283582089
pr_auc: 0.6697367443180624
Gini: 0.6272686567164178
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.20111566420593152: 


0,1,2,3,4
,1.0,2.0,Error,Rate
1,337.0,163.0,0.326,(163.0/500.0)
2,47.0,221.0,0.1754,(47.0/268.0)
Total,384.0,384.0,0.2734,(210.0/768.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2011157,0.6779141,250.0
max f2,0.0826910,0.7848655,321.0
max f0point5,0.5894900,0.6644981,129.0
max accuracy,0.5894900,0.7604167,129.0
max precision,0.9876169,1.0,0.0
max recall,0.0070105,1.0,395.0
max specificity,0.9876169,1.0,0.0
max absolute_mcc,0.2011157,0.4753318,250.0
max min_per_class_accuracy,0.3133239,0.738,212.0


Gains/Lift Table: Avg response rate: 34.90 %, avg score: 34.20 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0104167,0.9748094,2.1492537,2.1492537,0.75,0.9816734,0.75,0.9816734,0.0223881,0.0223881,114.9253731,114.9253731
,2,0.0208333,0.9616998,2.8656716,2.5074627,1.0,0.9700813,0.875,0.9758774,0.0298507,0.0522388,186.5671642,150.7462687
,3,0.03125,0.9497090,1.4328358,2.1492537,0.5,0.9560591,0.75,0.9692713,0.0149254,0.0671642,43.2835821,114.9253731
,4,0.0403646,0.9396267,1.6375267,2.0337025,0.5714286,0.9449205,0.7096774,0.9637727,0.0149254,0.0820896,63.7526652,103.3702455
,5,0.0507812,0.9300895,2.1492537,2.0574053,0.75,0.9348179,0.7179487,0.9578333,0.0223881,0.1044776,114.9253731,105.7405281
,6,0.1002604,0.8754006,2.4131972,2.2329909,0.8421053,0.9055971,0.7792208,0.9320544,0.1194030,0.2238806,141.3197172,123.2990890
,7,0.1510417,0.7979800,2.0574053,2.1739578,0.7179487,0.8340616,0.7586207,0.8991085,0.1044776,0.3283582,105.7405281,117.3957797
,8,0.2005208,0.7022921,1.8853103,2.1027331,0.6578947,0.7483644,0.7337662,0.8619119,0.0932836,0.4216418,88.5310291,110.2733088
,9,0.3007812,0.5315935,1.6003101,1.9352588,0.5584416,0.6068712,0.6753247,0.7768983,0.1604478,0.5820896,60.0310138,93.5258771



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7,8,9,10,11,12
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.7527147,0.0407522,0.7051282,0.7857143,0.7804878,0.7468355,0.7402598,0.835443,0.6944444,0.8205128,0.6375,0.7808219
auc,0.8144981,0.0306199,0.7694445,0.848,0.8275862,0.8136054,0.8092198,0.8690702,0.7517361,0.8559783,0.7438187,0.8565217
err,0.2472852,0.0407522,0.2948718,0.2142857,0.2195122,0.2531646,0.2597403,0.1645570,0.3055556,0.1794872,0.3625,0.2191781
err_count,19.0,3.2710855,23.0,15.0,18.0,20.0,20.0,13.0,22.0,14.0,29.0,16.0
f0point5,0.6436048,0.0441696,0.6182796,0.6832298,0.6804734,0.6578947,0.6553398,0.6194690,0.5625,0.7738095,0.5387931,0.6462585
f1,0.7003305,0.0342510,0.6666667,0.7457627,0.71875,0.7142857,0.7297297,0.6829268,0.6206896,0.7878788,0.6329114,0.7037037
f2,0.7705051,0.0273724,0.7232704,0.8208955,0.7615894,0.78125,0.8231707,0.7608696,0.6923077,0.8024691,0.7668711,0.7723577
lift_top_group,2.4539034,0.9525076,2.6,2.8,2.8275862,2.6333334,0.0,4.647059,3.0,0.0,2.857143,3.173913
logloss,0.5298222,0.0553850,0.6042044,0.4873038,0.4956354,0.5666018,0.5579652,0.3700579,0.5975564,0.5149542,0.6504122,0.4535302


Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2019-04-03 19:02:28,2.357 sec,0.0,0.4766408,0.6467994,0.5,0.0,1.0,0.6510417
,2019-04-03 19:02:28,2.360 sec,1.0,0.4583666,0.6094445,0.8705560,0.5840419,2.6268657,0.21875
,2019-04-03 19:02:28,2.363 sec,2.0,0.4430368,0.5792476,0.8758731,0.6118444,2.7034638,0.2135417
,2019-04-03 19:02:28,2.365 sec,3.0,0.4300693,0.5542099,0.8826418,0.6160802,2.7034638,0.2083333
,2019-04-03 19:02:28,2.367 sec,4.0,0.4193521,0.5337535,0.8839366,0.6425409,2.8005427,0.2057292
---,---,---,---,---,---,---,---,---,---
,2019-04-03 19:02:28,2.689 sec,96.0,0.2537534,0.2344594,0.9816940,0.9630855,2.8656716,0.0651042
,2019-04-03 19:02:28,2.693 sec,97.0,0.2533385,0.2337444,0.9817873,0.9632840,2.8656716,0.0651042
,2019-04-03 19:02:28,2.697 sec,98.0,0.2523625,0.2323599,0.9823284,0.9641129,2.8656716,0.0638021



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
C2,253.0619049,1.0,0.4011016
C6,133.0974426,0.5259482,0.2109587
C8,78.7900696,0.3113470,0.1248818
C7,65.5236588,0.2589234,0.1038546
C1,35.4766197,0.1401895,0.0562302
C5,30.2431660,0.1195090,0.0479352
C3,22.3595104,0.0883559,0.0354397
C4,12.3648167,0.0488608,0.0195982


<bound method H2OBinomialModel.confusion_matrix of >

### V. Bagging  using Decision Trees for Landsat (scikit-learn)

In [190]:
url='http://academic.uprm.edu/eacuna/landsat.txt'
data = pd.read_table(url, header=None,delim_whitespace=True)
y=data.iloc[:,36]-1
names=['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13',
            'C14','C15','C16','C17','C18','C19','C20','C21','C22','C23','C24','C25','C26','C27',
           'C28','C29', 'C30','C31','C32','C33','C34','C35','C36','C37']
X=data.iloc[:,0:36]
modeltree = tree.DecisionTreeClassifier()
bagging = BaggingClassifier(modeltree,n_estimators=100, max_features=1.0)
# Tasa de precision
bagging.fit(X, y)
predictions = bagging.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       479
           2       1.00      1.00      1.00       961
           3       1.00      1.00      1.00       415
           4       1.00      1.00      1.00       470
           5       1.00      1.00      1.00      1038

   micro avg       1.00      1.00      1.00      4435
   macro avg       1.00      1.00      1.00      4435
weighted avg       1.00      1.00      1.00      4435



In [191]:
#Accuracy by resubstitution
kfold = model_selection.KFold(n_splits=10, random_state=99)
results = model_selection.cross_val_score(modeltree, X, y, cv=kfold)
print(results.mean())

0.7979826327456124


In [192]:
#accuracy by out-of-bag
bagging1 = BaggingClassifier(modeltree,n_estimators=50, oob_score=True)
bagging1.fit(X, y)
bagging1.oob_score_

0.8980834272829763

### VI. AdaBoosting for Landsat

In [193]:
adaboost = AdaBoostClassifier(modeltree,n_estimators=100,learning_rate=1)
adaboost.fit(X, y)
predictions = adaboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       479
           2       1.00      1.00      1.00       961
           3       1.00      1.00      1.00       415
           4       1.00      1.00      1.00       470
           5       1.00      1.00      1.00      1038

   micro avg       1.00      1.00      1.00      4435
   macro avg       1.00      1.00      1.00      4435
weighted avg       1.00      1.00      1.00      4435



In [194]:
#accuracy by cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(adaboost, X, y, cv=kfold)
print(results.mean())

0.8045258576861285


In [195]:
gboost = GradientBoostingClassifier(n_estimators=100)
#X_train, X_train_lr, y_train, y_train_lr = train_test_split(X,y,test_size=0.5)
gboost.fit(X, y)
predictions = gboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       479
           2       0.96      0.99      0.98       961
           3       0.97      0.87      0.91       415
           4       0.99      0.99      0.99       470
           5       0.97      0.98      0.98      1038

   micro avg       0.98      0.98      0.98      4435
   macro avg       0.98      0.97      0.98      4435
weighted avg       0.98      0.98      0.98      4435



In [196]:
#Estimating the accueacy bt cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(gboost, X, y, cv=kfold)
print(results.mean())

0.8645084700953776


### VII. Gradient Boostimg for Landsat using H2o

In [200]:
#Leyendo los datos
datos= h2o.import_file("http://academic.uprm.edu/eacuna/landsat.txt")
myx=['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13',
            'C14','C15','C16','C17','C18','C19','C20','C21','C22','C23','C24','C25','C26','C27',
           'C28','C29', 'C30','C31','C32','C33','C34','C35','C36']
datos['C37']=datos['C37'].asfactor()
myy="C37"
gbm2 = H2OGradientBoostingEstimator(model_id="gbm_covType_v1",ntrees = 100, max_depth=4,nfolds=10, sample_rate = 1,col_sample_rate = 1,seed=20000)
gbm2.train(myx, myy, training_frame=datos)
y_pred=gbm2.predict(datos)
print((y_pred['predict']==datos['C37']).mean())

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
[0.9950394588500564]


In [201]:
#Mostrando la matrix de confusion para estimar la precision out-of-bag y por validacion crizada
gbm2.confusion_matrix

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_covType_v1


ModelMetricsMultinomial: gbm
** Reported on train data. **

MSE: 0.00798614637609013
RMSE: 0.08936524143138724
LogLoss: 0.044830497516567674
Mean Per-Class Error: 0.007389093948046522
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7
1.0,2.0,3.0,4.0,5.0,6.0,Error,Rate
1072.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 1,072"
0.0,479.0,0.0,0.0,0.0,0.0,0.0,0 / 479
0.0,0.0,961.0,0.0,0.0,0.0,0.0,0 / 961
0.0,0.0,7.0,399.0,0.0,9.0,0.0385542,16 / 415
0.0,0.0,0.0,0.0,470.0,0.0,0.0,0 / 470
0.0,0.0,4.0,2.0,0.0,1032.0,0.0057803,"6 / 1,038"
1072.0,479.0,972.0,401.0,470.0,1041.0,0.0049605,"22 / 4,435"


Top-6 Hit Ratios: 


0,1
k,hit_ratio
1,0.9950395
2,0.9995490
3,1.0
4,1.0
5,1.0
6,1.0



ModelMetricsMultinomial: gbm
** Reported on cross-validation data. **

MSE: 0.0720374623883169
RMSE: 0.26839795526105803
LogLoss: 0.242699011096459
Mean Per-Class Error: 0.11790647672150929
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7
1.0,2.0,3.0,4.0,5.0,6.0,Error,Rate
1046.0,2.0,12.0,2.0,9.0,1.0,0.0242537,"26 / 1,072"
0.0,461.0,2.0,5.0,9.0,2.0,0.0375783,18 / 479
5.0,1.0,917.0,27.0,0.0,11.0,0.0457856,44 / 961
2.0,7.0,73.0,260.0,2.0,71.0,0.3734940,155 / 415
24.0,5.0,1.0,5.0,408.0,27.0,0.1319149,62 / 470
0.0,1.0,20.0,56.0,21.0,940.0,0.0944123,"98 / 1,038"
1077.0,477.0,1025.0,355.0,449.0,1052.0,0.0908681,"403 / 4,435"


Top-6 Hit Ratios: 


0,1
k,hit_ratio
1,0.9091319
2,0.98354
3,0.9975197
4,0.9990981
5,0.9995490
6,1.0


Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7,8,9,10,11,12
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.9091753,0.0051036,0.8972602,0.913486,0.9212410,0.908686,0.9080718,0.9177215,0.9076213,0.9092873,0.8975501,0.9108281
err,0.0908247,0.0051036,0.1027397,0.0865140,0.0787589,0.0913140,0.0919283,0.0822785,0.0923787,0.0907127,0.1024499,0.0891720
err_count,40.3,2.793743,45.0,34.0,33.0,41.0,41.0,39.0,40.0,42.0,46.0,42.0
logloss,0.2423220,0.0238197,0.2634830,0.2158724,0.2090010,0.2943289,0.2078292,0.2147693,0.2419506,0.2922446,0.2730433,0.2106980
max_per_class_error,0.3661962,0.0522200,0.4313726,0.2352941,0.3529412,0.3513514,0.275,0.3658537,0.4042553,0.3170732,0.4390244,0.4897959
mean_per_class_accuracy,0.8830432,0.0074455,0.8743715,0.9011727,0.886739,0.8807260,0.8897406,0.8878265,0.8814279,0.8893574,0.8592726,0.8797972
mean_per_class_error,0.1169569,0.0074455,0.1256285,0.0988273,0.1132610,0.1192739,0.1102594,0.1121735,0.1185721,0.1106426,0.1407274,0.1202028
mse,0.0719436,0.0052574,0.0803244,0.0664981,0.0621767,0.0826347,0.0663168,0.0660817,0.0687912,0.0801028,0.0802278,0.0662818
r2,0.9795344,0.0016480,0.9776692,0.9812176,0.9831359,0.9775671,0.9810706,0.9803757,0.9801887,0.9763857,0.9759634,0.9817706


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error
,2019-04-03 19:11:40,26.873 sec,0.0,0.8333333,1.7917595,0.8063134
,2019-04-03 19:11:40,26.899 sec,1.0,0.7615863,1.4415167,0.1278467
,2019-04-03 19:11:40,26.923 sec,2.0,0.6986460,1.2151781,0.1149944
,2019-04-03 19:11:40,26.946 sec,3.0,0.6423963,1.0488657,0.1113867
,2019-04-03 19:11:40,26.968 sec,4.0,0.5920875,0.9191481,0.1059752
---,---,---,---,---,---,---
,2019-04-03 19:11:42,29.138 sec,96.0,0.0939514,0.0479037,0.0058625
,2019-04-03 19:11:42,29.161 sec,97.0,0.0926412,0.0470240,0.0056370
,2019-04-03 19:11:42,29.188 sec,98.0,0.0914996,0.0462301,0.0054115



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
C17,2636.6804199,1.0,0.1909320
C22,1897.1207275,0.7195111,0.1373777
C20,1485.0837402,0.5632399,0.1075405
C18,1168.8540039,0.4433051,0.0846411
C34,1040.6865234,0.3946957,0.0753600
---,---,---,---
C8,63.4294167,0.0240565,0.0045932
C1,54.3748970,0.0206225,0.0039375
C11,53.4609451,0.0202759,0.0038713



See the whole table with table.as_data_frame()


<bound method H2OMultinomialModel.confusion_matrix of >