### Data Mining and Machine Learning
### Ensembles of classifiers
#### Datasets:  Diabetes and Landsat
#### Edgar Acuna
#### April 2020

In [4]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
h2o.init(ip="localhost", port=54323)
#h2o.no_progress()

Checking whether there is an H2O instance running at http://localhost:54323 . connected.


0,1
H2O cluster uptime:,2 mins 45 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.27.0.4835
H2O cluster version age:,5 months and 8 days !!!
H2O cluster name:,H2O_from_python_eacun_msgzj3
H2O cluster total nodes:,1
H2O cluster free memory:,3.518 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


### Bootstrap Samples

In [5]:
#This is the orginal training sample L 
x=[5,3,12,13,21,31,8,9,15,17,24,32] 

In [6]:
#This is bootsrap sample(sample with replacement)
boot1=np.random.choice(x,12)
print(boot1)

[17 32 15 32 12  5 17 15 31 17  5  5]


In [7]:
np.unique(boot1)

array([ 5, 12, 15, 17, 31, 32])

In [8]:
#another boostrapp sample
boot2=np.random.choice(x,12)
print(boot2)

[15  9  3 21  5  5 15 15 15  9 15 17]


In [6]:
np.unique(boot2)

array([ 3,  8,  9, 12, 13, 15, 17, 21, 24, 32])

Note: Approximately 37% of the  instances of the training sample  L DO NOT appear in any bootstrap sample. In the above examples 16.67% and 41.67% of instances do not appear in each  of the bootstrap samples.

### I. Bagging for Diabetes using trees and scikit learn

In [9]:
url= "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names,header=None)
#The response varaiable must be binary  (0,1)
y=data['class']-1
X=data.iloc[:,0:8]
modeltree = tree.DecisionTreeClassifier()
bagging = BaggingClassifier(modeltree,n_estimators=100)

In [10]:
# Accuracy rate by resubstitution
bagging.fit(X, y)
predictions = bagging.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       268

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768



In [11]:
#Estimating the accuracy by cross validation
kfold = model_selection.KFold(n_splits=10, random_state=99)
results = model_selection.cross_val_score(bagging, X, y, cv=kfold)
print(results.mean())

0.7603383458646615


#### Out-of-Bag accuracy

In [12]:
bagging1 = BaggingClassifier(modeltree,n_estimators=50, oob_score=True)
bagging1.fit(X, y)
bagging1.oob_score_

0.7486979166666666

### II. AdaBoosting para Diabetes usando scikit-learn

In [13]:
adaboost = AdaBoostClassifier(modeltree,n_estimators=100,learning_rate=1)
adaboost.fit(X, y)
predictions = adaboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       500
           1       1.00      1.00      1.00       268

    accuracy                           1.00       768
   macro avg       1.00      1.00      1.00       768
weighted avg       1.00      1.00      1.00       768



In [14]:
#Estimating the accuracy by cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(adaboost, X, y, cv=kfold)
print(results.mean())

0.6991285030758715


### III. Gradient Boosting para Diabetes usando scikit-learn

In [16]:
gboost = GradientBoostingClassifier(n_estimators=100)
#X_train, X_train_lr, y_train, y_train_lr = train_test_split(X,y,test_size=0.5)
gboost.fit(X, y)
predictions = gboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       500
           1       0.91      0.81      0.86       268

    accuracy                           0.91       768
   macro avg       0.91      0.88      0.89       768
weighted avg       0.91      0.91      0.90       768



In [17]:
#Estimating the accuracy by cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(gboost, X, y, cv=kfold)
print(results.mean())

0.7669002050580999


### IV  Gradient Boosting for diabetes using h2o

In [19]:
diabetes = h2o.import_file("https://academic.uprm.edu/eacuna/diabetes.dat")
myx=['C1','C2','C3','C4','C5','C6','C7','C8']
diabetes['C9']=diabetes['C9'].asfactor()
myy="C9"
gbm1 = H2OGradientBoostingEstimator(model_id="gbm_covType_v1",ntrees = 100, max_depth=4,nfolds=10, sample_rate = 1,col_sample_rate = 1,seed=20000)
gbm1.train(myx, myy, training_frame=diabetes)
y_pred=gbm1.predict(diabetes)
print((y_pred['predict']==diabetes['C9']).mean())

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
[0.9388020833333334]


In [20]:
#Accuracy ny resubstitution
gbm1.model_performance(diabetes)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.06189488461506235
RMSE: 0.24878682564609877
LogLoss: 0.2278218904572922
Mean Per-Class Error: 0.06844776119402984
AUC: 0.9836082089552238
pr_auc: 0.9663482460483861
Gini: 0.9672164179104477

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4582730505624667: 


Unnamed: 0,Unnamed: 1,1,2,Error,Rate
0,1,482.0,18.0,0.036,(18.0/500.0)
1,2,28.0,240.0,0.1045,(28.0/268.0)
2,Total,510.0,258.0,0.0599,(46.0/768.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.458273,0.912548,171.0
1,max f2,0.333183,0.92914,206.0
2,max f0point5,0.538282,0.936441,148.0
3,max accuracy,0.458273,0.940104,171.0
4,max precision,0.985922,1.0,0.0
5,max recall,0.104561,1.0,309.0
6,max specificity,0.985922,1.0,0.0
7,max absolute_mcc,0.458273,0.86739,171.0
8,max min_per_class_accuracy,0.412628,0.929104,185.0
9,max mean_per_class_accuracy,0.412628,0.931552,185.0



Gains/Lift Table: Avg response rate: 34.90 %, avg score: 34.90 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010417,0.974314,2.865672,2.865672,1.0,0.979208,1.0,0.979208,0.029851,0.029851,186.567164,186.567164
1,,2,0.020833,0.964821,2.865672,2.865672,1.0,0.969928,1.0,0.974568,0.029851,0.059701,186.567164,186.567164
2,,3,0.03125,0.955925,2.865672,2.865672,1.0,0.960038,1.0,0.969725,0.029851,0.089552,186.567164,186.567164
3,,4,0.040365,0.952782,2.865672,2.865672,1.0,0.954008,1.0,0.966176,0.026119,0.115672,186.567164,186.567164
4,,5,0.050781,0.941174,2.865672,2.865672,1.0,0.946141,1.0,0.962066,0.029851,0.145522,186.567164,186.567164
5,,6,0.10026,0.897921,2.865672,2.865672,1.0,0.919839,1.0,0.941227,0.141791,0.287313,186.567164,186.567164
6,,7,0.151042,0.835725,2.865672,2.865672,1.0,0.86911,1.0,0.916981,0.145522,0.432836,186.567164,186.567164
7,,8,0.200521,0.752943,2.865672,2.865672,1.0,0.789735,1.0,0.885582,0.141791,0.574627,186.567164,186.567164
8,,9,0.300781,0.532979,2.56794,2.766428,0.896104,0.648036,0.965368,0.8064,0.257463,0.83209,156.793952,176.64276
9,,10,0.39974,0.345884,1.206599,2.380281,0.421053,0.436178,0.830619,0.714749,0.119403,0.951493,20.659859,138.028101







In [22]:
#Mostrando la matrix de confusion para estimar la precision out-of-bag y por validacion crizada
gbm1.confusion_matrix

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_covType_v1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,100.0,100.0,19367.0,4.0,4.0,4.0,5.0,16.0,10.74




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.06189488480582112
RMSE: 0.24878682602947674
LogLoss: 0.2278218932658851
Mean Per-Class Error: 0.06844776119402984
AUC: 0.9836082089552238
pr_auc: 0.9663482460483861
Gini: 0.9672164179104477

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4582730550290878: 


Unnamed: 0,Unnamed: 1,1,2,Error,Rate
0,1,482.0,18.0,0.036,(18.0/500.0)
1,2,28.0,240.0,0.1045,(28.0/268.0)
2,Total,510.0,258.0,0.0599,(46.0/768.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.458273,0.912548,171.0
1,max f2,0.333183,0.92914,206.0
2,max f0point5,0.538282,0.936441,148.0
3,max accuracy,0.458273,0.940104,171.0
4,max precision,0.985922,1.0,0.0
5,max recall,0.104561,1.0,309.0
6,max specificity,0.985922,1.0,0.0
7,max absolute_mcc,0.458273,0.86739,171.0
8,max min_per_class_accuracy,0.412628,0.929104,185.0
9,max mean_per_class_accuracy,0.412628,0.931552,185.0



Gains/Lift Table: Avg response rate: 34.90 %, avg score: 34.90 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010417,0.974314,2.865672,2.865672,1.0,0.979208,1.0,0.979208,0.029851,0.029851,186.567164,186.567164
1,,2,0.020833,0.964821,2.865672,2.865672,1.0,0.969928,1.0,0.974568,0.029851,0.059701,186.567164,186.567164
2,,3,0.03125,0.955925,2.865672,2.865672,1.0,0.960038,1.0,0.969725,0.029851,0.089552,186.567164,186.567164
3,,4,0.040365,0.952782,2.865672,2.865672,1.0,0.954008,1.0,0.966176,0.026119,0.115672,186.567164,186.567164
4,,5,0.050781,0.941174,2.865672,2.865672,1.0,0.946141,1.0,0.962066,0.029851,0.145522,186.567164,186.567164
5,,6,0.10026,0.897921,2.865672,2.865672,1.0,0.919839,1.0,0.941227,0.141791,0.287313,186.567164,186.567164
6,,7,0.151042,0.835725,2.865672,2.865672,1.0,0.86911,1.0,0.916981,0.145522,0.432836,186.567164,186.567164
7,,8,0.200521,0.752943,2.865672,2.865672,1.0,0.789735,1.0,0.885582,0.141791,0.574627,186.567164,186.567164
8,,9,0.300781,0.532979,2.56794,2.766428,0.896104,0.648036,0.965368,0.8064,0.257463,0.83209,156.793952,176.64276
9,,10,0.39974,0.345884,1.206599,2.380281,0.421053,0.436178,0.830619,0.714749,0.119403,0.951493,20.659859,138.028101




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.17092323346297955
RMSE: 0.4134286316439387
LogLoss: 0.5301717811615181
Mean Per-Class Error: 0.25068656716417914
AUC: 0.8136343283582089
pr_auc: 0.6697367443180624
Gini: 0.6272686567164178

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.20111566420593152: 


Unnamed: 0,Unnamed: 1,1,2,Error,Rate
0,1,337.0,163.0,0.326,(163.0/500.0)
1,2,47.0,221.0,0.1754,(47.0/268.0)
2,Total,384.0,384.0,0.2734,(210.0/768.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.201116,0.677914,250.0
1,max f2,0.082691,0.784866,321.0
2,max f0point5,0.58949,0.664498,129.0
3,max accuracy,0.58949,0.760417,129.0
4,max precision,0.987617,1.0,0.0
5,max recall,0.00701,1.0,395.0
6,max specificity,0.987617,1.0,0.0
7,max absolute_mcc,0.201116,0.475332,250.0
8,max min_per_class_accuracy,0.313324,0.738,212.0
9,max mean_per_class_accuracy,0.201116,0.749313,250.0



Gains/Lift Table: Avg response rate: 34.90 %, avg score: 34.20 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010417,0.974809,2.149254,2.149254,0.75,0.981673,0.75,0.981673,0.022388,0.022388,114.925373,114.925373
1,,2,0.020833,0.9617,2.865672,2.507463,1.0,0.970081,0.875,0.975877,0.029851,0.052239,186.567164,150.746269
2,,3,0.03125,0.949709,1.432836,2.149254,0.5,0.956059,0.75,0.969271,0.014925,0.067164,43.283582,114.925373
3,,4,0.040365,0.939627,1.637527,2.033702,0.571429,0.94492,0.709677,0.963773,0.014925,0.08209,63.752665,103.370246
4,,5,0.050781,0.93009,2.149254,2.057405,0.75,0.934818,0.717949,0.957833,0.022388,0.104478,114.925373,105.740528
5,,6,0.10026,0.875401,2.413197,2.232991,0.842105,0.905597,0.779221,0.932054,0.119403,0.223881,141.319717,123.299089
6,,7,0.151042,0.79798,2.057405,2.173958,0.717949,0.834062,0.758621,0.899109,0.104478,0.328358,105.740528,117.39578
7,,8,0.200521,0.702292,1.88531,2.102733,0.657895,0.748364,0.733766,0.861912,0.093284,0.421642,88.531029,110.273309
8,,9,0.300781,0.531594,1.60031,1.935259,0.558442,0.606871,0.675325,0.776898,0.160448,0.58209,60.031014,93.525877
9,,10,0.39974,0.357655,1.282011,1.773543,0.447368,0.446075,0.618893,0.695001,0.126866,0.708955,28.2011,77.354271




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
0,accuracy,0.75271475,0.060749862,0.7051282,0.78571427,0.7804878,0.7468355,0.74025977,0.835443,0.6944444,0.82051283,0.6375,0.7808219
1,auc,0.81449807,0.045645393,0.76944447,0.848,0.82758623,0.8136054,0.80921984,0.86907023,0.7517361,0.85597825,0.7438187,0.8565217
2,err,0.24728523,0.060749862,0.2948718,0.21428572,0.2195122,0.25316456,0.25974026,0.16455697,0.30555555,0.17948718,0.3625,0.21917808
3,err_count,19.0,4.8762465,23.0,15.0,18.0,20.0,20.0,13.0,22.0,14.0,29.0,16.0
4,f0point5,0.64360476,0.065844126,0.6182796,0.6832298,0.6804734,0.65789473,0.6553398,0.61946905,0.5625,0.77380955,0.5387931,0.64625853
5,f1,0.7003305,0.051058408,0.6666667,0.7457627,0.71875,0.71428573,0.7297297,0.68292683,0.62068963,0.7878788,0.6329114,0.7037037
6,f2,0.77050513,0.040804323,0.7232704,0.8208955,0.7615894,0.78125,0.8231707,0.76086956,0.6923077,0.80246913,0.76687115,0.7723577
7,lift_top_group,2.4539034,1.4199145,2.6,2.8,2.8275862,2.6333334,0.0,4.647059,3.0,0.0,2.857143,3.173913
8,logloss,0.5298222,0.082563154,0.6042044,0.48730385,0.49563545,0.5666018,0.5579652,0.37005785,0.5975564,0.5149542,0.6504122,0.4535302
9,max_per_class_error,0.29315436,0.09640185,0.33333334,0.26666668,0.2264151,0.30612245,0.3617021,0.1764706,0.33333334,0.1875,0.5,0.24



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-04-06 16:20:30,6.466 sec,0.0,0.476641,0.646799,0.5,0.0,1.0,0.651042
1,,2020-04-06 16:20:30,6.484 sec,1.0,0.458367,0.609445,0.870556,0.584042,2.626866,0.21875
2,,2020-04-06 16:20:30,6.492 sec,2.0,0.443037,0.579248,0.875873,0.611844,2.703464,0.213542
3,,2020-04-06 16:20:30,6.504 sec,3.0,0.430069,0.55421,0.882642,0.61608,2.703464,0.208333
4,,2020-04-06 16:20:30,6.508 sec,4.0,0.419352,0.533754,0.883937,0.642541,2.800543,0.205729
5,,2020-04-06 16:20:30,6.516 sec,5.0,0.409897,0.515609,0.886638,0.682676,2.865672,0.217448
6,,2020-04-06 16:20:30,6.524 sec,6.0,0.400978,0.498875,0.894974,0.720901,2.865672,0.1875
7,,2020-04-06 16:20:30,6.533 sec,7.0,0.393181,0.484229,0.900034,0.730095,2.865672,0.184896
8,,2020-04-06 16:20:30,6.538 sec,8.0,0.38614,0.470769,0.903716,0.745706,2.865672,0.1875
9,,2020-04-06 16:20:30,6.545 sec,9.0,0.37984,0.458879,0.906813,0.770021,2.865672,0.171875



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C2,253.061905,1.0,0.401102
1,C6,133.097443,0.525948,0.210959
2,C8,78.79007,0.311347,0.124882
3,C7,65.523659,0.258923,0.103855
4,C1,35.47662,0.140189,0.05623
5,C5,30.243166,0.119509,0.047935
6,C3,22.35951,0.088356,0.03544
7,C4,12.364817,0.048861,0.019598


<bound method H2OBinomialModel.confusion_matrix of >

### V. Bagging  using Decision Trees for Landsat (scikit-learn)

In [21]:
url='http://academic.uprm.edu/eacuna/landsat.txt'
data = pd.read_table(url, header=None,delim_whitespace=True)
y=data.iloc[:,36]-1
names=['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13',
            'C14','C15','C16','C17','C18','C19','C20','C21','C22','C23','C24','C25','C26','C27',
           'C28','C29', 'C30','C31','C32','C33','C34','C35','C36','C37']
X=data.iloc[:,0:36]
modeltree = tree.DecisionTreeClassifier()
bagging = BaggingClassifier(modeltree,n_estimators=100, max_features=1.0)
# Tasa de precision
bagging.fit(X, y)
predictions = bagging.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       479
           2       1.00      1.00      1.00       961
           3       1.00      1.00      1.00       415
           4       1.00      1.00      1.00       470
           5       1.00      1.00      1.00      1038

    accuracy                           1.00      4435
   macro avg       1.00      1.00      1.00      4435
weighted avg       1.00      1.00      1.00      4435



In [19]:
#Accuracy by resubstitution
kfold = model_selection.KFold(n_splits=10, random_state=99)
results = model_selection.cross_val_score(modeltree, X, y, cv=kfold)
print(results.mean())

0.8027159213389462


In [20]:
#accuracy by out-of-bag
bagging1 = BaggingClassifier(modeltree,n_estimators=50, oob_score=True)
bagging1.fit(X, y)
bagging1.oob_score_

0.9028184892897407

### VI. AdaBoosting for Landsat

In [21]:
adaboost = AdaBoostClassifier(modeltree,n_estimators=100,learning_rate=1)
adaboost.fit(X, y)
predictions = adaboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       479
           2       1.00      1.00      1.00       961
           3       1.00      1.00      1.00       415
           4       1.00      1.00      1.00       470
           5       1.00      1.00      1.00      1038

    accuracy                           1.00      4435
   macro avg       1.00      1.00      1.00      4435
weighted avg       1.00      1.00      1.00      4435



In [22]:
#accuracy by cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(adaboost, X, y, cv=kfold)
print(results.mean())

0.8049717324548024


In [23]:
gboost = GradientBoostingClassifier(n_estimators=100)
#X_train, X_train_lr, y_train, y_train_lr = train_test_split(X,y,test_size=0.5)
gboost.fit(X, y)
predictions = gboost.predict(X)
print(classification_report(y, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00       479
           2       0.96      0.99      0.98       961
           3       0.96      0.87      0.92       415
           4       0.99      0.99      0.99       470
           5       0.98      0.98      0.98      1038

    accuracy                           0.98      4435
   macro avg       0.98      0.97      0.98      4435
weighted avg       0.98      0.98      0.98      4435



In [24]:
#Estimating the accueacy bt cross-validation
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(gboost, X, y, cv=kfold)
print(results.mean())

0.8647362373660343


### VII. Gradient Boostimg for Landsat using H2o

In [25]:
#Leyendo los datos
datos= h2o.import_file("http://academic.uprm.edu/eacuna/landsat.txt")
myx=['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13',
            'C14','C15','C16','C17','C18','C19','C20','C21','C22','C23','C24','C25','C26','C27',
           'C28','C29', 'C30','C31','C32','C33','C34','C35','C36']
datos['C37']=datos['C37'].asfactor()
myy="C37"
gbm2 = H2OGradientBoostingEstimator(model_id="gbm_covType_v1",ntrees = 100, max_depth=4,nfolds=10, sample_rate = 1,col_sample_rate = 1,seed=20000)
gbm2.train(myx, myy, training_frame=datos)
y_pred=gbm2.predict(datos)
print((y_pred['predict']==datos['C37']).mean())

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
[0.9950394588500564]


In [26]:
#Mostrando la matrix de confusion para estimar la precision out-of-bag y por validacion crizada
gbm2.confusion_matrix

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_covType_v1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,100.0,600.0,135530.0,4.0,4.0,4.0,7.0,16.0,13.368333




ModelMetricsMultinomial: gbm
** Reported on train data. **

MSE: 0.007986146376090133
RMSE: 0.08936524143138726
LogLoss: 0.04483049751656768
Mean Per-Class Error: 0.007389093948046522

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,1,2,3,4,5,6,Error,Rate
0,1072.0,0.0,0.0,0.0,0.0,0.0,0.0,"0 / 1,072"
1,0.0,479.0,0.0,0.0,0.0,0.0,0.0,0 / 479
2,0.0,0.0,961.0,0.0,0.0,0.0,0.0,0 / 961
3,0.0,0.0,7.0,399.0,0.0,9.0,0.038554,16 / 415
4,0.0,0.0,0.0,0.0,470.0,0.0,0.0,0 / 470
5,0.0,0.0,4.0,2.0,0.0,1032.0,0.00578,"6 / 1,038"
6,1072.0,479.0,972.0,401.0,470.0,1041.0,0.004961,"22 / 4,435"



Top-6 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.995039
1,2,0.999549
2,3,1.0
3,4,1.0
4,5,1.0
5,6,1.0



ModelMetricsMultinomial: gbm
** Reported on cross-validation data. **

MSE: 0.0720374623883169
RMSE: 0.26839795526105803
LogLoss: 0.24269901109645906
Mean Per-Class Error: 0.11790647672150929

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,1,2,3,4,5,6,Error,Rate
0,1046.0,2.0,12.0,2.0,9.0,1.0,0.024254,"26 / 1,072"
1,0.0,461.0,2.0,5.0,9.0,2.0,0.037578,18 / 479
2,5.0,1.0,917.0,27.0,0.0,11.0,0.045786,44 / 961
3,2.0,7.0,73.0,260.0,2.0,71.0,0.373494,155 / 415
4,24.0,5.0,1.0,5.0,408.0,27.0,0.131915,62 / 470
5,0.0,1.0,20.0,56.0,21.0,940.0,0.094412,"98 / 1,038"
6,1077.0,477.0,1025.0,355.0,449.0,1052.0,0.090868,"403 / 4,435"



Top-6 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.909132
1,2,0.98354
2,3,0.99752
3,4,0.999098
4,5,0.999549
5,6,1.0



Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
0,accuracy,0.90917534,0.0076079755,0.89726025,0.913486,0.92124104,0.908686,0.90807176,0.9177215,0.90762126,0.9092873,0.8975501,0.91082805
1,err,0.09082468,0.0076079755,0.10273973,0.086513996,0.07875895,0.09131403,0.09192825,0.08227848,0.09237875,0.09071274,0.10244989,0.089171976
2,err_count,40.3,4.164666,45.0,34.0,33.0,41.0,41.0,39.0,40.0,42.0,46.0,42.0
3,logloss,0.24232204,0.035508305,0.26348302,0.21587239,0.20900099,0.29432887,0.20782925,0.21476933,0.2419506,0.29224464,0.27304333,0.21069802
4,max_per_class_error,0.36619616,0.07784504,0.43137255,0.23529412,0.3529412,0.35135135,0.275,0.36585367,0.40425533,0.31707317,0.4390244,0.48979592
5,mean_per_class_accuracy,0.88304317,0.011099036,0.87437147,0.9011727,0.886739,0.88072604,0.8897406,0.8878265,0.8814279,0.8893574,0.8592726,0.8797972
6,mean_per_class_error,0.11695686,0.011099036,0.12562853,0.09882732,0.113260955,0.11927393,0.110259384,0.11217353,0.11857213,0.1106426,0.14072742,0.12020277
7,mse,0.07194362,0.007837235,0.080324434,0.066498086,0.06217672,0.08263468,0.06631684,0.06608171,0.06879122,0.08010282,0.08022784,0.0662818
8,r2,0.97953445,0.0024566425,0.97766924,0.98121756,0.9831359,0.97756714,0.98107064,0.9803757,0.98018867,0.9763857,0.97596335,0.9817706
9,rmse,0.26786903,0.014522345,0.28341565,0.25787222,0.2493526,0.2874625,0.25752056,0.25706363,0.2622808,0.28302443,0.2832452,0.25745252



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error
0,,2019-11-11 12:20:42,17.652 sec,0.0,0.833333,1.791759,0.806313
1,,2019-11-11 12:20:42,17.681 sec,1.0,0.761586,1.441517,0.127847
2,,2019-11-11 12:20:42,17.707 sec,2.0,0.698646,1.215178,0.114994
3,,2019-11-11 12:20:42,17.729 sec,3.0,0.642396,1.048866,0.111387
4,,2019-11-11 12:20:42,17.749 sec,4.0,0.592087,0.919148,0.105975
5,,2019-11-11 12:20:42,17.770 sec,5.0,0.547864,0.816253,0.106426
6,,2019-11-11 12:20:42,17.786 sec,6.0,0.508222,0.729874,0.104622
7,,2019-11-11 12:20:42,17.801 sec,7.0,0.473356,0.657732,0.101466
8,,2019-11-11 12:20:42,17.821 sec,8.0,0.442903,0.59694,0.094701
9,,2019-11-11 12:20:42,17.841 sec,9.0,0.415971,0.544306,0.095378



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,C17,2636.68042,1.0,0.190932
1,C22,1897.120728,0.719511,0.137378
2,C20,1485.08374,0.56324,0.107541
3,C18,1168.854004,0.443305,0.084641
4,C34,1040.686523,0.394696,0.07536
5,C16,600.934082,0.227913,0.043516
6,C10,574.891174,0.218036,0.04163
7,C24,369.264313,0.140049,0.02674
8,C30,342.390656,0.129857,0.024794
9,C33,289.765717,0.109898,0.020983



See the whole table with table.as_data_frame()


<bound method H2OMultinomialModel.confusion_matrix of >