### Data Mining and Machine Learning
### Decision Trees using scikit-learn and H2O
#### Edgar Acuna
#### November 2021
#### Datasets:  Diabetes and Landsat

In [35]:
import warnings

import graphviz
import h2o
import numpy as np
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from sklearn import metrics
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas / scikit-learn / h2o; consider narrowing it.
warnings.filterwarnings('ignore')
# Alternatives the author left disabled:
#h2o.connect()
#h2o.no_progress()
# Attach to the H2O instance at localhost:54323.
# NOTE(review): presumably h2o.init also tries to launch a local server when none
# is found - confirm. The H2O sections further down need this call to succeed.
h2o.init(ip="localhost", port=54323)

Checking whether there is an H2O instance running at http://localhost:54323 ..... not found.


H2OServerError: Cluster reports unhealthy status

### I. Decision Trees for Diabetes using scikit-learn

In [None]:
# Load the diabetes data set (8 predictors + class label) straight from the course site.
url = "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names, header=None)

# The response must be binary (0, 1), so the stored label is shifted down by one.
y = data['class'] - 1
# The first eight columns are the predictors.
X = data.iloc[:, 0:8]

# Depth-3 tree fitted on the full data set.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run (scikit-learn permutes the features at each split).
modeltree = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
modeltree = modeltree.fit(X, y)

In [None]:
# Accuracy of the fitted tree on the same data it was trained on
modeltree.score(X=X, y=y)

Estimating the accuracy using cross validation

In [None]:
# 10-fold cross-validated accuracy of the depth-3 tree.
# cross_val_score is imported together with the other dependencies in the first cell.
scores = cross_val_score(modeltree, X, y, cv=10)
# Show the per-fold accuracies explicitly: a bare `scores` in the middle of a
# cell is evaluated and discarded, so nothing was displayed before.
print("Fold accuracies:", scores)
# Mean accuracy together with a +/- 2 standard deviation band across the folds
print("CV Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Per-class precision / recall / F1 of the tree, evaluated on the data in X
predictions = modeltree.predict(X)
report = classification_report(y_true=y, y_pred=predictions)
print(report)

In [None]:
# Hold-out estimate of the decision tree's accuracy: 70% of the rows are used
# for training and the remaining 30% for testing (split fixed by random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Refit the depth-3 tree on the training part only.
# NOTE: this rebinds `modeltree`, so the cells below work with this model and
# no longer with the tree fitted on the full data set.
modeltree = tree.DecisionTreeClassifier(max_depth=3, random_state=0)
modeltree = modeltree.fit(X_train, y_train)

# Accuracy on the held-out rows - the estimate this cell is meant to produce
modeltree.score(X_test, y_test)


In [None]:
# Fraction of test rows whose largest predicted class probability exceeds 0.90
prob3 = pd.DataFrame(modeltree.predict_proba(X_test))
a = prob3.max(axis="columns")
n_confident = len(a[a > .90])
print('Probability of classification', n_confident / len(prob3))

In [None]:
#import os
#os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

In [None]:
# Draw the current `modeltree` with graphviz.
# Only the first eight names are predictors, so the class column name is dropped.
names = names[:8]
clases = ['0', '1']
dot_data = tree.export_graphviz(
    modeltree,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=names,
    class_names=clases,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

### Parameters to control the size of the tree

In [None]:
# Control the tree size by requiring at least 40 training samples in every leaf.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run.
modeltree1 = tree.DecisionTreeClassifier(min_samples_leaf=40, random_state=0)
modeltree1 = modeltree1.fit(X, y)
# Accuracy on the training data
modeltree1.score(X, y)

In [None]:
# Draw the min_samples_leaf tree (modeltree1) with graphviz
clases = ['0', '1']
dot_data = tree.export_graphviz(
    modeltree1,
    out_file=None,  # return the DOT source as a string
    feature_names=names[:8],
    class_names=clases,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph1 = graphviz.Source(dot_data)
graph1

In [None]:
# Control the tree size by only splitting nodes that hold at least 100 samples.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run.
modeltree2 = tree.DecisionTreeClassifier(min_samples_split=100, random_state=0)
modeltree2 = modeltree2.fit(X, y)
# Accuracy on the training data
modeltree2.score(X, y)

In [None]:
# Draw the min_samples_split tree (modeltree2) with graphviz
clases = ['0', '1']
dot_data = tree.export_graphviz(
    modeltree2,
    out_file=None,  # return the DOT source as a string
    feature_names=names[:8],
    class_names=clases,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph2 = graphviz.Source(dot_data)
graph2

In [None]:
# Control the tree size by allowing at most 5 leaves.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run.
modeltree3 = tree.DecisionTreeClassifier(max_leaf_nodes=5, random_state=0)
modeltree3 = modeltree3.fit(X, y)
# Accuracy on the training data
modeltree3.score(X, y)

In [None]:
# Draw the max_leaf_nodes tree (modeltree3) with graphviz
clases = ['0', '1']
dot_data = tree.export_graphviz(
    modeltree3,
    out_file=None,  # return the DOT source as a string
    feature_names=names[:8],
    class_names=clases,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph3 = graphviz.Source(dot_data)
graph3

In [None]:
# Control the tree size by only accepting splits that reduce the impurity by
# at least 0.001.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run (and in the cross-validation of this estimator).
modeltree4 = tree.DecisionTreeClassifier(min_impurity_decrease=.001, random_state=0)
modeltree4 = modeltree4.fit(X, y)
# Accuracy on the training data
modeltree4.score(X, y)

Estimating the accuracy by cross validation

In [None]:
# 10-fold cross-validated accuracy of the min_impurity_decrease tree
scores = cross_val_score(modeltree4, X, y, cv=10)
# Show the per-fold accuracies explicitly: a bare `scores` in the middle of a
# cell is evaluated and discarded, so nothing was displayed before.
print("Fold accuracies:", scores)
# Mean accuracy together with a +/- 2 standard deviation band across the folds
print("CV Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Draw the min_impurity_decrease tree (modeltree4) with graphviz
clases = ['0', '1']
dot_data = tree.export_graphviz(
    modeltree4,
    out_file=None,  # return the DOT source as a string
    feature_names=names[:8],
    class_names=clases,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph4 = graphviz.Source(dot_data)
graph4

### II. Applying Decision Trees to Diabetes using the Random Forest of H2O. A decision tree is obtained as a random forest with a single tree (ntrees=1).

In [None]:
# Load the diabetes data into an H2O frame; the code below addresses its
# columns as C1..C9, with C9 holding the class label.
diabetes = h2o.import_file("https://academic.uprm.edu/eacuna/diabetes.dat")
myx = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8']
# The response has to be a factor so that H2O treats the task as classification.
diabetes['C9'] = diabetes['C9'].asfactor()
myy = "C9"

# No train/test split here: the model is fitted and evaluated on the full frame.
# A "random forest" with one tree, all 8 predictors tried at every split and no
# row subsampling is used as a single decision tree of depth 3.
# The seed matches the one used for the GBM model so reruns are reproducible.
model = H2ORandomForestEstimator(ntrees=1, mtries=8, max_depth=3, sample_rate=1,
                                 seed=2000000)
model.train(myx, myy, training_frame=diabetes)
model.model_performance(diabetes)

In [None]:
# Training accuracy: share of rows whose predicted label equals the C9 label
y_pred = model.predict(diabetes)
n_correct = (y_pred['predict'] == diabetes['C9']).sum()
acc = n_correct / float(len(diabetes))
print("The accuracy is:", acc)

### III. Applying Decision Trees to Diabetes using Gradient Boosting of H2O. Boosting is applied with a single tree.

In [None]:
# Gradient boosting restricted to one depth-3 tree, using every row and every
# column, with a fixed seed.
gbm1 = H2OGradientBoostingEstimator(
    model_id="gbm_covType_v1",
    ntrees=1,
    max_depth=3,
    sample_rate=1,
    col_sample_rate=1,
    seed=2000000,
)
gbm1.train(myx, myy, training_frame=diabetes)
gbm1.model_performance(diabetes)


In [None]:
# Training accuracy of the single-tree GBM: share of rows predicted correctly
y_pred = gbm1.predict(diabetes)
n_correct = (y_pred['predict'] == diabetes['C9']).sum()
acc1 = n_correct / float(len(diabetes))
print("The accuracy is:", acc1)

### IV. Decision Trees for Landsat using scikit-learn

In [None]:
# Load the Landsat data: whitespace-separated, no header row.
# sep=r"\s+" replaces delim_whitespace=True, which pandas has deprecated.
url = 'http://academic.uprm.edu/eacuna/landsat.txt'
data = pd.read_table(url, header=None, sep=r"\s+")

# Column 36 holds the class label (shifted down by one); columns 0..35 are the predictors.
y = data.iloc[:, 36] - 1
X = data.iloc[:, 0:36]
# Column names C1..C37, generated instead of typed out by hand.
names = ['C%d' % i for i in range(1, 38)]

# Tree allowed to grow up to depth 20; random_state is fixed so that ties
# between equally good splits are broken the same way on every run.
modeltree = tree.DecisionTreeClassifier(max_depth=20, random_state=0)
modeltree = modeltree.fit(X, y)

# Performance on the training data. The report already includes the overall
# accuracy, so the separate score() call, whose result was discarded in the
# middle of the cell, is dropped.
predictions = modeltree.predict(X)
print(classification_report(y, predictions))

In [None]:
# Export the Landsat tree without feature/class labels and render it to disk
# under the base name "landsat2".
dot_data = tree.export_graphviz(decision_tree=modeltree, out_file=None)
graph = graphviz.Source(source=dot_data)
graph.render(filename="landsat2")

In [None]:
# Draw the Landsat tree inline, labelled with the 36 feature names and class names '1'..'6'
clases = [str(label) for label in range(1, 7)]
dot1_data = tree.export_graphviz(
    modeltree,
    out_file=None,  # return the DOT source as a string
    feature_names=names[:36],
    class_names=clases,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot1_data)
graph

### V. Applying Decision trees to a dataset (adult) containing missing values (using h2o)

In [None]:
#Leyendo los datos
datos= h2o.import_file("https://academic.uprm.edu/eacuna/census.csv",na_strings=[' ?'])
myx=['age', 'employment', 'final-weight', 'education', 'education.num',
       'marital.status', 'job', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours.per.week', 'native.country']
#myx=['C1', 'C2', 'C3', 'C4', 'C5','C6','C7','C8','C9','C10','C11', 'C12','C13','C14']
datos['salary']=datos['salary'].asfactor()
myy="salary"
model=H2ORandomForestEstimator(ntrees=1,mtries=14,max_depth=3,sample_rate=1)
model.train(myx, myy, training_frame = datos)
model.model_performance(datos)

In [None]:
# Training accuracy: share of rows whose predicted label equals the salary label
y_pred = model.predict(datos)
n_correct = (y_pred['predict'] == datos['salary']).sum()
acc2 = n_correct / float(len(datos))
print("The accuracy is:", acc2)