### ESMA 4016
### Ensembles de Arboles con  scikit-learn
#### Edgar Acuna

In [1]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

### I. Bagging para Diabetes usando scikit learn

In [3]:
url= "http://academic.uprm.edu/eacuna/diabetes.dat"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_table(url, names=names,header=None)
#La variable de respuesta y debe ser binaria (0,1)
y=data['class']-1
X=data.iloc[:,0:8]
modeltree = tree.DecisionTreeClassifier(max_depth=3)
bagging = BaggingClassifier(modeltree,n_estimators=100, max_features=1.0)

In [4]:
# Tasa de precision
bagging.fit(X, y)
predictions = bagging.predict(X)
print(classification_report(y, predictions))

             precision    recall  f1-score   support

          0       0.79      0.91      0.85       500
          1       0.77      0.56      0.65       268

avg / total       0.78      0.79      0.78       768



In [5]:
#Estimando la precision por validacion cruzada
kfold = model_selection.KFold(n_splits=10, random_state=99)
results = model_selection.cross_val_score(bagging, X, y, cv=kfold)
print(results.mean())

0.756527682843


### II. AdaBoosting para Diabetes usando scikit-learn

In [6]:
adaboost = AdaBoostClassifier(modeltree,n_estimators=100,learning_rate=1)
adaboost.fit(X, y)
predictions = adaboost.predict(X)
print(classification_report(y, predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       500
          1       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768



In [7]:
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(adaboost, X, y, cv=kfold)
print(results.mean())

0.7252734108


### III. Gradient Boosting para Diabetes usando scikit-learn

In [8]:
gboost = GradientBoostingClassifier(n_estimators=100)
#X_train, X_train_lr, y_train, y_train_lr = train_test_split(X,y,test_size=0.5)
gboost.fit(X, y)
predictions = gboost.predict(X)
print(classification_report(y, predictions))

             precision    recall  f1-score   support

          0       0.90      0.96      0.93       500
          1       0.91      0.81      0.86       268

avg / total       0.91      0.91      0.90       768



In [9]:
kfold = model_selection.KFold(n_splits=10, random_state=999)
results = model_selection.cross_val_score(gboost, X, y, cv=kfold)
print(results.mean())

0.766900205058


### IV. Decision Trees para Landsat usando scikit-learn 

In [10]:
url='http://academic.uprm.edu/eacuna/landsat.txt'
data = pd.read_table(url, header=None,delim_whitespace=True)
y=data.iloc[:,36]-1
names=['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13',
            'C14','C15','C16','C17','C18','C19','C20','C21','C22','C23','C24','C25','C26','C27',
           'C28','C29', 'C30','C31','C32','C33','C34','C35','C36','C37']
X=data.iloc[:,0:36]
modeltree = tree.DecisionTreeClassifier(max_depth=3)
bagging = BaggingClassifier(modeltree,n_estimators=100, max_features=1.0)
# Tasa de precision
bagging.fit(X, y)
predictions = bagging.predict(X)
print(classification_report(y, predictions))

             precision    recall  f1-score   support

          0       0.71      0.93      0.81      1072
          1       0.84      0.91      0.87       479
          2       0.87      0.93      0.90       961
          3       0.68      0.13      0.22       415
          4       0.96      0.47      0.64       470
          5       0.76      0.86      0.81      1038

avg / total       0.80      0.79      0.76      4435



In [11]:
kfold = model_selection.KFold(n_splits=10, random_state=99)
results = model_selection.cross_val_score(modeltree, X, y, cv=kfold)
print(results.mean())

0.774498200232
