# Boosting Methods

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,auc,roc_curve

### Get Data

In [2]:
df = pd.read_csv('C:/Users/Desktop/python/03 advance/overfitting.csv')
df = df.drop(['Target_Leaderboard','Target_Evaluate'],axis=1)
train = df[df['train']==1]
train = train.drop(['case_id','train'],axis=1)
test = df[df['train']==0]
test = test.drop(['case_id','train'],axis=1)
y_train = train['Target_Practice']
y_test = test['Target_Practice']
x_train = train.drop(['Target_Practice'],axis=1)
x_test = test.drop(['Target_Practice'],axis=1)

### ADA Boost

In [3]:
ada=AdaBoostClassifier()
paramgrid = {'n_estimators': [2000], 'learning_rate': [0.001,0.01,0.1,1]}
grid = GridSearchCV(ada,paramgrid,refit=True,n_jobs=-1,cv=10)
grid.fit(x_train,y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [2000], 'learning_rate': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [4]:
print(grid.best_params_)
print(grid.best_estimator_)

{'learning_rate': 1, 'n_estimators': 2000}
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=2000, random_state=None)


In [5]:
predictions = grid.predict(x_test)
predictions

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

### Evaluation

In [6]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.69      0.67      0.68      9909
          1       0.68      0.69      0.68      9841

avg / total       0.68      0.68      0.68     19750



In [7]:
print('accuracy %s' % accuracy_score(y_test, predictions))

accuracy 0.682126582278


In [8]:
print('confusion matrix\n %s' % confusion_matrix(y_test, predictions))

confusion matrix
 [[6667 3242]
 [3036 6805]]


### GBM

In [9]:
grd = GradientBoostingClassifier()
paramgrid= {'n_estimators':[25], 'loss':['deviance','exponential'], 'learning_rate':[0.1,1], 
            'max_features':['auto','log2'], 'max_depth':[3,7,10,25,50,100]}
grid  = GridSearchCV(grd,paramgrid,n_jobs=-1,refit=True,verbose=2,cv=5)
grid.fit(x_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 156 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   18.9s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [25], 'loss': ['deviance', 'exponential'], 'learning_rate': [0.1, 1], 'max_features': ['auto', 'log2'], 'max_depth': [3, 7, 10, 25, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [10]:
print(grid.best_params_)
print(grid.best_estimator_)

{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': 'log2', 'n_estimators': 25}
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='log2', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=25,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)


In [11]:
predictions = grid.predict(x_test)
predictions

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

### Evaluation

In [12]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.60      0.51      0.55      9909
          1       0.57      0.65      0.61      9841

avg / total       0.58      0.58      0.58     19750



In [13]:
print('accuracy %s' % accuracy_score(y_test, predictions))

accuracy 0.580759493671


In [14]:
print('confusion matrix\n %s' % confusion_matrix(y_test, predictions))

confusion matrix
 [[5058 4851]
 [3429 6412]]
