## Ada boost classifier


In [2]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

In [38]:
dataset = pd.read_csv('../Data/imputedWQ.csv', parse_dates=True, header=0, index_col=0)

In [39]:
testset = pd.read_csv('../Data/testing2018-mean_imputed.csv', parse_dates=True, header=0, index_col=0)

In [40]:
cols = list(dataset.columns.values)
cols = cols[:-1]


In [41]:
dataset[cols] = dataset[cols].rolling(window=60).mean()


In [42]:
dataset = dataset.iloc[60:]
dataset.head(3)

Unnamed: 0_level_0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-08-03 10:49:00,6.528333,0.172333,8.364833,748.816667,209.95,7.120333,107.56,1593.616667,694.766667,False
2016-08-03 10:50:00,6.528333,0.172333,8.364833,748.816667,209.85,7.120333,107.56,1596.466667,694.766667,False
2016-08-03 10:51:00,6.528333,0.172333,8.365,748.816667,209.783333,7.120333,107.576667,1598.216667,694.75,False


In [7]:
X = dataset.drop('EVENT', axis=1)  
y = dataset['EVENT']

# TimeSeriesSplit
## Special case of KFold
### Param: n_splits according to the number of months

P.s: even though this probably would not result in accurate month separation, probably would go more with accurate samples separation

In [8]:
tss = TimeSeriesSplit(n_splits=4)
# kn = KNeighborsClassifier(n_neighbors=5, algorithm='brute') 
# sc = cross_val_score(kn, X, y, cv=tss, scoring='accuracy')

In [9]:
dataset.groupby(dataset.index.month).sum()

Unnamed: 0_level_0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8,290732.052632,6800.553362,343476.07345,30957640.0,8663815.0,787073.749453,4009076.0,67379850.0,38864030.0,165.0
9,353958.6,7172.786251,361367.899324,32520370.0,9111787.0,865114.186451,4396318.0,68585810.0,40559680.0,1310.0
10,431435.230435,7384.774348,374076.653977,33640910.0,9410055.0,710264.740783,4398464.0,62995760.0,40471690.0,174.0
11,112341.8,1736.24,88673.05,7957010.0,2056437.0,144198.66,991339.3,15338890.0,9644805.0,77.0


In [10]:
testset.groupby(testset.index.month).sum()

Unnamed: 0_level_0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,168946.9,6486.07,367918.485157,34336360.0,10300730.0,810.50142,4804.052413,65217440.0,45054300.0,773.0
2,65273.9,2560.33,144423.58,13631040.0,3474466.0,286.97,1847.039119,26277780.0,18979850.0,895.0
11,194088.625698,3601.81407,187980.06598,17174080.0,4600109.0,450.479843,2479.73187,32756350.0,20559810.0,352.0
12,180976.84273,4957.260529,274366.671574,25411000.0,7547389.0,669.072158,3490.739109,47059280.0,27860140.0,309.0


## TimeSeries CV

- Applying TimeSeriesSplit in cross validation with multiple scores
    - F1-macro (alter. f1_weighted, f1_micro)
    - Precision
    - Recall
    
P.s errors result because the classifier on specific folds couldn't identify at all one class and probably gave just true or false ( I guess false in this case because of high class-imbalance)

In [9]:
scoring = ['f1', 'f1_macro', 'f1_micro', 'recall','precision']

In [17]:
seed=7
num_trees = 50

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=9), algorithm="SAMME", n_estimators=num_trees, random_state=seed)
results = cross_validate(ada_clf, X, y, cv=tss, scoring=scoring, n_jobs=-1)

In [18]:
print("Model(RF, 2): ({0}, {1}, {2}, {3},{4})\n".format(results['test_f1'].mean(),results['test_f1_micro'].mean(), results['test_f1_macro'].mean(), results['test_precision'].mean(), results['test_recall'].mean()))

Model(RF, 2): (0.32812547058422936, 0.9862872599598739, 0.6605756313405685, 0.47525354320632174,0.30974253623250037)



In [21]:
ada_clf.fit(X, y)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=7)

In [23]:
y_pred = ada_clf.predict(X)

In [24]:
f1_score(y_pred, y)
#f1_score(y_pred, y, average='macro')

0.9991316931982634

## GridSearchCV
### Hyperparameter optimizations for Adaboost classifier

- Scoring F1_macro can be changed to precision, recall or f1_weighted or f1_micro and possibly you would get different parameters.

In [3]:
np.arange(10,300,10)

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290])

In [25]:
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': np.arange(10,300,10),
              'base_estimator': [DecisionTreeClassifier(max_depth=3), DecisionTreeClassifier(max_depth=4), DecisionTreeClassifier(max_depth=5)]}
grid = GridSearchCV(AdaBoostClassifier(), param_grid, cv=tss, scoring="f1", n_jobs=-1)

In [26]:
grid.fit(X,y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=4),
       error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290]), 'base_estimator': [DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            ...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [27]:
grid.best_params_

{'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'), 'n_estimators': 20}

In [28]:
model = grid.best_estimator_

In [43]:
results = cross_validate(model, X, y, cv=tss, scoring=scoring, n_jobs=-1)

In [44]:
print("Model(adab, 2, 20): ({0}, {1}, {2}, {3})\n".format(results['test_f1_macro'].mean(),results['test_f1'].mean(), results['test_f1_micro'].mean(),results['test_recall'].mean(),results['test_precision'].mean()))

Model(adab, 2, 20): (0.6284595899226475, 0.2636638560381369, 0.9867530094582975, 0.21907385291058287)



## Testing on Test set(unseen)

In [56]:
cols = list(testset.columns.values)
cols = cols[:-1]
testset[cols] = testset[cols].rolling(window=60).mean()
testset = testset.iloc[60:]

In [57]:
X_test = testset.drop('EVENT', axis=1)  
y_test = testset['EVENT']  

In [58]:
predicted = model.predict(X_test)
predicted

array([False, False, False, ..., False, False, False])

In [59]:
f1_score(y_test,predicted)

0.14011299435028246

In [60]:
confusion_matrix(y_test, predicted)

array([[115357,    140],
       [  2143,    186]])

In [61]:
print(f1_score(y_test,predicted))
print(recall_score(y_test,predicted))
print(precision_score(y_test,predicted))

0.14011299435028246
0.0798626019750966
0.5705521472392638
