## Ada boost classifier


In [101]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

In [102]:
dataset = pd.read_csv('../Data/imputedWQ.csv', parse_dates=True, header=0, index_col=0)

In [103]:
testset = pd.read_csv('../Data/testing2018.csv', parse_dates=True, header=0, index_col=0)

In [104]:
X = dataset.drop('EVENT', axis=1)  
y = dataset['EVENT']  

# TimeSeriesSplit
## Special case of KFold
### Param: n_splits according to the number of months

P.s: even though this probably would not result in accurate month separation, probably would go more with accurate samples separation

In [105]:
tss = TimeSeriesSplit(n_splits=4)
# kn = KNeighborsClassifier(n_neighbors=5, algorithm='brute') 
# sc = cross_val_score(kn, X, y, cv=tss, scoring='accuracy')

In [106]:
dataset.groupby(dataset.index.month).sum()

Unnamed: 0_level_0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8,290766.552632,6801.383362,343517.84945,30961400.0,8664862.0,787160.949453,4009652.0,67388020.0,38868670.0,165.0
9,353961.0,7172.786251,361367.891324,32520380.0,9111793.0,865092.986451,4396304.0,68585020.0,40558880.0,1310.0
10,431439.430435,7384.794348,374076.693977,33640900.0,9410047.0,710263.140783,4398462.0,62995960.0,40471840.0,174.0
11,112340.4,1736.24,88673.13,7957033.0,2056404.0,144211.26,991345.4,15339760.0,9644754.0,77.0


In [107]:
testset.groupby(testset.index.month).sum()

Unnamed: 0_level_0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,168946.9,6486.07,367918.485157,34336360.0,10300730.0,810.50142,4804.052413,65217445.0,45054296.0,773.0
2,65273.9,2560.33,144423.58,13631040.0,3474466.0,286.97,1847.039119,26277783.0,18979849.0,895.0
11,183728.9,3401.380063,177387.328683,16205040.0,4328124.0,425.34816,2341.372,30974768.0,19343072.0,352.0
12,171830.7,4713.325094,260945.894188,24177270.0,7182003.0,634.668,3317.529,44719245.0,26572835.0,309.0


## TimeSeries CV

- Applying TimeSeriesSplit in cross validation with multiple scores
    - F1-macro (alter. f1_weighted, f1_micro)
    - Precision
    - Recall
    
P.s errors result because the classifier on specific folds couldn't identify at all one class and probably gave just true or false ( I guess false in this case because of high class-imbalance)

In [108]:
scoring = ['f1','f1_micro', 'f1_macro','f1_weighted']

In [109]:
seed=7
num_trees = 30
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), algorithm="SAMME", n_estimators=num_trees, random_state=seed)


In [110]:
results = cross_validate(model, X, y, cv=tss, scoring=scoring, n_jobs=-1)
print("Model(AdaB, 2, 30, samme): ({0}, {1}, {2},{3})\n".format( results['test_f1'].mean(), results['test_f1_micro'].mean(), results['test_f1_macro'].mean(),results['test_f1_weighted'].mean()))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Model(AdaB, 2, 30, samme): (0.3117893091473472, 0.9644430910328521, 0.6466000939097765,0.9725488355844484)



In [114]:
model.fit(X, y)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=30, random_state=7)

In [115]:
y_pred = model.predict(X)

In [116]:
f1_score(y_pred, y)
#f1_score(y_pred, y, average='macro')

0.9372716199756396

## GridSearchCV
### Hyperparameter optimizations for Adaboost classifier

- Scoring F1_macro can be changed to precision, recall or f1_weighted or f1_micro and possibly you would get different parameters.

In [117]:
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': np.arange(3,30,100)}
grid = GridSearchCV(AdaBoostClassifier(), param_grid, cv=tss, scoring="f1", n_jobs=-1)

In [118]:
grid.fit(X,y)

  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=4),
       error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([3])}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='f1', verbose=0)

In [119]:
grid.best_params_

{'n_estimators': 3}

In [120]:
model = grid.best_estimator_

In [121]:
results = cross_validate(model, X, y, cv=tss, scoring=scoring, n_jobs=-1)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [122]:
print("Model(adab, 2, 5): ({0}, {1}, {2}, {3})\n".format(results['test_f1_macro'].mean(),results['test_f1'].mean(), results['test_f1_micro'].mean(),results['test_f1_weighted'].mean()))

Model(adab, 2, 5): (0.7449657101462652, 0.4953908266547429, 0.9892881453086375, 0.9860356774436352)



## Testing on Test set(unseen)

In [123]:
testset.fillna(0, inplace=True)

In [124]:
X_test = testset.drop('EVENT', axis=1)  
y_test = testset['EVENT']  

In [125]:
predicted = model.predict(X_test)
predicted

array([False, False, False, ..., False, False, False])

In [126]:
confusion_matrix(y, predicted)

array([[112863,  24977],
       [  1254,    472]])

In [127]:
f1_score(y_test,predicted, average='weighted')

0.8823150525553156