## Useful links:

https://stats.stackexchange.com/questions/14099/using-k-fold-cross-validation-for-time-series-model-selection

https://robjhyndman.com/hyndsight/crossvalidation/

https://towardsdatascience.com/time-series-nested-cross-validation-76adba623eb9


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_selection import SelectFromModel

In [36]:
dataset = pd.read_csv('../Data/imputedWQ.csv', parse_dates=True, header=0, index_col=0)

In [37]:
testset = pd.read_csv('../Data/testing2018-mean_imputed.csv', parse_dates=True, header=0, index_col=0)

In [38]:
X = dataset.drop('EVENT', axis=1)  
y = dataset['EVENT']  

In [39]:
X.drop([ 'Leit','Cl'], axis=1,inplace=True)
X.head(2)

Unnamed: 0_level_0,Tp,pH,Redox,Trueb,Cl_2,Fm,Fm_2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-08-03 09:49:00,6.5,8.36,749.0,11.0,118.0,1677.0,695.0
2016-08-03 09:50:00,6.5,8.36,749.0,11.0,118.0,1561.0,696.0


In [40]:
# y_pred = svclassifier.predict(X_test)  

In [41]:
# print(classification_report(y_test,y_pred)) 

# TimeSeriesSplit
## Special case of KFold
### Param: n_splits according to the number of months

P.s: even though this probably would not result in accurate month separation, probably would go more with accurate samples separation

In [15]:
tss = TimeSeriesSplit(n_splits=4)
# kn = KNeighborsClassifier(n_neighbors=5, algorithm='brute') 
# sc = cross_val_score(kn, X, y, cv=tss, scoring='accuracy')

In [9]:
dataset.groupby(dataset.index.month).sum()

Unnamed: 0_level_0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8,290766.552632,6801.383362,343517.84945,30961400.0,8664862.0,787160.949453,4009652.0,67388020.0,38868670.0,165.0
9,353961.0,7172.786251,361367.891324,32520380.0,9111793.0,865092.986451,4396304.0,68585020.0,40558880.0,1310.0
10,431439.430435,7384.794348,374076.693977,33640900.0,9410047.0,710263.140783,4398462.0,62995960.0,40471840.0,174.0
11,112340.4,1736.24,88673.13,7957033.0,2056404.0,144211.26,991345.4,15339760.0,9644754.0,77.0


In [10]:
testset.groupby(testset.index.month).count()

Unnamed: 0_level_0,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,44640,44640,44640,44640,44640,44640,44640,44640,44640,44640
2,17641,17641,17641,17641,17641,17641,17641,17641,17641,17641
11,21293,21293,21293,21293,21293,21293,21293,21293,21293,32645
12,31512,31512,31512,31512,31512,31512,31512,31512,31512,44640


## TimeSeries CV

- Applying TimeSeriesSplit in cross validation with multiple scores
    - F1-macro (alter. f1_weighted, f1_micro)
    - Precision
    - Recall
    
P.s errors result because the classifier on specific folds couldn't identify at all one class and probably gave just true or false ( I guess false in this case because of high class-imbalance)

In [16]:
scoring = ['f1', 'f1_macro', 'f1_micro', 'precision', 'recall']

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=200, max_depth=8, random_state=0)

results = cross_validate(rf_clf, X, y, cv=tss, scoring=scoring, n_jobs=-1)

In [18]:
print("Model(RF, 200,8): ({0}, {1}, {2}, {3},{4})\n".format(results['test_f1'].mean(),results['test_f1_micro'].mean() , results['test_f1_macro'].mean(), results['test_precision'].mean(), results['test_recall'].mean()))

Model(RF, 200,8): (0.7719758102661711, 0.9917153297746569, 0.8838664664584284, 0.9147676282051282,0.6912304345239684)



In [19]:
y.unique()

array([False,  True])

In [20]:
#results = cross_validate(svc, X, y, cv=tss, scoring=scoring, n_jobs=-1)

In [21]:
#print("Model(SVC, linear): ({0}, {1}, {2})\n".format(results['test_f1_macro'].mean(), results['test_precision'].mean(), results['test_recall'].mean()))

## GridSearchCV
### Hyperparameter optimizations for RandForest classifier

- Scoring F1_macro can be changed to precision, recall or f1_weighted or f1_micro and possibly you would get different parameters.

In [22]:
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': np.arange( 100, 200),
              'max_depth':np.arange(8,9,20)}

grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=tss, scoring="f1", n_jobs=-1)

In [42]:
grid.fit(X,y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=4),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([100, 101, ..., 198, 199]), 'max_depth': array([8])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [43]:
grid.best_params_

{'max_depth': 8, 'n_estimators': 192}

In [44]:
model = grid.best_estimator_

In [45]:
results = cross_validate(model, X, y, cv=tss, scoring=scoring, n_jobs=-1)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [46]:
print("Model(RF, 2): ({0}, {1}, {2}, {3},{4})\n".format(results['test_f1'].mean(),results['test_f1_micro'].mean() , results['test_f1_macro'].mean(), results['test_precision'].mean(), results['test_recall'].mean()))

Model(RF, 2): (0.33905504108659335, 0.9873535628560168, 0.6663111670369284, 0.5324137931034483,0.36968190982522375)



## Testing on Test set

In [47]:
X_test = testset.drop('EVENT', axis=1)  
y_test = testset['EVENT']  

In [48]:
X_test.drop([ 'Leit','Cl'], axis=1,inplace=True)
X_test.head(2)

Unnamed: 0_level_0,Tp,pH,Redox,Trueb,Cl_2,Fm,Fm_2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-11-08 07:55:00,10.1,8.41,762.0,0.022,0.106,1818.0,920.0
2016-11-08 07:56:00,10.1,8.41,762.0,0.022,0.106,1805.0,927.0


In [49]:
predicted = model.predict(X_test)

In [50]:
predicted 

array([False, False, False, ..., False, False, False])

In [55]:
print(f1_score(y_test,predicted))
print(recall_score(y_test,predicted))
print(precision_score(y_test,predicted))

0.49357326478149105
0.37097466723915845
0.7372013651877133


In [52]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test ,predicted)

array([[115329,    308],
       [  1465,    864]])