## AdaBoost classifier, compared with Random Forest and SVM


In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import preprocessing

In [2]:
# Load the training data: the first CSV row is the header and the first
# column becomes the index, which pandas is asked to parse as dates.
dataset = pd.read_csv(
    '../Data/imputedWQ.csv',
    index_col=0,
    header=0,
    parse_dates=True,
)

In [3]:
# Load the test data with the same parsing options as the training file:
# header in the first row, first column used as a date-parsed index.
testset = pd.read_csv(
    '../Data/testing2018-mean_imputed.csv',
    index_col=0,
    header=0,
    parse_dates=True,
)

In [4]:
# Separate the training frame into the target column and the remaining features.
y = dataset['EVENT']
X = dataset.drop(labels=['EVENT'], axis='columns')

In [5]:
# Build the training feature matrix without the target and the 'Leit'/'Cl' columns.
# X is rebuilt from `dataset` instead of being mutated with inplace=True, so this
# cell can be re-run safely (an in-place drop raises KeyError the second time
# because the columns are already gone).
X = dataset.drop(['EVENT', 'Leit', 'Cl'], axis=1)
X.head(2)

Unnamed: 0_level_0,Tp,pH,Redox,Trueb,Cl_2,Fm,Fm_2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-08-03 09:49:00,6.5,8.36,749.0,11.0,118.0,1677.0,695.0
2016-08-03 09:50:00,6.5,8.36,749.0,11.0,118.0,1561.0,696.0


## Classifiers

In [6]:
# Separate the test frame into the target column and the remaining features.
y_test = testset['EVENT']
X_test = testset.drop(labels=['EVENT'], axis='columns')

In [7]:
# Build the test feature matrix without the target and the 'Leit'/'Cl' columns.
# X_test is rebuilt from `testset` instead of being mutated with inplace=True, so
# this cell can be re-run safely (an in-place drop raises KeyError the second time).
X_test = testset.drop(['EVENT', 'Leit', 'Cl'], axis=1)

# The models below are fitted on X and applied to X_test, so both frames must
# expose the same feature columns in the same order.
assert list(X_test.columns) == list(X.columns), "train/test feature columns differ"

# NOTE(review): the head() previews show Trueb / Cl_2 as 11.0 / 118.0 in the
# training data but 0.022 / 0.106 here -- confirm both files use the same units.
X_test.head(2)

Unnamed: 0_level_0,Tp,pH,Redox,Trueb,Cl_2,Fm,Fm_2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-11-08 07:55:00,10.1,8.41,762.0,0.022,0.106,1818.0,920.0
2016-11-08 07:56:00,10.1,8.41,762.0,0.022,0.106,1805.0,927.0


### Random Forest

In [8]:
# Random forest baseline: 300 trees of depth 4 with a fixed random seed.
# RandomForestClassifier is already imported in the notebook's import cell,
# so it is not imported again here.
rf_clf = RandomForestClassifier(n_estimators=300, max_depth=4, random_state=0)

# fit() returns the estimator itself, so its repr is displayed as the cell output.
rf_clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
# Class predictions of the fitted random forest on the test features.
predicted = rf_clf.predict(X=X_test)

In [10]:
# Confusion matrix for the random forest (rows: true class, columns: predicted class).
# confusion_matrix is already imported in the notebook's import cell; the keyword
# arguments make the (true, predicted) order explicit.
confusion_matrix(y_true=y_test, y_pred=predicted)

array([[115437,    200],
       [  1597,    732]])

In [11]:
# Print F1, recall and precision (in that order) for the random forest predictions.
for metric in (f1_score, recall_score, precision_score):
    print(metric(y_test, predicted))

0.4489420423183073
0.3142979819665092
0.7854077253218884


### AdaBoost

In [12]:
# Seed passed to the boosting ensemble as its random_state.
seed = 7

# AdaBoost using the SAMME algorithm: 300 boosting rounds over depth-4 decision trees.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=4),
    n_estimators=300,
    algorithm="SAMME",
    random_state=seed,
)

In [13]:
# Train the AdaBoost ensemble on the training features; fit() returns the
# estimator, so its repr is displayed as the cell output.
ada_clf.fit(X=X, y=y)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=300, random_state=7)

In [14]:
# Class predictions of the fitted AdaBoost ensemble on the test features.
ada_pred = ada_clf.predict(X=X_test)

In [15]:
# Confusion matrix for AdaBoost (rows: true class, columns: predicted class).
# confusion_matrix is already imported in the notebook's import cell; the keyword
# arguments make the (true, predicted) order explicit.
confusion_matrix(y_true=y_test, y_pred=ada_pred)

array([[114024,   1613],
       [  1442,    887]])

In [16]:
# Print F1, recall and precision (in that order) for the AdaBoost predictions.
for metric in (f1_score, recall_score, precision_score):
    print(metric(y_test, ada_pred))

0.36736384344584805
0.38085015027908975
0.3548


### SVM

In [8]:
# Standardise the features; the scaler is fitted on the training data only and
# reused later to transform the test data.
scaler = preprocessing.StandardScaler()
X_train_standard = scaler.fit_transform(X)

# Linear-kernel SVM with C=100 and class 1 given a weight of 10.
# The previous gamma=0.01 argument was removed: gamma is a kernel coefficient for
# the 'rbf', 'poly' and 'sigmoid' kernels only and is ignored when kernel='linear',
# so it had no effect on the model and only suggested tuning that never happened.
svm_clf = SVC(C=100, kernel='linear', class_weight={1: 10})

In [9]:
# Apply the scaler fitted on the training data to the test features (no refit).
X_test_standard = scaler.transform(X=X_test)

In [None]:
# Train the SVM on the standardised training features; fit() returns the estimator.
svm_clf.fit(X=X_train_standard, y=y)

In [None]:
# Class predictions of the fitted SVM on the standardised test features.
svm_pred = svm_clf.predict(X=X_test_standard)

In [60]:
# Confusion matrix for the SVM (rows: true class, columns: predicted class).
# confusion_matrix is already imported in the notebook's import cell; the keyword
# arguments make the (true, predicted) order explicit.
# NOTE(review): the recorded output of this cell counts 115,636 true negatives-class
# rows, while the Random Forest and AdaBoost matrices count 115,637 -- the saved SVM
# output looks like it came from a different session/test set; re-run to refresh it.
confusion_matrix(y_true=y_test, y_pred=svm_pred)

array([[115366,    270],
       [  1859,    470]])

In [61]:
# Print F1, recall and precision (in that order) for the SVM predictions.
for metric in (f1_score, recall_score, precision_score):
    print(metric(y_test, svm_pred))

0.30628869338546755
0.201803349076857
0.6351351351351351
