In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels and tick labels, applied in one rcParams update.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [2]:
df_delta = pd.read_csv('data/datasets/df_delta.csv')  # contains labels for delta VG
df_ok = pd.read_csv('data/datasets/df_ok.csv')  # contains labels for ok VG
df_raw = pd.read_csv('data/datasets/df_raw.csv')  # contains full reading of each VG

# Stack both label tables so every labelled (gauge_id, fillNumber) pair is present once.
df_labels = pd.concat([df_ok, df_delta], sort=False, axis=0)

# Inner-join readings with their labels, then index by the join keys.
df_VG = pd.merge(df_raw, df_labels, on=['gauge_id', 'fillNumber'])
df_VG = df_VG.set_index(['gauge_id', 'fillNumber'], drop=True)

# Encode the categorical label column as integers.
# The column is assigned back explicitly instead of calling
# `df_VG.y.replace(..., inplace=True)`: that chained in-place call mutates a
# temporary Series and stops updating `df_VG` once pandas Copy-on-Write is on.
# Values outside the mapping are passed through unchanged, like `replace` did.
label_codes = {'ok': 0, 'delta': 1}
df_VG['y'] = df_VG['y'].map(lambda label: label_codes.get(label, label))

In [3]:
# Positional split of the merged frame: every column but the last becomes the
# feature matrix, the last column becomes the target vector.
# NOTE(review): assumes the label column `y` is the last column after the merge — confirm.
X, y = (
    np.array(df_VG.iloc[:, :-1]),
    np.array(df_VG.iloc[:, -1]),
)

In [36]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, RobustScaler


class RowScaler(BaseEstimator, TransformerMixin):
    """Scale each sample (row) on its own using a scikit-learn scaler.

    The input is transposed, handed to the selected scaler, and the result is
    transposed back, so the scaler's statistics are computed along each row of
    the original input rather than along each column.

    Parameters
    ----------
    scaling_method : str, default='Standard'
        Name of the scaler to use; must be one of ``scaling_options``
        ('Standard', 'MinMax', 'MaxAbs', 'Robust').

    Attributes
    ----------
    scaler : scikit-learn scaler instance
        Created by ``fit`` according to ``scaling_method``.

    Raises
    ------
    AssertionError
        If ``scaling_method`` is not a supported name. The exception type is
        kept from the original ``assert`` so existing callers are unaffected,
        but it is now raised explicitly and therefore also active under ``-O``.
    """

    # Supported method names mapped to the scaler class that implements them.
    _SCALERS = {
        'Standard': StandardScaler,
        'MinMax': MinMaxScaler,
        'MaxAbs': MaxAbsScaler,
        'Robust': RobustScaler,
    }
    scaling_options = list(_SCALERS)

    def __init__(self, scaling_method='Standard'):
        self._check_scaling_method(scaling_method)
        self.scaling_method = scaling_method

    def _check_scaling_method(self, scaling_method):
        """Raise AssertionError when ``scaling_method`` is not supported."""
        if scaling_method not in self.scaling_options:
            # str.format instead of '+' so a non-string argument still yields
            # this message rather than a TypeError from string concatenation.
            raise AssertionError(
                'scaling_method:{} not in {}'.format(scaling_method, self.scaling_options)
            )

    def fit(self, X, y=None):
        """Instantiate the scaler selected by ``scaling_method``.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility and are
        not used here. Returns ``self``.
        """
        # Re-validate: `scaling_method` may have been changed after construction
        # (e.g. through set_params); without this check an unknown name would
        # silently leave `self.scaler` missing or stale.
        self._check_scaling_method(self.scaling_method)
        self.scaler = self._SCALERS[self.scaling_method]()
        return self

    def transform(self, X, y=None):
        """Return ``X`` scaled row by row.

        The scaler is re-fitted on the transposed input on every call, so the
        result depends only on the rows passed in, not on the data seen by
        ``fit``. ``X`` must provide a ``transpose()`` method.
        """
        return self.scaler.fit_transform(X.transpose()).transpose()

# Row-wise standardisation of the full feature matrix.
scaler = RowScaler(scaling_method='Standard')
X_norm = scaler.fit(X).transform(X)

In [37]:
# Directory and file name of each persisted model.
knn_dir = 'data/models/knn'
knn_name = 'knn_001.pkl'
forest_dir = 'data/models/random_forest'
forest_name = 'forest_001.pkl'
log_dir = 'data/models/logistic_classifier'
log_name = 'logistic_001.pkl'

# `os` comes from the import cell at the top of the notebook; it was missing
# there before, which made this cell fail with NameError on a fresh kernel.
log_path = os.path.join(log_dir, log_name)
knn_path = os.path.join(knn_dir, knn_name)
forest_path = os.path.join(forest_dir, forest_name)

# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files from a trusted source.
# NOTE(review): the stored objects presumably reference RowScaler, so the class
# cell has to run before this one — confirm against the training notebook.
log = joblib.load(log_path)
knn = joblib.load(knn_path)
forest = joblib.load(forest_path)

In [39]:
from sklearn.ensemble import VotingClassifier

# Pull the selected estimator out of each loaded search object.
forest_clf, knn_clf, log_clf = (
    search.best_estimator_ for search in (forest, knn, log)
)

# Soft-voting ensemble over the three models.
# NOTE(review): the random forest is registered under the key 'hard' although
# voting is 'soft'; the key is kept as-is because it is part of the parameter
# names (`hard__...`) — confirm before renaming it.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('hard', forest_clf),
        ('knn', knn_clf),
        ('log', log_clf),
    ],
)

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold CV with a fixed seed so the folds are reproducible.
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Run the cross-validation once and reuse the per-fold scores for both the
# printout and the mean (previously the same CV was executed twice).
knn_recall_scores = cross_val_score(knn_clf, X, y, scoring='recall', cv=strat_kfold)
print(knn_recall_scores)
np.mean(knn_recall_scores)

[0.92857143 0.92857143 1.         1.         0.96296296]


0.9640211640211641

In [41]:
# Same seeded, stratified 5-fold splitter for every model under comparison.
strat_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Report mean and standard deviation of the per-fold recall for each model.
# The label is the estimator's class name, so models of the same class share a
# label; rows appear in the order: log, knn, forest, voting.
for clf in (log_clf, knn_clf, forest_clf, voting_clf):
    score = cross_val_score(
        clf,
        X,
        y,
        scoring='recall',
        cv=strat_kfold,
    )
    print(type(clf).__name__, np.mean(score), np.std(score))


Pipeline 0.9640211640211641 0.031948209818942905
Pipeline 0.9640211640211641 0.031948209818942905
Pipeline 0.9203703703703704 0.043303896559524976
VotingClassifier 0.9640211640211641 0.031948209818942905
