In [18]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from pprint import pprint
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from utils.datatransforms import *

In [2]:
df = pd.read_parquet('pred_model_data_full.parquet')

In [3]:
X = df.drop(columns=['noise_event_laeq_primary_detected_class'])

le = LabelEncoder()
y = le.fit_transform(df[['noise_event_laeq_primary_detected_class']])

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=1)

  y = column_or_1d(y, warn=True)


In [19]:
pipeline = Pipeline(steps=[
    ('day_period_handler',
     DayPeriodHandler()
     ),
    ('month_handler',
     MonthHandler()
     ),
    ('day_of_the_week_handler',
     DayoftheWeekHandler()
     ),
    ('column_dropper',
     ColumnDropper(columns_to_drop=[
    'date', 
    'hour',
    'minute',
    'second',
    'noise_event_laeq_model_id',
    'noise_event_laeq_primary_detected_certainty'])
     ),
    ('custom_encoder',
     CustomEncoder(
        columns=['#object_id', 'day_period', 'month', 'weekday'],
        strategy='one_hot')
     ),
    ('pca',
     PCATransformer(
        n_components=7,
        columns=[
            'lamax', 'laeq', 'lceq', 'lcpeak',
            'lamax_shift_t-_1', 'laeq_shift_t-_1', 'lceq_shift_t-_1',
            'lcpeak_shift_t-_1', 'lamax_shift_t-_2', 'laeq_shift_t-_2',
            'lceq_shift_t-_2', 'lcpeak_shift_t-_2', 'lamax_shift_t-_3',
            'laeq_shift_t-_3', 'lceq_shift_t-_3', 'lcpeak_shift_t-_3',
            'lamax_shift_t-_4', 'laeq_shift_t-_4', 'lceq_shift_t-_4',
            'lcpeak_shift_t-_4', 'lamax_shift_t-_5', 'laeq_shift_t-_5',
            'lceq_shift_t-_5', 'lcpeak_shift_t-_5'
        ])
     ),
     ('randomForest', RandomForestClassifier(random_state=42, min_samples_leaf=15, class_weight='balanced'))
])

In [5]:
cv = StratifiedKFold(n_splits=5, shuffle=True)
scores3 = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, error_score='raise', n_jobs=-1)
scores3

In [35]:
n_estimators = np.arange(100, 150) # number of trees in the random forest
max_depth = np.arange(2,5) # maximum number of levels allowed in each decision tree
min_samples_split = np.arange(2,5) # minimum sample number to split a node
ccp_alpha = np.linspace(0.4,0.5,51)

random_grid = {
    'month_handler__strategy': ['month','season'],
    'day_of_the_week_handler__strategy': ['full', 'weekend'],
    'randomForest__n_estimators': n_estimators,
    'randomForest__max_depth': max_depth,
    'randomForest__min_samples_split': min_samples_split,
    'randomForest__ccp_alpha': ccp_alpha   
}

In [36]:
cv = StratifiedKFold(n_splits=3, shuffle=True)
search = RandomizedSearchCV(pipeline, param_distributions=random_grid, n_jobs=-1, cv=cv, n_iter=100, verbose=0, scoring='accuracy')

# Find optimal parameters
search.fit(X, y)

print("Best Score:", search.best_score_)
print("Best Estimator:", search.best_estimator_)
print("Best Parameters:", search.best_params_)

KeyboardInterrupt: 

In [32]:
search.cv_results_

{'mean_fit_time': array([68.81739132, 37.08276113, 48.98945093, 65.21067572, 57.63273931,
        35.39752086, 37.69644245, 34.59800514, 45.04549003, 38.50225584]),
 'std_fit_time': array([0.60676451, 0.26089242, 0.34659575, 2.04839156, 3.23184778,
        3.96804505, 0.16594345, 0.12605561, 1.90769692, 0.72690349]),
 'mean_score_time': array([1.72675959, 1.0657177 , 1.21255167, 0.99675441, 1.00081086,
        0.77038987, 0.89896321, 0.81141392, 0.79435738, 0.5049692 ]),
 'std_score_time': array([0.04492973, 0.01953833, 0.03279653, 0.07167177, 0.02186923,
        0.03874016, 0.03265078, 0.00952256, 0.05374609, 0.03727391]),
 'param_randomForest__n_estimators': masked_array(data=[192, 102, 137, 190, 174, 117, 141, 128, 199, 172],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_randomForest__ccp_alpha': masked_array(data=[0.227, 0.458, 0.23900000000000002, 0.41600000