In [1]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np
from pprint import pprint
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from utils.model_datatransforms import *

In [2]:
df = pd.read_parquet('pred_model_data_full.parquet')

In [3]:
X = df.drop(columns=['noise_event_laeq_primary_detected_class'])

le = LabelEncoder()
y = le.fit_transform(df[['noise_event_laeq_primary_detected_class']])

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=1)

  y = column_or_1d(y, warn=True)


In [4]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

In [5]:
pipeline = Pipeline(steps=[
    ('day_period_handler',
     DayPeriodHandler()
     ),
    ('month_handler',
     MonthHandler()
     ),
    ('day_of_the_week_handler',
     DayoftheWeekHandler()
     ),
    ('column_dropper',
     ColumnDropper(columns_to_drop=[
    'date', 
    'hour',
    'minute',
    'second',
    'noise_event_laeq_model_id',
    'noise_event_laeq_primary_detected_certainty'])
     ),
    ('custom_encoder',
     CustomEncoder(
        columns=['#object_id', 'day_period', 'month', 'weekday'],
        strategy='one_hot')
     ),
    ('pca',
     PCATransformer(
        n_components=7,
        columns=[
            'lamax', 'laeq', 'lceq', 'lcpeak',
            'lamax_shift_t-_1', 'laeq_shift_t-_1', 'lceq_shift_t-_1',
            'lcpeak_shift_t-_1', 'lamax_shift_t-_2', 'laeq_shift_t-_2',
            'lceq_shift_t-_2', 'lcpeak_shift_t-_2', 'lamax_shift_t-_3',
            'laeq_shift_t-_3', 'lceq_shift_t-_3', 'lcpeak_shift_t-_3',
            'lamax_shift_t-_4', 'laeq_shift_t-_4', 'lceq_shift_t-_4',
            'lcpeak_shift_t-_4', 'lamax_shift_t-_5', 'laeq_shift_t-_5',
            'lceq_shift_t-_5', 'lcpeak_shift_t-_5'
        ])
     ),
     ('xgb', XGBClassifier(random_state=42, scale_pos_weights=class_weights))
])

In [6]:
# cv = StratifiedKFold(n_splits=3, shuffle=True)
# scores3 = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, error_score='raise', n_jobs=-1)
# scores3

In [7]:
#choose parameters
n_estimators = np.arange(290,315,5)
learning_rate = np.array([0.01,0.05,0.1,0.3])
max_depth = np.arange(3,7)
eval_metric = ['mlogloss','merror']

random_grid = {
    # Parameters that we are going to tune
    'month_handler__strategy': ['month','season'],
    'day_of_the_week_handler__strategy': ['full', 'weekend'],
    'xgb__n_estimators': n_estimators,
    'xgb__learning_rate': learning_rate,
    'xgb__max_depth': max_depth,
    'xgb__eval_metric': eval_metric
}

In [8]:
cv = StratifiedKFold(n_splits=3, shuffle=True)
search = RandomizedSearchCV(pipeline, param_distributions=random_grid, n_jobs=-1, cv=cv, n_iter=1000, verbose=3, scoring='balanced_accuracy')

# Find optimal parameters
search.fit(X_train, y_train)

print("Best Score:", search.best_score_)
print("Best Estimator:", search.best_estimator_)
print("Best Parameters:", search.best_params_)



Fitting 3 folds for each of 640 candidates, totalling 1920 fits


Notes: 
- XGBClassifier(random_state=42, scale_pos_weights=class_weights, n_estimators=300, learning_rate=0.1, max_depth=5) ---- array([0.75594313, 0.75536047, 0.75539932])