In [None]:
from pprint import pprint

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from xgboost import XGBClassifier

from utils.datatransforms import *

In [None]:
# Load the modelling dataset from a parquet file (relative path, so it is
# resolved against the notebook's working directory).
data_path = 'pred_model_data_full.parquet'
df = pd.read_parquet(data_path)

In [None]:
# Name of the column holding the class label to predict.
target_column = 'noise_event_laeq_primary_detected_class'

# Features: every column except the target.
X = df.drop(columns=[target_column])

# Encode the class labels as integers. LabelEncoder expects a 1-D array-like,
# so select the column as a Series (single brackets) instead of a one-column
# DataFrame, which scikit-learn flags with a DataConversionWarning.
le = LabelEncoder()
y = le.fit_transform(df[target_column])

# Keep 70% of the rows for training and 30% for validation; the fixed seed
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=1)

In [None]:
# Columns handed to the ColumnDropper step.
dropped_columns = [
    'date',
    'hour',
    'minute',
    'second',
    'noise_event_laeq_model_id',
    'noise_event_laeq_primary_detected_certainty',
]

# Columns handed to CustomEncoder with the 'one_hot' strategy.
one_hot_columns = ['#object_id', 'day_period', 'month', 'weekday']

# Columns handed to PCATransformer: the four base sound-level columns followed
# by their "_shift_t-_<k>" counterparts for k = 1..5, grouped by shift.
sound_metrics = ['lamax', 'laeq', 'lceq', 'lcpeak']
pca_columns = sound_metrics + [
    f'{metric}_shift_t-_{shift}'
    for shift in range(1, 6)
    for metric in sound_metrics
]

# Full pipeline: the custom transformer steps run in the order listed and the
# XGBoost classifier is the final estimator.
pipeline = Pipeline(steps=[
    ('day_period_handler', DayPeriodHandler()),
    ('month_handler', MonthHandler()),
    ('day_of_the_week_handler', DayoftheWeekHandler()),
    ('column_dropper', ColumnDropper(columns_to_drop=dropped_columns)),
    ('custom_encoder', CustomEncoder(columns=one_hot_columns, strategy='one_hot')),
    ('pca', PCATransformer(n_components=7, columns=pca_columns)),
    ('xgb', XGBClassifier(random_state=42)),
])

In [None]:
# Baseline: 5-fold stratified cross-validated accuracy of the untuned pipeline.
# shuffle=True assigns rows to folds at random, so the splitter is seeded to
# give the same folds (and therefore comparable scores) on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# error_score='raise' surfaces any failure inside a fold instead of recording NaN.
scores3 = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, error_score='raise', n_jobs=-1)
scores3

In [None]:
# Search space for RandomizedSearchCV.
#
# RandomizedSearchCV samples from lists (uniformly) or from objects whose
# rvs() returns a single value per draw. scipy.stats frozen distributions do
# that; skopt's Integer/Real dimensions are designed for BayesSearchCV and
# their rvs() returns a batch of samples, so they are not used here.
#
# stats.randint(low, high) excludes `high`; stats.uniform(loc, scale) covers
# [loc, loc + scale].
random_grid = {
    # Feature-engineering choices exposed by the custom transformer steps
    'month_handler__strategy': ['month', 'season'],
    'day_of_the_week_handler__strategy': ['full', 'weekend'],

    # XGBoost integer hyper-parameters (inclusive ranges noted on the right)
    'xgb__n_estimators': stats.randint(1, 201),        # 1..200
    'xgb__max_delta_step': stats.randint(1, 51),       # 1..50
    'xgb__max_leaves': stats.randint(0, 11),           # 0..10
    # The XGBoost parameter is `num_parallel_tree` (singular); the plural
    # spelling is not a recognised parameter and would never take effect.
    'xgb__num_parallel_tree': stats.randint(1, 11),    # 1..10
    'xgb__max_depth': stats.randint(1, 51),            # 1..50
    'xgb__min_child_weight': stats.randint(1, 8),      # 1..7

    # Row / column sampling fractions per tree, valid in (0, 1]. Fractions of
    # at most 0.001 leave each tree with almost no data, so search [0.5, 1.0].
    'xgb__subsample': stats.uniform(0.5, 0.5),
    'xgb__colsample_bytree': stats.uniform(0.5, 0.5),

    # XGBoost continuous hyper-parameters
    'xgb__learning_rate': stats.uniform(0.0001, 0.0099),  # [0.0001, 0.01]
    'xgb__gamma': stats.uniform(0.0001, 0.0999),          # [0.0001, 0.1]

    'xgb__objective': ['multi:softmax']
}

In [None]:
# Randomised hyper-parameter search over `random_grid`, scored by accuracy.
# Both the shuffled 3-fold splitter and the parameter sampler are seeded so
# that re-running the notebook evaluates the same candidates on the same folds.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    pipeline,
    param_distributions=random_grid,
    n_iter=100,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=0,
    random_state=42,
)

# Find optimal parameters
# NOTE(review): this fits on the full X / y, so the X_val / y_val hold-out made
# earlier is not independent of the selected parameters - confirm this is intended.
search.fit(X, y)

print("Best Score:", search.best_score_)
print("Best Estimator:", search.best_estimator_)
print("Best Parameters:", search.best_params_)