In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from pprint import pprint
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from utils.datatransforms import *

In [None]:
# Read the modelling data from a Parquet file addressed relative to the working
# directory. Later cells take both the feature columns and the target column
# ('noise_event_laeq_primary_detected_class') from this frame.
df = pd.read_parquet('pred_model_data_full.parquet')

In [None]:
# Features: every column of `df` except the target class label.
X = df.drop(columns=['noise_event_laeq_primary_detected_class'])

# Encode the class labels as integers.
# LabelEncoder expects a 1-D array of labels, so the target is selected as a
# Series (single brackets). Passing a one-column DataFrame (double brackets)
# hands it a 2-D column vector, which scikit-learn flattens only after emitting
# a DataConversionWarning.
# NOTE(review): LabelEncoder is not imported explicitly in the import cell;
# presumably it arrives through `from utils.datatransforms import *` - confirm.
le = LabelEncoder()
y = le.fit_transform(df['noise_event_laeq_primary_detected_class'])

# 70 % / 30 % hold-out split with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=1)

In [None]:
# Preprocessing + model pipeline. The step names are part of the public surface:
# the hyper-parameter search addresses them as '<step>__<param>'.
# NOTE(review): the handler/encoder/PCA classes come from utils.datatransforms;
# presumably the three handlers produce the 'day_period', 'month' and 'weekday'
# columns that the encoder step consumes - confirm against that module.
pipeline = Pipeline(steps=[
    ('day_period_handler', DayPeriodHandler()),
    ('month_handler', MonthHandler()),
    ('day_of_the_week_handler', DayoftheWeekHandler()),
    # Columns removed before encoding / modelling.
    ('column_dropper', ColumnDropper(columns_to_drop=[
        'date',
        'hour',
        'minute',
        'second',
        'noise_event_laeq_model_id',
        'noise_event_laeq_primary_detected_certainty',
    ])),
    ('custom_encoder', CustomEncoder(
        columns=['#object_id', 'day_period', 'month', 'weekday'],
        strategy='one_hot',
    )),
    # PCA over the four level columns plus their '<level>_shift_t-_<k>' columns
    # for k = 1..5 (24 columns in total, same order as the explicit list this
    # comprehension replaces: all four levels for k=1, then k=2, ...).
    ('pca', PCATransformer(
        n_components=7,
        columns=['lamax', 'laeq', 'lceq', 'lcpeak'] + [
            f'{level}_shift_t-_{lag}'
            for lag in range(1, 6)
            for level in ('lamax', 'laeq', 'lceq', 'lcpeak')
        ],
    )),
    ('lightgbm', LGBMClassifier(random_state=42)),
])

In [None]:
# Baseline: 5-fold stratified cross-validated accuracy of the untuned pipeline.
# The shuffle is seeded so that fold assignment - and therefore the reported
# scores - are identical after a kernel restart; an unseeded shuffle draws new
# folds on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# error_score='raise' makes a failing fold abort loudly instead of yielding NaN.
scores3 = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, error_score='raise', n_jobs=-1)
scores3

In [None]:
# Search space for hyper-parameter tuning, expressed as skopt dimensions.
# Each bound pair is (low, high), inclusive.
n_estimators = Integer(1, 200)
learning_rate = Real(0.0001, 0.5)
num_leaves = Integer(2, 50)
max_depth = Integer(-1, 50)          # LightGBM treats values <= 0 as "no depth limit"
min_data_in_leaf = Integer(0, 50)
# Row subsampling. LightGBM only applies bagging_fraction when bagging_freq is
# non-zero (its default, 0, disables bagging), so the two are tuned together;
# searching bagging_fraction alone is a dead dimension that wastes search budget.
# The fraction range is (0.1, 1.0): the previous (0.0001, 0.1) would, once bagging
# is actually enabled, train every tree on at most 10 % of the rows.
# NOTE(review): adjust the range if heavy subsampling was intended.
bagging_fraction = Real(0.1, 1.0)
bagging_freq = Integer(1, 10)        # re-sample the bag every k iterations
lambda_l1 = Real(0.0001, 0.5)
lambda_l2 = Real(0.0001, 0.5)
max_bin = Integer(100, 400)

# Keys follow the '<pipeline step>__<parameter>' convention so the search can
# route each value to the right pipeline step. Plain lists are categorical choices.
params_grid = {
    'month_handler__strategy': ['month','season'],
    'day_of_the_week_handler__strategy': ['full', 'weekend'],
    'lightgbm__n_estimators': n_estimators,
    'lightgbm__learning_rate': learning_rate,
    'lightgbm__num_leaves': num_leaves,
    'lightgbm__max_depth': max_depth,
    'lightgbm__min_data_in_leaf': min_data_in_leaf,
    'lightgbm__bagging_fraction': bagging_fraction,
    'lightgbm__bagging_freq': bagging_freq,
    'lightgbm__lambda_l1': lambda_l1,
    'lightgbm__lambda_l2': lambda_l2,
    'lightgbm__max_bin': max_bin,
}

In [None]:
# Hyper-parameter search over the preprocessing strategies and LightGBM settings
# in `params_grid`, scored by 10-fold cross-validated accuracy.
#
# `params_grid` is built from skopt dimensions (Integer / Real). Those are meant
# for BayesSearchCV; RandomizedSearchCV samples a distribution by calling
# `.rvs(random_state=...)`, and skopt's Dimension.rvs returns a length-1 sequence
# rather than a scalar, so each sampled value would reach the estimator wrapped
# in a list/array. BayesSearchCV (already imported) consumes the dimensions
# natively and treats the plain lists as categorical choices.
cv = KFold(n_splits=10)
search = BayesSearchCV(
    pipeline,
    search_spaces=params_grid,
    n_iter=100,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    random_state=42,  # seeded so the sequence of evaluated candidates is reproducible
)

# Find optimal parameters
search.fit(X, y)

print("Best Score:", search.best_score_)
print("Best Estimator:", search.best_estimator_)
print("Best Parameters:", search.best_params_)