In [5]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from bayes_opt import BayesianOptimization
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

In [10]:
def encoder(df, col_split):
    continuous_cols = df.columns[col_split:]
    categorical_cols = df.columns[:col_split]

    # Standardizing the continuous features
    scaler = StandardScaler()
    df[continuous_cols] = scaler.fit_transform(df[continuous_cols])

    # Applying One-Hot Encoding to categorical features
    encoder = ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=True), categorical_cols)], remainder='passthrough')
    return encoder.fit_transform(df)


In [11]:
def load_data():
    df = pd.read_csv('beta_dates/beta_data_7_60.csv', index_col=0)
    y = pd.read_csv('beta_dates/true_labels.csv', index_col=0).values[:,0]
    y = y + 1
    X = encoder(df, 4)
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

    return X_train, X_test, y_train, y_test


# XGBoost Grid Search

In [31]:
X_train, X_test, y_train, y_test = load_data()
# create a pipeline for XGBoost, SVD and GridSearchCV
pipe = Pipeline([
        ('svd', TruncatedSVD()),
        ('xgb', xgb.XGBClassifier(objective='multi:softmax', random_state=42)),
        ])

# create a parameter grid for the pipeline

param_grid = {
        'svd__n_components': [3,5],
        'xgb__n_estimators': [80, 110],  # Reduced number of trees
        'xgb__learning_rate': [0.01, 0.1, 1],  # Expanded range with a lower bound
        'xgb__max_depth': [3,4],  # Shallower trees considering fewer components
        'xgb__min_child_weight': [2, 4],  # Adjusted values for instance weight
        'xgb__gamma': [0.3, 0.5],  # Slightly expanded range for loss reduction
        'xgb__subsample': [0.5, 0.8],  # Adjusted subsample ratio
        'xgb__colsample_bytree': [0.8, 1],  # Adjusted subsample ratio of columns
        'xgb__reg_alpha': [1,3],  # Adjusted L1 regularization term
        'xgb__reg_lambda': [2,3],  # Adjusted L2 regularization term
        }

# perform grid search

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# train model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Predictions
predictions = best_model.predict(X_test)

# Evaluation
print(classification_report(y_test, predictions))

print(grid_search.best_params_)
print(grid_search.best_score_)

print(grid_search.cv_results_)

Fitting 3 folds for each of 1536 candidates, totalling 4608 fits
              precision    recall  f1-score   support

         0.0       0.50      0.25      0.33         8
         1.0       0.83      0.80      0.81        54
         2.0       0.31      0.57      0.40         7

    accuracy                           0.71        69
   macro avg       0.54      0.54      0.51        69
weighted avg       0.74      0.71      0.71        69

{'svd__n_components': 3, 'xgb__colsample_bytree': 1, 'xgb__gamma': 0.3, 'xgb__learning_rate': 1, 'xgb__max_depth': 4, 'xgb__min_child_weight': 2, 'xgb__n_estimators': 110, 'xgb__reg_alpha': 3, 'xgb__reg_lambda': 3, 'xgb__subsample': 0.8}
0.7854504120488776
{'mean_fit_time': array([0.01676695, 0.01604732, 0.01456571, ..., 0.01806768, 0.01958394,
       0.01933503]), 'std_fit_time': array([0.00065626, 0.00079285, 0.00180926, ..., 0.0042155 , 0.00059534,
       0.00401533]), 'mean_score_time': array([0.00106041, 0.00077756, 0.00067568, ..., 0.00060193

# XGBoost Random Search

In [24]:
X_train, X_test, y_train, y_test = load_data()
# create a pipeline for XGBoost, SVD, and RandomizedSearchCV
pipe = Pipeline([
        ('svd', TruncatedSVD()),
        ('xgb', xgb.XGBClassifier(objective='multi:softmax', random_state=42)),
        ])

# create a parameter grid for the pipeline
param_grid = {
        'svd__n_components': [3,4,5],
        'xgb__gamma': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4, 200],
        'xgb__learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
        'xgb__max_depth': [5,6,7,8,9,10,11,12,13,14],
        'xgb__n_estimators': [50,65,80,100,115,130,150],
        'xgb__reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
        'xgb__reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
        'xgb__subsample': [0.5,0.6,0.7,0.8,0.9,1],
        'xgb__colsample_bytree': [0.5,0.6,0.7,0.8,0.9,1],
        'xgb__min_child_weight': [0.5,1,3,5,7,9],
              }

# perform random search

random_search = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1, n_iter=1000)
random_search.fit(X_train, y_train)

# train model with best parameters
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# Predictions
predictions = best_model.predict(X_test)

# Evaluation
print(classification_report(y_test, predictions))

print(random_search.best_params_)
print(random_search.best_score_)
print(random_search.cv_results_)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits
              precision    recall  f1-score   support

         0.0       0.25      0.12      0.17         8
         1.0       0.74      0.89      0.81        47
         2.0       0.75      0.43      0.55        14

    accuracy                           0.71        69
   macro avg       0.58      0.48      0.51        69
weighted avg       0.68      0.71      0.68        69

{'xgb__subsample': 0.6, 'xgb__reg_lambda': 0.4, 'xgb__reg_alpha': 0, 'xgb__n_estimators': 50, 'xgb__min_child_weight': 3, 'xgb__max_depth': 8, 'xgb__learning_rate': 0.300000012, 'xgb__gamma': 0.8, 'xgb__colsample_bytree': 0.5, 'svd__n_components': 5}
0.7904944586530265
{'mean_fit_time': array([0.01006818, 0.0228014 , 0.01074028, 0.01033274, 0.01736236,
       0.01348162, 0.01605439, 0.00964769, 0.00995731, 0.00937192,
       0.00870148, 0.01336567, 0.01582678, 0.01216706, 0.02707386,
       0.01923537, 0.02848355, 0.01631133, 0.01353574, 0.03025866

# XGBoost Bayesian Optimization

In [39]:
X_train, X_test, y_train, y_test = load_data()

# Create a pipeline
pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=5)),
    ('xgb', xgb.XGBClassifier(objective='multi:softmax', random_state=42)),
])

# Define the evaluation function
def xgb_evaluate(gamma, learning_rate, max_depth, n_estimators, reg_alpha, reg_lambda, subsample, colsample_bytree, min_child_weight):
    params = {
        'xgb__gamma': gamma,
        'xgb__learning_rate': learning_rate,
        'xgb__max_depth': int(max_depth),
        'xgb__n_estimators': int(n_estimators),
        'xgb__reg_alpha': reg_alpha,
        'xgb__reg_lambda': reg_lambda,
        'xgb__subsample': subsample,
        'xgb__colsample_bytree': colsample_bytree,
        'xgb__min_child_weight': min_child_weight,
    }
    pipe.set_params(**params)
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    return accuracy_score(y_test, predictions)

# Bayesian Optimization
optimizer = BayesianOptimization(
    f=xgb_evaluate,
    pbounds={
        'gamma': (0, 5),
        'learning_rate': (0.01, 0.5),
        'max_depth': (3, 5),
        'n_estimators': (50, 300),
        'reg_alpha': (1, 100),
        'reg_lambda': (1, 100),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
        'min_child_weight': (1, 10),
    },
    random_state=42
)

optimizer.maximize(init_points=5, n_iter=200)

# After optimization, retrieve and adjust best parameters for pipeline
best_params = optimizer.max['params']
best_params_for_pipeline = {'xgb__' + key: int(value) if key in ['max_depth', 'n_estimators'] else value for key, value in best_params.items()}

# Set the best parameters in the pipeline
pipe.set_params(**best_params_for_pipeline)
pipe.fit(X_train, y_train)

# Predictions and Evaluation
predictions = pipe.predict(X_test)
print(classification_report(y_test, predictions))

# Print best parameters and performance
print("Best Parameters for Pipeline:", best_params_for_pipeline)
print("Best Performance:", optimizer.max['target'])



|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.7536   [0m | [0m0.6873   [0m | [0m4.754    [0m | [0m0.3687   [0m | [0m4.197    [0m | [0m2.404    [0m | [0m89.0     [0m | [0m6.75     [0m | [0m86.75    [0m | [0m0.8006   [0m |
| [0m2        [0m | [0m0.7536   [0m | [0m0.854    [0m | [0m0.1029   [0m | [0m0.4853   [0m | [0m4.665    [0m | [0m2.911    [0m | [0m95.46    [0m | [0m19.16    [0m | [0m31.12    [0m | [0m0.7624   [0m |
| [0m3        [0m | [0m0.7536   [0m | [0m0.716    [0m | [0m1.456    [0m | [0m0.3098   [0m | [0m3.279    [0m | [0m3.629    [0m | [0m141.6    [0m | [0m46.15    [0m | [0m78.73    [0m | [0m0.5998   [0m |
| [0m4        [0m | [0m0.1014   [0m | [0m0.7571   [0m | [0m

# Build the best model

In [12]:
X_train, X_test, y_train, y_test = load_data()
# create a pipeline for XGBoost, FAMD, Random Forests, and Multiclass logistic regression
pipe = Pipeline([
        ('svd', TruncatedSVD()),
        ('xgb', xgb.XGBClassifier(objective='multi:softmax', random_state=42)),
        ])

# create a parameter grid for the pipeline

param_grid = {
'svd__n_components': 5,
 'xgb__colsample_bytree': 0.8, 
 'xgb__gamma': 0.5, 
 'xgb__learning_rate': 0.1,
   'xgb__max_depth': 3, 
   'xgb__min_child_weight': 2, 
   'xgb__n_estimators': 200, 
   'xgb__reg_alpha': 1, 
   'xgb__reg_lambda': 2, 
   'xgb__subsample': 0.7}


pipe.fit(X_train, y_train).score(X_test, y_test)

pipe.set_params(**param_grid).fit(X_train, y_train).score(X_test, y_test)

# Predictions
predictions = pipe.predict(X_test)

# Evaluation
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.80      0.44      0.57         9
         1.0       0.83      0.88      0.85        50
         2.0       0.55      0.60      0.57        10

    accuracy                           0.78        69
   macro avg       0.73      0.64      0.67        69
weighted avg       0.78      0.78      0.78        69



# Random Forest Grid search

In [18]:
X_train, X_test, y_train, y_test = load_data()
pipe = Pipeline([
        ('svd', TruncatedSVD()),
        ('rf', RandomForestClassifier()),
])

param_grid = {
        'svd__n_components': [3,4,5],
        'rf__n_estimators': [50, 100, 200],  # Number of trees in the forest
        'rf__max_depth': [10, 20, 30],  # Maximum depth of each tree
        'rf__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
        'rf__max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
}

# perform grid search

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# train model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Predictions
predictions = best_model.predict(X_test)

# Evaluation
print(classification_report(y_test, predictions))

print(grid_search.best_params_)
print(grid_search.best_score_)

print(grid_search.cv_results_)

Fitting 3 folds for each of 486 candidates, totalling 1458 fits
              precision    recall  f1-score   support

         0.0       0.25      0.50      0.33         2
         1.0       0.82      0.87      0.85        54
         2.0       0.50      0.31      0.38        13

    accuracy                           0.75        69
   macro avg       0.52      0.56      0.52        69
weighted avg       0.75      0.75      0.74        69

{'rf__max_depth': 10, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 50, 'svd__n_components': 4}
0.7755754475703324
{'mean_fit_time': array([0.03956795, 0.04751801, 0.03917408, 0.0917093 , 0.10956375,
       0.1008064 , 0.14807359, 0.15691932, 0.1661423 , 0.05882533,
       0.04342675, 0.03912862, 0.06663585, 0.07155434, 0.07748206,
       0.14668433, 0.15844329, 0.17339635, 0.03562593, 0.03707639,
       0.04201571, 0.07453386, 0.09103815, 0.07146708, 0.15757847,
       0.14986269, 0.15080206,

# Random Forest Bayesian Search

In [44]:
X_train, X_test, y_train, y_test = load_data()

# Create a pipeline for Random Forests
pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=5)),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Define the evaluation function
def rf_evaluate(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    params = {
        'rf__n_estimators': int(n_estimators),
        'rf__max_depth': int(max_depth),
        'rf__min_samples_split': int(min_samples_split),
        'rf__min_samples_leaf': int(min_samples_leaf),
        'rf__max_features': max_features,
    }
    pipe.set_params(**params)
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    return accuracy_score(y_test, predictions)

# Bayesian Optimization
optimizer = BayesianOptimization(
    f=rf_evaluate,
    pbounds={
        'n_estimators': (50, 300),
        'max_depth': (3, 5),
        'min_samples_split': (2, 10),
        'min_samples_leaf': (1, 4),
        'max_features': (0.1,1)
    },
    random_state=42
)

optimizer.maximize(init_points=5, n_iter=200)

# After optimization, retrieve and adjust best parameters for pipeline
best_params = optimizer.max['params']
best_params_for_pipeline = {'rf__' + key: int(value) if key in ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf'] else value for key, value in best_params.items()}

# Set the best parameters in the pipeline
pipe.set_params(**best_params_for_pipeline)
pipe.fit(X_train, y_train)

# Predictions and Evaluation
predictions = pipe.predict(X_test)
print(classification_report(y_test, predictions))

# Print best parameters and performance
print("Best Parameters for Pipeline:", best_params_for_pipeline)
print("Best Performance:", optimizer.max['target'])



|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.7971   [0m | [0m3.749    [0m | [0m0.9556   [0m | [0m3.196    [0m | [0m6.789    [0m | [0m89.0     [0m |
| [0m2        [0m | [0m0.7681   [0m | [0m3.312    [0m | [0m0.1523   [0m | [0m3.599    [0m | [0m6.809    [0m | [0m227.0    [0m |
| [95m3        [0m | [95m0.8116   [0m | [95m3.041    [0m | [95m0.9729   [0m | [95m3.497    [0m | [95m3.699    [0m | [95m95.46    [0m |
| [0m4        [0m | [0m0.7826   [0m | [0m3.367    [0m | [0m0.3738   [0m | [0m2.574    [0m | [0m5.456    [0m | [0m122.8    [0m |
| [0m5        [0m | [0m0.7536   [0m | [0m4.224    [0m | [0m0.2255   [0m | [0m1.876    [0m | [0m4.931    [0m | [0m164.0    [0m |
| [0m6        [0m | [0m0.7826   [0m | [0m3.0      [0m | [0m0.1      [0m | [0m1.0      [0m | [0m10.0 

# Logistic Regression Grid Search

In [17]:
pipe = Pipeline([
        ('svd', TruncatedSVD()),
        ('logreg', LogisticRegression()),
])

param_grid = {
        'svd__n_components': [3,4,5],
        'logreg__C': [0.1, 1, 10, 100],
        'logreg__penalty': ['l1', 'l2'],
        'logreg__multi_class': ['ovr', 'multinomial'],
        'logreg__solver': ['newton-cg', 'lbfgs', 'sag', 'saga']
}

# perform grid search

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# train model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Predictions
predictions = best_model.predict(X_test)

# Evaluation
print(classification_report(y_test, predictions))

print(grid_search.best_params_)
print(grid_search.best_score_)

print(grid_search.cv_results_)


Fitting 3 folds for each of 192 candidates, totalling 576 fits




              precision    recall  f1-score   support

         0.0       1.00      0.25      0.40         8
         1.0       0.68      0.91      0.78        43
         2.0       0.60      0.33      0.43        18

    accuracy                           0.68        69
   macro avg       0.76      0.50      0.54        69
weighted avg       0.70      0.68      0.64        69

{'logreg__C': 10, 'logreg__multi_class': 'multinomial', 'logreg__penalty': 'l1', 'logreg__solver': 'saga', 'svd__n_components': 5}
0.7704603580562659
{'mean_fit_time': array([0.00138402, 0.00125567, 0.00140532, 0.00119885, 0.00123   ,
       0.00119297, 0.00097648, 0.00090496, 0.00098578, 0.00425299,
       0.00408729, 0.00398207, 0.00584769, 0.00514436, 0.00724037,
       0.00289194, 0.00271336, 0.00271535, 0.00431999, 0.00277034,
       0.00312122, 0.00556231, 0.00468095, 0.00346398, 0.00060034,
       0.0010097 , 0.00067735, 0.00060829, 0.00102957, 0.00073902,
       0.00066996, 0.00061591, 0.00066201, 0.0025

216 fits failed out of a total of 576.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator

# Bayesian Optimization

In [52]:
# Assume load_data function and dataset are defined
X_train, X_test, y_train, y_test = load_data()

# Create a pipeline for Logistic Regression
pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=5)),
    ('logreg', LogisticRegression(max_iter=1000)),  # Increased max_iter for convergence
])

penalties = ['l2']
multi_classes = ['ovr', 'multinomial']
solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']

# Define the evaluation function
def logreg_evaluate(C, penalty, multi_class, solver):
    penalty = penalties[int(penalty)]
    multi_class = multi_classes[int(multi_class)]
    solver = solvers[int(solver)]

    params = {
        'logreg__C': C,
        'logreg__penalty': penalty,
        'logreg__multi_class': multi_class,
        'logreg__solver': solver,
    }
    pipe.set_params(**params)
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    return accuracy_score(y_test, predictions)

# Bayesian Optimization
optimizer = BayesianOptimization(
    f=logreg_evaluate,
    pbounds={
        'C': (0.1, 100),
        'penalty': (0, len(penalties) - 1),  # Ensuring valid index range
        'multi_class': (0, len(multi_classes) - 1),  # Ensuring valid index range
        'solver': (0, len(solvers) - 1),  # Ensuring valid index range
    },
    random_state=42
)

optimizer.maximize(init_points=5, n_iter=200)

# After optimization, retrieve and adjust best parameters for pipeline
best_params = optimizer.max['params']
best_params_for_pipeline = {
    'logreg__C': best_params['C'],
    'logreg__penalty': penalties[int(best_params['penalty'])],
    'logreg__multi_class': multi_classes[int(best_params['multi_class'])],
    'logreg__solver': solvers[int(best_params['solver'])],
}

# Set the best parameters in the pipeline
pipe.set_params(**best_params_for_pipeline)
pipe.fit(X_train, y_train)

# Predictions and Evaluation
predictions = pipe.predict(X_test)
print(classification_report(y_test, predictions))

# Print best parameters and performance
print("Best Parameters for Pipeline:", best_params_for_pipeline)
print("Best Performance:", optimizer.max['target'])


|   iter    |  target   |     C     | multi_... |  penalty  |  solver   |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.7391   [0m | [0m37.52    [0m | [0m0.9507   [0m | [0m0.0      [0m | [0m1.796    [0m |
| [0m2        [0m | [0m0.7391   [0m | [0m15.69    [0m | [0m0.156    [0m | [0m0.0      [0m | [0m2.599    [0m |
| [0m3        [0m | [0m0.7391   [0m | [0m60.15    [0m | [0m0.7081   [0m | [0m0.0      [0m | [0m2.91     [0m |
| [0m4        [0m | [0m0.7391   [0m | [0m83.26    [0m | [0m0.2123   [0m | [0m0.0      [0m | [0m0.5502   [0m |
| [0m5        [0m | [0m0.7391   [0m | [0m30.49    [0m | [0m0.5248   [0m | [0m0.0      [0m | [0m0.8737   [0m |
| [0m6        [0m | [0m0.7391   [0m | [0m99.98    [0m | [0m0.1644   [0m | [0m0.0      [0m | [0m2.847    [0m |
| [0m7        [0m | [0m0.7391   [0m | [0m5.017    [0m | [0m0.01848  [0m | [0m0.0      [0m | [0m2.171    [0m 

# Gaussian Discriminant Analysis Grid Search

In [12]:
X_train, X_test, y_train, y_test = load_data()

pipe = Pipeline([
        ('svd', TruncatedSVD()),
        ('gdareg', GaussianNB()),
])

param_grid = {
        'svd__n_components': [3,4,5],
        'gdareg__priors': [[0.05,0.9,0.05],[0.1,0.8,0.1],[0.15,0.7,0.15],[0.2,0.6,0.2]]
}

# perform grid search

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# train model with best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Predictions
predictions = best_model.predict(X_test)

# Evaluation
print(classification_report(y_test, predictions))

print(grid_search.best_params_)
print(grid_search.best_score_)

print(grid_search.cv_results_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         9
         1.0       0.65      1.00      0.79        44
         2.0       1.00      0.06      0.12        16

    accuracy                           0.65        69
   macro avg       0.55      0.35      0.30        69
weighted avg       0.64      0.65      0.53        69

{'gdareg__priors': [0.05, 0.9, 0.05], 'svd__n_components': 5}
0.721938050582552
{'mean_fit_time': array([0.09349036, 0.05310734, 0.02721842, 0.00428843, 0.00660189,
       0.00937184, 0.00425776, 0.00373745, 0.00528232, 0.0040532 ,
       0.00427826, 0.00580279]), 'std_fit_time': array([0.049119  , 0.06027522, 0.02243783, 0.00119649, 0.00539769,
       0.00622216, 0.00102889, 0.00050375, 0.00178973, 0.0005825 ,
       0.00070931, 0.00313598]), 'mean_score_time': array([0.0048945 , 0.00523694, 0.00170978, 0.00167855, 0.00120552,
       0.00213345, 0.001

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Gaussian Discriminant Analysis Bayesian Optimization

In [16]:
# Assume load_data function and dataset are defined
X_train, X_test, y_train, y_test = load_data()

# Create a pipeline for Logistic Regression
pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=5)),
    ('gdareg', GaussianNB()),  # Increased max_iter for convergence
])

priors = [[0.05,0.9,0.05],[0.1,0.8,0.1],[0.15,0.7,0.15],[0.2,0.6,0.2]]

# Define the evaluation function
def gdareg_evaluate(prior):
    prior = priors[int(prior)]

    params = {
        'gdareg__priors': prior
    }
    pipe.set_params(**params)
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    return accuracy_score(y_test, predictions)

# Bayesian Optimization
optimizer = BayesianOptimization(
    f=gdareg_evaluate,
    pbounds={
        'prior': (0, len(priors) - 1),  # Ensuring valid index range
    },
    random_state=42
)

optimizer.maximize(init_points=5, n_iter=200)

# After optimization, retrieve and adjust best parameters for pipeline
best_params = optimizer.max['params']
best_params_for_pipeline = {
    'gdareg__priors': priors[int(best_params['prior'])],
}

# Set the best parameters in the pipeline
pipe.set_params(**best_params_for_pipeline)
pipe.fit(X_train, y_train)

# Predictions and Evaluation
predictions = pipe.predict(X_test)
print(classification_report(y_test, predictions))

# Print best parameters and performance
print("Best Parameters for Pipeline:", best_params_for_pipeline)
print("Best Performance:", optimizer.max['target'])


|   iter    |  target   |   prior   |
-------------------------------------
| [0m1        [0m | [0m0.7101   [0m | [0m1.124    [0m |
| [0m2        [0m | [0m0.6522   [0m | [0m2.852    [0m |
| [0m3        [0m | [0m0.6522   [0m | [0m2.196    [0m |
| [0m4        [0m | [0m0.7101   [0m | [0m1.796    [0m |
| [95m5        [0m | [95m0.7826   [0m | [95m0.4681   [0m |
| [0m6        [0m | [0m0.7101   [0m | [0m1.124    [0m |
| [0m7        [0m | [0m0.7826   [0m | [0m0.09951  [0m |
| [0m8        [0m | [0m0.7826   [0m | [0m0.284    [0m |
| [0m9        [0m | [0m0.7826   [0m | [0m1.661e-05[0m |
| [0m10       [0m | [0m0.7826   [0m | [0m0.6284   [0m |
| [0m11       [0m | [0m0.7826   [0m | [0m0.5579   [0m |
| [0m12       [0m | [0m0.7826   [0m | [0m0.3703   [0m |
| [0m13       [0m | [0m0.7826   [0m | [0m0.1879   [0m |
| [0m14       [0m | [0m0.7826   [0m | [0m0.04225  [0m |
| [0m15       [0m | [0m0.7826   [0m | [0m0.6878 