In [1]:
from machinelearning.mlpipeline import MLPipelines

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Dataset location and the name of the target column.
csv_dir = 'data/composite_dataset.csv'
label = 'group'

# Estimator and hyper-parameter search space handed to the pipeline.
# (An earlier LogisticRegression configuration was assigned to the same names
# and immediately overwritten, so it never reached the pipeline; it is removed.)
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [10, 20, 50],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Build the pipeline from the estimator, grid, label and CSV path, then normalize.
mlpipe = MLPipelines(estimator=model, param_grid=param_grid, label=label, csv_dir=csv_dir)
mlpipe.normalize()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# NOTE(review): presumably sets aside held-out test data inside the pipeline;
# the return value is discarded here. Confirm against MLPipelines.create_test_data.
mlpipe.create_test_data()

In [None]:
# Bootstrap evaluation using the 'random_search' optimizer.
# NOTE(review): the result name `eval` shadows Python's builtin eval() for the
# rest of the kernel session; rename once downstream cells have been checked.
eval = mlpipe.bootsrap(
    optimizer='random_search',
    random_iter=3,
    n_iter=3,
)

In [3]:
# Call the pipeline's random_search with n_iter=10; the recorded output below
# shows the best parameters and best accuracy it reported.
mlpipe.random_search(n_iter=10)

Best parameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 5}
Best accuracy: 0.7742857142857142


In [4]:
# Display the pipeline's best_estimator attribute
# (presumably populated by the search above; confirm in MLPipelines).
mlpipe.best_estimator

In [None]:
class BayesianOptimization(MachineLearningEstimator):
    """Optuna-based hyper-parameter search for a fixed set of sklearn classifiers.

    Each Optuna trial builds a classifier of the same class as ``estimator``
    with trial-suggested hyper-parameters, fits it on ``X_train``/``y_train``
    and scores it on ``X_test``/``y_test`` with the scorer named by ``scoring``.

    Note that the objective is evaluated on ``X_test``/``y_test``, so that split
    acts as the validation set of the search.

    Parameters
    ----------
    X_train, y_train : training data passed to ``fit`` in every trial.
    X_test, y_test : data every trial is scored on.
    estimator : estimator instance; only its class name is used to select the
        search space. Must be one of the keys of ``available_clf``.
    param_grid, label, csv_dir : forwarded unchanged to the parent constructor.
    scoring : str, name of an sklearn scorer (default ``'accuracy'``).
    direction : str, passed to ``optuna.create_study`` (default ``'maximize'``).

    Raises
    ------
    ValueError
        If ``scoring`` is not a registered sklearn scorer name, or the
        estimator's class is not one of the supported classifiers.
    """

    def __init__(self, X_train, y_train, X_test, y_test,
                 estimator, param_grid, label, csv_dir,
                 scoring='accuracy', direction='maximize'):
        # Imported locally so the class does not depend on a module-level import.
        # `sklearn.metrics.SCORERS` is not available in newer scikit-learn
        # releases; get_scorer_names() is the supported way to list scorers.
        from sklearn.metrics import get_scorer_names

        super().__init__(estimator, param_grid, label, csv_dir)
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        self.scoring = scoring
        self.direction = direction

        valid_scorers = get_scorer_names()
        if self.scoring not in valid_scorers:
            raise ValueError(f'Invalid scoring metric: {self.scoring}. Select one of the following: {list(valid_scorers)}')

        # Supported classifier classes, keyed by class name.
        self.available_clf = {
            'RandomForestClassifier': RandomForestClassifier,
            'KNeighborsClassifier': KNeighborsClassifier,
            'DecisionTreeClassifier': DecisionTreeClassifier,
            'SVC': SVC,
            'GradientBoostingClassifier': GradientBoostingClassifier
        }

        # The estimator is an instance, so it is matched by class name; an
        # unsupported estimator is rejected here rather than in the first trial.
        if self._estimator_name() not in self.available_clf:
            raise ValueError(f'Invalid estimator: {self.estimator}. Select one of the following: {list(self.available_clf.keys())}')

        # Per-classifier factories: each takes an Optuna trial and returns a
        # classifier configured with hyper-parameters suggested by that trial.
        self.bayesian_clfs = {
            'RandomForestClassifier': lambda trial: RandomForestClassifier(
                n_estimators=trial.suggest_int('n_estimators', 2, 200),
                criterion='gini',  # or trial.suggest_categorical('criterion', ['gini', 'entropy'])
                max_depth=trial.suggest_int('max_depth', 1, 50),
                min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
                min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
                bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
                n_jobs=-1,
            ),
            'KNeighborsClassifier': lambda trial: KNeighborsClassifier(
                n_neighbors=trial.suggest_int('n_neighbors', 2, 15),
                weights=trial.suggest_categorical('weights', ['uniform', 'distance']),
                algorithm=trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
                p=trial.suggest_int('p', 1, 2),
                leaf_size=trial.suggest_int('leaf_size', 5, 50),
                n_jobs=-1
            ),
            'DecisionTreeClassifier': lambda trial: DecisionTreeClassifier(
                # sklearn estimator parameters are keyword-only, so `criterion`
                # must be passed by name.
                criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
                splitter=trial.suggest_categorical('splitter', ['best', 'random']),
                max_depth=trial.suggest_int('max_depth', 1, 100),
                min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
                min_weight_fraction_leaf=trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
            ),
            'SVC': lambda trial: SVC(
                C=trial.suggest_int('C', 1, 10),
                kernel=trial.suggest_categorical('kernel', ['linear', 'rbf', 'sigmoid']),
                probability=trial.suggest_categorical('probability', [True, False]),
                shrinking=trial.suggest_categorical('shrinking', [True, False]),
                decision_function_shape=trial.suggest_categorical('decision_function_shape', ['ovo', 'ovr'])
            ),
            'GradientBoostingClassifier': lambda trial: GradientBoostingClassifier(
                loss=trial.suggest_categorical('loss', ['log_loss', 'exponential']),
                learning_rate=trial.suggest_float('learning_rate', 0.01, 0.5),
                n_estimators=trial.suggest_int('n_estimators', 2, 200),
                criterion=trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error']),
                max_depth=trial.suggest_int('max_depth', 1, 50),
                min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
                min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            )
        }

    def _estimator_name(self):
        """Return the class name of the wrapped estimator instance."""
        return self.estimator.__class__.__name__

    def create_model(self, trial):
        """Build a classifier for ``trial`` using the search space of the estimator's class.

        Raises
        ------
        ValueError
            If no search space is defined for the estimator's class.
        """
        name = self._estimator_name()
        if name not in self.bayesian_clfs:
            raise ValueError('Classifier not supported')
        # The factories are keyed by class name, not by the estimator instance.
        return self.bayesian_clfs[name](trial)

    def objective(self, trial):
        """Optuna objective: fit the trial's model on the training split and score it on the test split."""
        from sklearn.metrics import get_scorer

        model = self.create_model(trial=trial)
        model.fit(self.X_train, self.y_train)
        # Use the scorer's public call interface: it applies the keyword
        # arguments and sign bound to the scorer name (e.g. the averaging mode of
        # 'f1_macro'), which calling the private `_score_func` would drop.
        return get_scorer(self.scoring)(model, self.X_test, self.y_test)

    def run_optimization(self, n_trials=10):
        """Run the Optuna study and store an estimator built from the best parameters.

        Sets ``self.clf`` to the estimator's class and ``self.model`` to a new,
        unfitted instance of that class constructed from ``study.best_params``.

        Parameters
        ----------
        n_trials : int, number of Optuna trials (default 10).

        Returns
        -------
        The finished ``optuna`` study.

        Raises
        ------
        ValueError
            If the estimator's class is not supported.
        """
        name = self._estimator_name()
        # Validate before the lookup so an unsupported estimator produces the
        # intended ValueError instead of a KeyError.
        if name not in self.bayesian_clfs:
            raise ValueError('Classifier not supported')
        self.clf = self.available_clf[name]

        study = optuna.create_study(direction=self.direction)
        study.optimize(self.objective, n_trials=n_trials)

        self.model = self.clf(**study.best_params)

        return study



In [None]:
# Always raises ZeroDivisionError, so a top-to-bottom run stops at this cell.
# NOTE(review): presumably a deliberate barrier between the cells above and
# the experiment below; confirm, and consider removing it.
1/0

ZeroDivisionError: division by zero

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model  import LogisticRegression
from validation import boostrap, nested_cv
from validation.bayesian_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from dataloader import DataLoader

In [None]:
# Path to the dataset CSV, relative to the notebook's working directory.
PATH = 'data/composite_dataset.csv'

# Load the dataset with 'group' as the label column, then apply the
# preprocessing steps in this order: standard normalization, selection of
# 25 features, categorical encoding.
# NOTE(review): return values are discarded, so each step presumably mutates
# the loader in place; re-running this cell without recreating `dataloader`
# may therefore not be idempotent. Confirm in DataLoader.
dataloader = DataLoader(label='group', csv_dir=PATH)
dataloader.normalize(method='standard')
dataloader.feature_selection(n_features=25)
dataloader.encode_categorical()
print(dataloader)

In [None]:
# L1-penalised logistic regression; only C varies across the grid.
lr = LogisticRegression()
param_grid = {
    'penalty': ['l1'],
    'C': [1, 10, 100, 200, 500],
    'solver': ['liblinear'],
    'max_iter': [10000],
}

# Bootstrap evaluation over 5 iterations.
# NOTE(review): scoring is spelled 'mcc' here but 'matthews_corrcoef' in the
# nested_cv call; confirm which names `boostrap` accepts.
lr_scores = boostrap(
    estimator=lr,
    X=dataloader.X,
    y=dataloader.y,
    param_grid=param_grid,
    scoring='mcc',
    n_iterations=5,
)

In [None]:
# Nested cross-validation of the logistic regression, scored with
# 'matthews_corrcoef' in both the inner and the outer loop.
clf, nested_scores = nested_cv(
    lr,
    param_grid=param_grid,
    inner_scoring='matthews_corrcoef',
    outer_scoring='matthews_corrcoef',
    X=dataloader.X,
    y=dataloader.y,
    num_trials=10,
)

In [None]:
import matplotlib.pyplot as plt

# Box plot of the nested cross-validation scores, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.boxplot(nested_scores)
ax.set_xlabel('Model')
ax.set_ylabel('Scores')
ax.set_title('Nested Cross Validation Scores')
plt.show()



In [None]:
import sklearn
from sklearn.metrics import get_scorer

# Scorer names accepted by get_scorer can be listed with
# sklearn.metrics.get_scorer_names().

# Fit on the full dataset and predict on the same rows, so the score printed
# below is an in-sample (training) score.
lr.fit(dataloader.X, dataloader.y)
y_pred = lr.predict(dataloader.X)
y_true = dataloader.y

# Look up the scorer object registered under this name.
scorer_key = 'matthews_corrcoef'
scorer = get_scorer(scorer_key)

# Call the scorer through its public (estimator, X, y) interface instead of the
# private `_score_func`, so any keyword arguments and sign bound to the scorer
# name are applied.
score = scorer(lr, dataloader.X, y_true)

print("Score using the scorer object:", score)
