In [2]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

from xgboost import XGBClassifier


from skactiveml.classifier import SklearnClassifier
from skactiveml.stream import (
    FixedUncertainty,
    VariableUncertainty,
    Split,
    StreamProbabilisticAL,
    StreamRandomSampling,
    PeriodicSampling,
    RandomVariableUncertainty,
    StreamDensityBasedAL,
    CognitiveDualQueryStrategyRan,
    CognitiveDualQueryStrategyFixUn,
    CognitiveDualQueryStrategyRanVarUn,
    CognitiveDualQueryStrategyVarUn,
)
from skactiveml.utils import call_func

from datetime import datetime
import pytz

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): the blanket 'ignore' filter also hides warnings raised by the
# modelling libraries imported above — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Timezone used to timestamp the progress messages printed by the experiment loop.
tz_utc_minus_3 = pytz.timezone("America/Sao_Paulo")

In [4]:
# Cut-off points (as fractions of the row count) for the ordered three-way split
# performed in the experiment loop:
#   rows [0, init_train_ratio)            -> initial training set
#   rows [init_train_ratio, stream_ratio) -> simulated stream
#   rows [stream_ratio, 1]                -> final validation set
# Note that stream_ratio is the cumulative end of the stream, not its share of the
# data: with these values the stream holds 80% of the rows.
init_train_ratio = 0.1
stream_ratio = 0.9

In [5]:
def custom_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before the comparison, so a column-vector
    prediction of shape (n, 1) is compared element by element with a 1-D
    label vector instead of being broadcast into an (n, n) matrix, which
    would silently yield a wrong score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must contain as many elements as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise equality (``nan`` for empty inputs).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()

    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {true_labels.shape[0]} and {pred_labels.shape[0]}"
        )

    return np.mean(true_labels == pred_labels)

In [6]:
class WrappedXGBClassifier(XGBClassifier):
    """XGBClassifier subclass whose core methods forward unchanged to the parent.

    Every override delegates straight to ``super()`` and adds no behaviour.
    NOTE(review): presumably the overrides exist so the methods expose a plain
    ``(X, y, **kwargs)`` signature to code that wraps the estimator — confirm
    before removing them.
    """
    def fit(self, X, y, **kwargs):
        """Fit on ``X``/``y`` by delegating to ``XGBClassifier.fit``; returns its result."""
        return super().fit(X, y, **kwargs)
    def predict(self,X, **kwargs):
        """Return the parent's predictions for ``X``."""
        return super().predict(X, **kwargs)
    def predict_proba(self, X, **kwargs):
        """Return the parent's class-probability estimates for ``X``."""
        return super().predict_proba(X, **kwargs)
    def score(self, X, y, **kwargs):
        """Return the parent's ``score`` for ``X`` against ``y``."""
        return super().score(X, y, **kwargs)

class WrappedCatBoostClassifier(CatBoostClassifier):
    """CatBoostClassifier subclass whose core methods forward unchanged to the parent.

    Every override delegates straight to ``super()`` and adds no behaviour.
    NOTE(review): presumably the overrides exist so the methods expose a plain
    ``(X, y, **kwargs)`` signature to code that wraps the estimator — confirm
    before removing them.
    """
    def fit(self, X, y, **kwargs):
        """Fit on ``X``/``y`` by delegating to ``CatBoostClassifier.fit``; returns its result."""
        return super().fit(X, y, **kwargs)

    def predict(self, X, **kwargs):
        """Return the parent's predictions for ``X``."""
        return super().predict(X, **kwargs)

    def predict_proba(self, X, **kwargs):
        """Return the parent's class-probability estimates for ``X``."""
        return super().predict_proba(X, **kwargs)

    def score(self, X, y, **kwargs):
        """Return the parent's ``score`` for ``X`` against ``y``."""
        return super().score(X, y, **kwargs)

In [7]:
# Datasets to evaluate. Each entry provides:
#   'path'   - CSV file passed to pd.read_csv
#   'header' - value forwarded to pd.read_csv(header=...)
#   'y'      - label of the target column
#   'X'      - labels of the feature columns
# With 'header': None the file is read without a header row, so 'X' and 'y'
# are integer column labels.
# NOTE(review): the iris entry uses column 3 as target and only columns 0-2 as
# features — confirm that column 3 of this file really is the class label.
datasets_dict = {
    # 'Elect2': {'path': '../data/tests/electricity.csv',
    #            'header':'infer',
    #            'y':'class',
    #            'X':['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice', 'vicdemand', 'transfer']
    #            },
     'iris': {'path': '../data/tests/iris.csv',
              'header':None,
              'y': 3,
              'X': [0, 1, 2]}
}

# Candidate classifiers. Each entry pairs an unfitted base estimator ('model')
# with the hyperparameter grid ('params') that the experiment loop hands to
# GridSearchCV. The keys 'XGBoost' and 'CatBoost' are matched by name in that
# loop to wrap the tuned estimator, so renaming them changes behaviour.
models = {
    # 8 grid points
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=10000, random_state=0),
        'params': {
            'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100]
        }
    },
    # 3^6 = 729 grid points — by far the largest search in this dictionary
    'XGBoost': {
        'model': XGBClassifier(random_state=0),
        'params': {
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            'n_estimators': [100, 300, 500],
            'gamma': [0, 0.1, 0.2],
            'reg_lambda': [0, 1, 10],
            'reg_alpha': [0, 1, 10]
        }
    },
    # 8 grid points; probability=True so the estimator exposes predict_proba
    'SVM': {
        'model': SVC(kernel = 'rbf', probability = True, random_state=0),
        'params': {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto']
        }
    },
    # 36 grid points
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=0),
        'params': {
            'max_depth': [None, 5, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }
    },
    # 81 grid points
    'RandomForest': {
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [100, 200, 500],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }
    },
    # 27 grid points; file output and logging are switched off
    'CatBoost': {
        'model': CatBoostClassifier(random_state=0, allow_writing_files=False, thread_count = -1, verbose = 0),
        'params': {
            'depth': [4, 6, 10],
            'learning_rate': [0.01, 0.1, 0.2],
            'iterations': [100, 500, 1000]
        }
    }
}

# Labelling budgets handed to every stream query strategy.
# 1.0 is special-cased in the experiment loop: the whole stream is added to the
# training set and no query strategy is run.
budgets = [0.05, 0.1, 0.2, 0.5, 1.0]

In [8]:
# Directory that receives the results CSV, rewritten after every finished experiment.
results_path = '../data/results/test'

In [15]:
# Experiment grid: dataset -> model -> budget -> query strategy.
#
# For every dataset the rows are split in file order into an initial training
# set, a simulated stream and a final validation set. Each model is tuned with
# a grid search on the initial training set only. For every budget and query
# strategy, the stream is replayed sample by sample; samples selected by the
# strategy are added to the training set, the model is refit on the enlarged
# set and scored on the validation set. One result row is appended (and the
# CSV rewritten) per finished experiment.


def _result_row(dataset, model_name, best_params, initial_accuracy,
                query_strategy_name, budget, n_queried, n_stream, final_accuracy):
    """Build the one-row results frame describing a single finished experiment."""
    return pd.DataFrame(data = {
        'dataset': [dataset],
        'model_name': [model_name],
        'model_params': [str(best_params)],
        'initial_score': [initial_accuracy],
        'query_strategy': [query_strategy_name],
        'budget': [budget],
        'percentage_queried': [n_queried/n_stream],
        'final_accuracy': [final_accuracy]
    })


def _collect_queried_samples(query_strategy_name, query_strategy, clf,
                             X_labeled, y_labeled, X_stream, y_stream):
    """Replay the stream and return the labeled set extended by every queried sample.

    Each stream sample is offered to ``query_strategy`` as a single candidate.
    ``clf`` is used as given and is never refit during the replay. The utilities
    returned by ``query`` are forwarded to ``update`` on every step, whether or
    not the sample was queried.

    Returns the ``(X_labeled, y_labeled)`` arrays after the whole stream was seen.
    """
    for i in range(len(X_stream)):
        # Slicing keeps the (1, n_features) / (1,) shapes and, unlike calling
        # .reshape on a single label, also works for non-NumPy label scalars.
        x_cand = X_stream[i:i + 1]
        y_cand = y_stream[i:i + 1]

        # Exact comparison: the former `in ('StreamProbabilisticAL')` was a
        # substring test on a string, not a tuple membership test.
        if query_strategy_name == 'StreamProbabilisticAL':
            # This strategy is additionally given the labeled data collected so far.
            queried_indices, utilities = query_strategy.query(
                candidates=x_cand,
                clf=clf,
                X=X_labeled,
                y=y_labeled,
                return_utilities=True
            )
        else:
            # call_func forwards only the keyword arguments the strategy accepts.
            queried_indices, utilities = call_func(
                query_strategy.query,
                candidates=x_cand,
                clf=clf,
                return_utilities=True,
                fit_clf=False
            )

        call_func(query_strategy.update,
                  candidates = x_cand,
                  queried_indices = queried_indices,
                  budget_manager_param_dict={"utilities": utilities})

        if len(queried_indices):
            X_labeled = np.vstack([X_labeled, x_cand])
            y_labeled = np.concatenate([y_labeled, y_cand], axis=0)

    return X_labeled, y_labeled


results_df = pd.DataFrame()

# The CSV is rewritten after every experiment, so its directory must exist up front.
os.makedirs(results_path, exist_ok=True)
results_file = f'{results_path}/CC18_results_google_colab_4.csv'

for dataset in datasets_dict.keys():
    print(f'Dataset: {dataset}')
    df = pd.read_csv(
        datasets_dict[dataset]['path'],
        header =  datasets_dict[dataset]['header']
        )

    X = df[datasets_dict[dataset]['X']]
    y = df[datasets_dict[dataset]['y']]

    # Ordered three-way split: initial training | stream | final validation.
    init_end = round(X.shape[0]*init_train_ratio)
    stream_end = round(X.shape[0]*stream_ratio)

    X_init_train = X.iloc[:init_end]
    y_init_train = y.iloc[:init_end]

    X_stream = X.iloc[init_end:stream_end].to_numpy()
    y_stream = y.iloc[init_end:stream_end].to_numpy()

    X_final_val = X.iloc[stream_end:]
    y_final_val = y.iloc[stream_end:]

    for model_name, model_config in models.items():
        print(f'\tModel: {model_name} - {datetime.now(tz_utc_minus_3).strftime("%H:%M:%S")}')

        grid_search = GridSearchCV(model_config['model'], model_config['params'], cv=5, scoring='accuracy', n_jobs=-1, error_score = 'raise', refit=True)
        grid_search.fit(X_init_train, y_init_train)

        # The wrapper constructors take hyperparameters, not an estimator, so the
        # tuned estimator must not be passed positionally. Rebuild the wrapper
        # from the tuned hyperparameters instead.
        best_estimator = grid_search.best_estimator_
        if model_name == 'XGBoost':
            pretrained_model = WrappedXGBClassifier(**best_estimator.get_params())
        elif model_name == 'CatBoost':
            pretrained_model = WrappedCatBoostClassifier(**best_estimator.get_params())
        else:
            pretrained_model = best_estimator

        clf_factory = lambda: SklearnClassifier(pretrained_model)

        # Baseline: the tuned model fitted on the initial training set only. The same
        # fitted classifier is what the query strategies consult during the replay.
        clf_inicial = clf_factory().fit(X_init_train, y_init_train)

        y_pred = clf_inicial.predict(X_final_val)

        initial_accuracy = custom_accuracy(y_final_val, y_pred)

        for budget in budgets:
            print(f'\t\tBudget: {budget} - {datetime.now(tz_utc_minus_3).strftime("%H:%M:%S")}')
            # Fresh strategy instances per budget, so no state carries over between runs.
            query_strategies = {
                'FixedUncertainty': FixedUncertainty(budget=budget, random_state=0),
                'VariableUncertainty': VariableUncertainty(budget=budget, random_state=0),
                'Split': Split(budget=budget, random_state=0),
                'StreamProbabilisticAL': StreamProbabilisticAL(budget=budget, metric = 'rbf', random_state=0),
                'StreamRandomSampling': StreamRandomSampling(budget=budget, random_state=0),
                'PeriodicSampling': PeriodicSampling(budget=budget, random_state=0),
                'RandomVariableUncertainty': RandomVariableUncertainty(budget=budget, random_state=0),
                'StreamDensityBasedAL': StreamDensityBasedAL(budget=budget, random_state=0),
                'CognitiveDualQueryStrategyRan': CognitiveDualQueryStrategyRan(budget=budget, random_state=0),
                'CognitiveDualQueryStrategyFixUn': CognitiveDualQueryStrategyFixUn(budget=budget, random_state=0),
                'CognitiveDualQueryStrategyRanVarUn': CognitiveDualQueryStrategyRanVarUn(budget=budget, random_state=0),
                'CognitiveDualQueryStrategyVarUn': CognitiveDualQueryStrategyVarUn(budget=budget, random_state=0),
            }

            if budget == 1.0:
                # Full budget: every stream sample is labeled, so no strategy is needed.
                # The stream frame must carry the original column labels; otherwise
                # concat aligns on mismatching labels for datasets with named columns.
                X_final_train = pd.concat(
                    [X_init_train, pd.DataFrame(X_stream, columns=X_init_train.columns)],
                    axis = 0, ignore_index=True)
                y_final_train = pd.concat([y_init_train, pd.Series(y_stream)], axis = 0, ignore_index=True)

                clf = clf_factory()

                final_trained_model = clf.fit(X_final_train, y_final_train)

                y_pred = final_trained_model.predict(X_final_val)

                final_accuracy = custom_accuracy(y_final_val, y_pred)

                experiment_result = _result_row(
                    dataset, model_name, grid_search.best_params_, initial_accuracy,
                    'NONE: BUDGET = 1', budget,
                    X_final_train.shape[0] - X_init_train.shape[0], X_stream.shape[0],
                    final_accuracy)

                results_df = pd.concat([results_df, experiment_result], axis = 0)
                results_df.to_csv(results_file)

            else:
                for query_strategy_name, query_strategy in query_strategies.items():
                    print(f'\t\t\tQuery strategy: {query_strategy_name} - {datetime.now(tz_utc_minus_3).strftime("%H:%M:%S")}')

                    X_queried, y_queried = _collect_queried_samples(
                        query_strategy_name, query_strategy, clf_inicial,
                        X_init_train.to_numpy(), y_init_train.to_numpy(),
                        X_stream, y_stream)

                    clf = clf_factory()

                    X_final_train = pd.DataFrame(X_queried, columns=X_init_train.columns)
                    y_final_train = pd.Series(y_queried.flatten())

                    final_trained_model = clf.fit(X_final_train, y_final_train)

                    y_pred = final_trained_model.predict(X_final_val)

                    final_accuracy = custom_accuracy(y_final_val, y_pred)

                    experiment_result = _result_row(
                        dataset, model_name, grid_search.best_params_, initial_accuracy,
                        query_strategy_name, budget,
                        X_final_train.shape[0] - X_init_train.shape[0], X_stream.shape[0],
                        final_accuracy)

                    results_df = pd.concat([results_df, experiment_result], axis = 0)
                    results_df.to_csv(results_file)

Dataset: iris
	Model: LogisticRegression - 00:15:38
		Budget: 0.05 - 00:15:38
			Query strategy: FixedUncertainty - 00:15:38
			Query strategy: VariableUncertainty - 00:15:38
			Query strategy: Split - 00:15:38
			Query strategy: StreamProbabilisticAL - 00:15:38
			Query strategy: StreamRandomSampling - 00:15:38
			Query strategy: PeriodicSampling - 00:15:38
			Query strategy: RandomVariableUncertainty - 00:15:38
			Query strategy: StreamDensityBasedAL - 00:15:38
			Query strategy: CognitiveDualQueryStrategyRan - 00:15:38
			Query strategy: CognitiveDualQueryStrategyFixUn - 00:15:38
			Query strategy: CognitiveDualQueryStrategyRanVarUn - 00:15:39
			Query strategy: CognitiveDualQueryStrategyVarUn - 00:15:39
		Budget: 0.1 - 00:15:39
			Query strategy: FixedUncertainty - 00:15:39
			Query strategy: VariableUncertainty - 00:15:39
			Query strategy: Split - 00:15:39
			Query strategy: StreamProbabilisticAL - 00:15:39
			Query strategy: StreamRandomSampling - 00:15:39
			Query strategy: Per