In [19]:
from river import datasets

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier


from skactiveml.classifier import SklearnClassifier
from skactiveml.stream import StreamRandomSampling, PeriodicSampling, FixedUncertainty
from skactiveml.utils import call_func, MISSING_LABEL

import warnings
warnings.filterwarnings('ignore')

In [20]:
# Fraction of each dataset's leading rows used as the initial training set
# (the hyper-parameter grid search is also fitted on these rows).
init_train_ratio = 0.1
# Cumulative cut-off, not a stream length: rows between init_train_ratio and
# stream_ratio form the stream, and the rows after stream_ratio are held out
# as the final validation set.
stream_ratio = 0.9

In [21]:
class WrappedXGBClassifier(XGBClassifier):
    """XGBClassifier subclass whose estimator methods forward to the parent.

    Each overridden method passes its arguments through unchanged to the
    XGBClassifier implementation; the only difference is the plain
    ``(X[, y], **kwargs)`` signatures declared here.
    NOTE(review): presumably this exists so the skactiveml wrapper can
    inspect simple method signatures -- confirm the intent.
    """

    def fit(self, X, y, **kwargs):
        """Fit on ``X``/``y`` via ``XGBClassifier.fit`` and return its result."""
        return super().fit(X, y, **kwargs)
    def predict(self,X, **kwargs):
        """Return the parent's predictions for ``X``."""
        return super().predict(X, **kwargs)
    def predict_proba(self, X, **kwargs):
        """Return the parent's class-probability estimates for ``X``."""
        return super().predict_proba(X, **kwargs)
    def score(self, X, y, **kwargs):
        """Return the parent's ``score`` for ``X`` against labels ``y``."""
        return super().score(X, y, **kwargs)

In [31]:
# Benchmark datasets. Each entry gives the CSV path, the `header` argument for
# pandas.read_csv, the target column ('y') and the feature columns ('X').
datasets_dict = {
    'Elect2': dict(
        path='../data/tests/electricity.csv',
        header='infer',
        y='class',
        X=[
            'date', 'day', 'period', 'nswprice',
            'nswdemand', 'vicprice', 'vicdemand', 'transfer',
        ],
    ),
    # NOTE(review): the target is column 3 and the features are columns 0-2.
    # The commonly distributed iris CSV keeps the class label in column 4 --
    # confirm this matches the layout of the local file.
    'iris': dict(
        path='../data/tests/iris.csv',
        header=None,
        y=3,
        X=[0, 1, 2],
    ),
}

# Candidate models: an unfitted estimator plus the hyper-parameter grid that is
# searched on the initial training split.
models = {
    'LogisticRegression': dict(
        model=LogisticRegression(max_iter=10000, random_state=0),
        params=dict(
            C=[0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100],
        ),
    ),
    'XGBoost': dict(
        model=XGBClassifier(random_state=0),
        params=dict(
            max_depth=[3, 6, 9],
            learning_rate=[0.01, 0.1, 0.2],
            n_estimators=[100, 300, 500],
            gamma=[0, 0.1, 0.2],
            reg_lambda=[0, 1, 10],
            reg_alpha=[0, 1, 10],
        ),
    ),
}

# Labelling budgets handed to the query strategies; 1.0 means "label the
# whole stream" and is handled as a special case by the experiment loop.
budgets = [0.05, 0.1, 0.2, 0.5, 1.0]

In [33]:
# Active-learning benchmark.
#
# For every dataset / model / budget / query-strategy combination:
#   1. tune the model on the initial training split (once per dataset+model),
#   2. score the tuned model on the held-out validation split,
#   3. replay the stream, letting the query strategy decide which samples
#      get labelled (the classifier is not refitted while streaming),
#   4. refit on initial + queried samples and score again.
# One result row is recorded per combination and the CSV is rewritten after
# each one, so partial results survive an interrupted run.
result_records = []          # one dict per finished experiment
results_df = pd.DataFrame()  # rebuilt from result_records after each experiment

for dataset, dataset_config in datasets_dict.items():
    print(f'Dataset: {dataset}')
    df = pd.read_csv(dataset_config['path'], header=dataset_config['header'])

    X = df[dataset_config['X']]
    y = df[dataset_config['y']]

    # Order-preserving split: [0, init_end) initial training,
    # [init_end, stream_end) stream, [stream_end, n) final validation.
    n_rows = X.shape[0]
    init_end = round(n_rows * init_train_ratio)
    stream_end = round(n_rows * stream_ratio)

    X_init_train, y_init_train = X.iloc[:init_end], y.iloc[:init_end]
    X_stream, y_stream = X.iloc[init_end:stream_end], y.iloc[init_end:stream_end]
    X_final_val, y_final_val = X.iloc[stream_end:], y.iloc[stream_end:]
    n_stream = X_stream.shape[0]

    for model_name, model_config in models.items():
        print(f"\tModel: {model_name}")
        grid_search = GridSearchCV(
            model_config['model'],
            model_config['params'],
            cv=2,
            scoring='accuracy',
            n_jobs=-1,
            error_score='raise',
            refit=True,
        )
        grid_search.fit(X_init_train, y_init_train)
        best_estimator = grid_search.best_estimator_

        for budget in budgets:
            print(f'\t\tBudget: {budget}')
            # Fresh strategy objects per budget so no budget state carries over.
            query_strategies = {
                'FixedUncertainty': FixedUncertainty(budget=budget, random_state=0),
                'StreamRandomSampling': StreamRandomSampling(budget=budget, random_state=0),
                'PeriodicSampler': PeriodicSampling(budget=budget, random_state=0),
            }

            for query_strategy_name, query_strategy in query_strategies.items():
                print(f'\t\t\tQuery strategy: {query_strategy_name}')

                if isinstance(best_estimator, XGBClassifier):
                    # XGBClassifier's constructor takes hyper-parameters, not an
                    # estimator to copy, so the tuned settings are forwarded as
                    # keyword arguments (the same way sklearn's clone rebuilds
                    # an estimator).
                    pretrained_model = WrappedXGBClassifier(
                        **best_estimator.get_params(deep=False)
                    )
                else:
                    pretrained_model = best_estimator

                clf = SklearnClassifier(pretrained_model)
                clf.fit(X_init_train, y_init_train)

                initial_accuracy = clf.score(X_final_val, y_final_val)

                if budget == 1.0:
                    # Full budget: every stream sample is labelled, so the
                    # query strategy is bypassed.
                    queried_positions = list(range(n_stream))
                else:
                    # Positions (within the stream) of samples the strategy
                    # chose to label; the rows are gathered once afterwards
                    # instead of growing a DataFrame inside the loop.
                    queried_positions = []
                    for position in range(n_stream):
                        # Single-row frame keeps column names and dtypes.
                        x_cand = X_stream.iloc[[position]]

                        queried_indices, utilities = call_func(
                            query_strategy.query,
                            candidates=x_cand,
                            clf=clf,
                            return_utilities=True,
                            fit_clf=False
                        )

                        budget_manager_param_dict = {"utilities": utilities}

                        call_func(query_strategy.update,
                                  candidates=x_cand,
                                  queried_indices=queried_indices,
                                  budget_manager_param_dict=budget_manager_param_dict)

                        if len(queried_indices):
                            queried_positions.append(position)

                X_final_train = pd.concat(
                    [X_init_train, X_stream.iloc[queried_positions]],
                    axis=0, ignore_index=True,
                )
                y_final_train = pd.concat(
                    [y_init_train, y_stream.iloc[queried_positions]],
                    axis=0, ignore_index=True,
                )

                final_trained_model = clf.fit(X_final_train, y_final_train)
                final_accuracy = final_trained_model.score(X_final_val, y_final_val)

                result_records.append({
                    'dataset': dataset,
                    'model_name': model_name,
                    'model_params': str(grid_search.best_params_),
                    'initial_score': initial_accuracy,
                    'query_strategy': query_strategy_name,
                    'budget': budget,
                    # Guard against an empty stream split on very small datasets.
                    'percentage_queried': (len(queried_positions) / n_stream) if n_stream else 0.0,
                    'final_accuracy': final_accuracy,
                })

                # Rebuilding from the record list gives each row a unique index
                # (0..n-1) rather than every row carrying index 0.
                results_df = pd.DataFrame(result_records)
                results_df.to_csv('../data/results/test_results.csv')

Dataset: Elect2
	Model: LogisticRegression
		Budget: 0.05
			Query strategy: FixedUncertainty
			Query strategy: StreamRandomSampling
			Query strategy: PeriodicSampler
		Budget: 0.1
			Query strategy: FixedUncertainty
			Query strategy: StreamRandomSampling
			Query strategy: PeriodicSampler
		Budget: 0.2
			Query strategy: FixedUncertainty
			Query strategy: StreamRandomSampling
			Query strategy: PeriodicSampler
		Budget: 0.5
			Query strategy: FixedUncertainty
			Query strategy: StreamRandomSampling
			Query strategy: PeriodicSampler
		Budget: 1.0
			Query strategy: FixedUncertainty
			Query strategy: StreamRandomSampling
			Query strategy: PeriodicSampler
	Model: XGBoost
		Budget: 0.05
			Query strategy: FixedUncertainty
			Query strategy: StreamRandomSampling
			Query strategy: PeriodicSampler
		Budget: 0.1
			Query strategy: FixedUncertainty
			Query strategy: StreamRandomSampling
			Query strategy: PeriodicSampler
		Budget: 0.2
			Query strategy: FixedUncertainty
			Query strat

In [34]:
# Show the accumulated results table (one row per experiment run above).
results_df

Unnamed: 0,dataset,model_name,model_params,initial_score,query_strategy,budget,percentage_queried,final_accuracy
0,Elect2,LogisticRegression,{'C': 5},0.532995,FixedUncertainty,0.05,0.035145,0.775105
0,Elect2,LogisticRegression,{'C': 5},0.532995,StreamRandomSampling,0.05,0.049545,0.764953
0,Elect2,LogisticRegression,{'C': 5},0.532995,PeriodicSampler,0.05,0.049986,0.706908
0,Elect2,LogisticRegression,{'C': 5},0.532995,FixedUncertainty,0.1,0.072966,0.783712
0,Elect2,LogisticRegression,{'C': 5},0.532995,StreamRandomSampling,0.1,0.099917,0.77356
0,Elect2,LogisticRegression,{'C': 5},0.532995,PeriodicSampler,0.1,0.1,0.759214
0,Elect2,LogisticRegression,{'C': 5},0.532995,FixedUncertainty,0.2,0.157048,0.791216
0,Elect2,LogisticRegression,{'C': 5},0.532995,StreamRandomSampling,0.2,0.199172,0.780622
0,Elect2,LogisticRegression,{'C': 5},0.532995,PeriodicSampler,0.2,0.2,0.780843
0,Elect2,LogisticRegression,{'C': 5},0.532995,FixedUncertainty,0.5,0.444441,0.795851
