In [97]:
from river import datasets

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier

from skactiveml.stream import StreamRandomSampling, PeriodicSampling

In [96]:
init_train_ratio = 0.1
stream_ratio = 0.9

In [99]:
datasets_dict = {
    'Elect2': {'path': '../data/electricity.csv',
               'header':'infer',
               'y':'class',
               'X':['date', 'day', 'period', 'nswprice', 'nswdemand', 'vicprice', 'vicdemand', 'transfer']
               },
     'iris': {'path': '../data/iris.csv',
              'header':None,
              'y': 3,
              'X': [0, 1, 2]}
}

models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter = 10000),
        'params': {
            'solver': ['lbfgs'],
            'penalty': ['l2'],
            'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(),
        'params': {
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 300, 500],
        'gamma': [0, 0.1, 0.2],
        'reg_lambda': [0, 1, 10],
        'reg_alpha': [0, 1, 10]
        }
    }
}

query_strategies = {
    'StreamRandomSampling': StreamRandomSampling(),
    'PeriodicSampler': PeriodicSampling(),
}

In [100]:
XGBClassifier().get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)>

In [120]:
for dataset in datasets_dict.keys():
    print(f'Iniciating {dataset}')
    df = pd.read_csv(
        datasets_dict[dataset]['path'],
        header =  datasets_dict[dataset]['header']
        )

    X = df[datasets_dict[dataset]['X']]

    y = df[datasets_dict[dataset]['y']]

    X_init_train = X.iloc[:round(X.shape[0]*init_train_ratio)]
    y_init_train = y.iloc[:round(y.shape[0]*init_train_ratio)]

    X_stream = X.iloc[round(X.shape[0]*init_train_ratio):round(X.shape[0]*stream_ratio)]
    y_stream = y.iloc[round(y.shape[0]*init_train_ratio):round(y.shape[0]*stream_ratio)]

    X_final_val = X.iloc[round(X.shape[0]*stream_ratio):]
    y_final_val = y.iloc[round(y.shape[0]*stream_ratio):]

    best_models = {}


    for model_name, model_config in models.items():
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(model_config['model'], model_config['params'], cv=2, scoring='accuracy', n_jobs=-1, error_score = 'raise', refit=True)
        grid_search.fit(X_init_train, y_init_train)

        best_models[model_name] = {
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_
        }

        pretrained_model = grid_search.best_estimator_



        print(f"Best params for {model_name} in {dataset} pretrain: {grid_search.best_params_}")
        print(f"Best accuracy (cross validation): {grid_search.best_score_:.4f}")

    print(best_models)

Iniciating Elect2
Training LogisticRegression...
Best params for LogisticRegression in Elect2 pretrain: {'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'}
Best accuracy (cross validation): 0.7888
Training XGBoost...
Best params for XGBoost in Elect2 pretrain: {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 10, 'reg_lambda': 0}
Best accuracy (cross validation): 0.8159
{'LogisticRegression': {'best_estimator': LogisticRegression(C=5, max_iter=10000), 'best_params': {'C': 5, 'penalty': 'l2', 'solver': 'lbfgs'}, 'best_score': np.float64(0.7887853653879502)}, 'XGBoost': {'best_estimator': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, grow_policy=None, importance_type=None,
              interaction_constraint

In [86]:
df = pd.read_csv('../data/iris.csv', header = None)
# df = df.drop(columns=[0,1])
df

Unnamed: 0,0,1,2,3
0,3.5,1.3,0.3,0
1,3.0,5.5,2.1,2
2,3.3,4.7,1.6,1
3,3.6,1.0,0.2,0
4,3.1,1.5,0.2,0
...,...,...,...,...
145,2.7,5.3,1.9,2
146,2.7,5.1,1.9,2
147,3.7,1.5,0.2,0
148,2.9,1.4,0.2,0


In [118]:
df = pd.read_csv('../data/electricity.csv')
df

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,0.0000,2,0.000000,0.056443,0.439155,0.003467,0.422915,0.414912,1
1,0.0000,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,1
2,0.0000,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,1
3,0.0000,2,0.063830,0.045485,0.314639,0.003467,0.422915,0.414912,1
4,0.0000,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,0
...,...,...,...,...,...,...,...,...,...
45307,0.9158,7,0.914894,0.044224,0.340672,0.003033,0.255049,0.405263,0
45308,0.9158,7,0.936170,0.044884,0.355549,0.003072,0.241326,0.420614,0
45309,0.9158,7,0.957447,0.043593,0.340970,0.002983,0.247799,0.362281,0
45310,0.9158,7,0.978723,0.066651,0.329366,0.004630,0.345417,0.206579,1


In [None]:
# le = LabelEncoder()
# le.fit(df['class'])

In [None]:
# df['class'] = le.transform(df['class'])
# df

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,0.0000,2,0.000000,0.056443,0.439155,0.003467,0.422915,0.414912,1
1,0.0000,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,1
2,0.0000,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,1
3,0.0000,2,0.063830,0.045485,0.314639,0.003467,0.422915,0.414912,1
4,0.0000,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,0
...,...,...,...,...,...,...,...,...,...
45307,0.9158,7,0.914894,0.044224,0.340672,0.003033,0.255049,0.405263,0
45308,0.9158,7,0.936170,0.044884,0.355549,0.003072,0.241326,0.420614,0
45309,0.9158,7,0.957447,0.043593,0.340970,0.002983,0.247799,0.362281,0
45310,0.9158,7,0.978723,0.066651,0.329366,0.004630,0.345417,0.206579,1


In [None]:
# df

Unnamed: 0.1,Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,0,0.0000,2,0.000000,0.056443,0.439155,0.003467,0.422915,0.414912,1
1,1,0.0000,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,1
2,2,0.0000,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,1
3,3,0.0000,2,0.063830,0.045485,0.314639,0.003467,0.422915,0.414912,1
4,4,0.0000,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,0
...,...,...,...,...,...,...,...,...,...,...
45307,45307,0.9158,7,0.914894,0.044224,0.340672,0.003033,0.255049,0.405263,0
45308,45308,0.9158,7,0.936170,0.044884,0.355549,0.003072,0.241326,0.420614,0
45309,45309,0.9158,7,0.957447,0.043593,0.340970,0.002983,0.247799,0.362281,0
45310,45310,0.9158,7,0.978723,0.066651,0.329366,0.004630,0.345417,0.206579,1


In [None]:
# df = df.drop(columns = 'Unnamed: 0')

In [119]:
# df.to_csv('../data/electricity.csv', index = False)

In [None]:
# df.to_csv('../data/iris.csv', header=None, index = False)
