In [3]:
import numpy as np
import pandas as pd
from skmultiflow.data.data_stream import DataStream
from NCPR_functions import load_NCPR, train_test_split

from adaptive_xgboost import AdaptiveXGBoostClassifier
from river import metrics, preprocessing, stream, linear_model, tree, ensemble, compat
from sklearn import datasets
from sklearn.model_selection import ParameterGrid
import sklearn
import model_to_river
import my_pipeline
from ensemble_class import EnsembleModel
from tqdm import tqdm
from joblib import Parallel, delayed, parallel_backend

In [None]:
def train_model(stream, model):
    """Incrementally fit ``model`` on every sample of ``stream``.

    Parameters
    ----------
    stream : iterable
        Yields ``(data, target)`` pairs.
    model : object
        Must expose ``transform_one(data)`` and ``learn_one(x, target)``.

    Returns
    -------
    object
        The fitted model. This is the object passed in, unless ``learn_one``
        hands back a replacement object, in which case the latest
        replacement is returned.
    """
    for data, target in stream:
        x = model.transform_one(data)
        updated = model.learn_one(x, target)
        # Depending on the library version, ``learn_one`` either returns the
        # estimator or returns None. Only rebind on a real return value so a
        # None result cannot clobber the model for the next sample.
        if updated is not None:
            model = updated
    return model

In [None]:
def test_model(stream, model):
    for data, target in stream:
        y_pred = model.predict_one(data)
        metric = metric.update(target, y_pred)
    return metric.get()

In [None]:
def river_pipeline(X, y, classifier):
    """Train ``classifier`` on all but the last split and score it on the last.

    Parameters
    ----------
    X : sequence
        Feature splits; ``X[:-1]`` are used for training, ``X[-1]`` for testing.
        Each split is expected to be a 2-D array-like (rows = samples).
    y : sequence
        Target splits, aligned index-by-index with ``X``.
    classifier : object
        Estimator wrapped in ``my_pipeline.Pipeline``.

    Returns
    -------
    The value of ``metrics.Accuracy().get()`` after predicting every sample of
    the last split.
    """
    def _as_stream(features, targets):
        # One name per feature *column*. Sizing the list by the number of rows
        # makes ``iter_array`` zip names against each sample and silently drop
        # features whenever there are fewer rows than columns.
        n_features = len(features[0]) if len(features) else 0
        names = ['x{}'.format(j) for j in range(n_features)]
        return stream.iter_array(features, targets, feature_names=names)

    model = my_pipeline.Pipeline(
        classifier
    )
    metric = metrics.Accuracy()

    for i in range(len(X) - 1):
        model = train_model(_as_stream(X[i], y[i]), model)

    # The test stream is a one-shot iterator, so it is consumed exactly once
    # here; handing it on to a second evaluation loop would see no samples.
    for data, target in _as_stream(X[-1], y[-1]):
        y_pred = model.predict_one(data)      # make a prediction
        updated = metric.update(target, y_pred)
        # ``update`` may return the metric or None depending on the library
        # version; never replace the metric with None.
        if updated is not None:
            metric = updated
    return metric.get()

In [None]:
def try_params(params, model_name, data, labels):
    """Build the model named ``model_name`` with ``params`` and score it.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to the model constructor.
    model_name : str
        One of ``"SGD"``, ``"Decision Tree"``, ``"Random Forest"`` or
        ``"Hoeffding Tree"``.
    data, labels
        Passed unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(params, score)`` where ``score`` is the value returned by
        ``river_pipeline``. ``get_hyperparams`` indexes each result this way.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    xtrain, ytrain, xtest, ytest = train_test_split(data, labels)
    classes = np.unique(ytrain)

    # Comparisons must use ``==``; a single ``=`` here is a SyntaxError.
    if model_name == "SGD":
        model = compat.convert_sklearn_to_river(sklearn.linear_model.SGDClassifier(**params), classes=classes)
    elif model_name == "Decision Tree":
        model = tree.ExtremelyFastDecisionTreeClassifier(**params)
    elif model_name == "Random Forest":
        model = ensemble.AdaptiveRandomForestClassifier(**params)
    elif model_name == "Hoeffding Tree":
        model = tree.HoeffdingAdaptiveTreeClassifier(**params)
    else:
        # Fail loudly instead of hitting an unbound ``model`` further down.
        raise ValueError("Unknown model_name: {!r}".format(model_name))

    X = [xtrain, xtest]
    y = [ytrain, ytest]
    return params, river_pipeline(X, y, model)

In [None]:
def get_hyperparams(param_grid, model_name, data, labels, csv_fname, n_jobs, return_best_param=False):
    """Evaluate every combination in ``param_grid`` and save the scores as CSV.

    Each combination is passed to ``try_params`` in a joblib worker. Every
    result is indexed as ``result[0]`` (a mapping of the parameters tried) and
    ``result[1]`` (its score). One CSV row is written per combination, with
    the parameter names of the first result as columns plus a ``Score`` column.

    Parameters
    ----------
    param_grid : dict or list of dict
        Grid specification handed to ``ParameterGrid``.
    model_name, data, labels
        Forwarded unchanged to ``try_params``.
    csv_fname : str
        Destination path of the CSV file.
    n_jobs : int
        Number of joblib workers.
    return_best_param : bool, default False
        When true, return the parameters of the first highest-scoring result.

    Returns
    -------
    The best parameter mapping when ``return_best_param`` is true, else None.
    """
    grid = ParameterGrid(param_grid)
    jobs = (delayed(try_params)(candidate, model_name, data, labels) for candidate in tqdm(grid))
    results = Parallel(n_jobs=n_jobs)(jobs)

    # Column layout: parameter names of the first result, then the score.
    column_names = [*results[0][0].keys(), 'Score']
    scores = [outcome[1] for outcome in results]
    rows = [[*outcome[0].values(), outcome[1]] for outcome in results]

    pd.DataFrame(rows, columns=column_names).to_csv(csv_fname, header=True)

    if not return_best_param:
        return None
    best_index = scores.index(max(scores))
    return results[best_index][0]

In [4]:
# Load the NCPR inputs from the three data files, then split them into
# train/test parts with the project's own train_test_split helper.
dict_data, NCPR_df = load_NCPR(
    'data/NCPR_bert.npz',
    'data/uniprot-NCPR.tab',
    'data/uniprot-NCPR.fasta',
)
xtrain, ytrain, xtest, ytest = train_test_split(dict_data, NCPR_df)

In [None]:
# Distinct labels present in the training targets.
classes = list(np.unique(ytrain))

# scikit-learn SGD classifier wrapped for use through river's compat layer.
sgd_model = compat.convert_sklearn_to_river(
    sklearn.linear_model.SGDClassifier(loss='log', eta0=0.1, learning_rate='constant'),
    classes=classes,
)

tree_model = tree.ExtremelyFastDecisionTreeClassifier(
    grace_period=100, split_confidence=1e-5, min_samples_reevaluate=100
)

forest_model = ensemble.AdaptiveRandomForestClassifier(n_models=10)

hoeffding_model = tree.HoeffdingAdaptiveTreeClassifier(
    grace_period=100,
    split_confidence=1e-5,
    leaf_prediction='nb',
    nb_threshold=10,
    seed=0,
)

# Disabled: adaptive XGBoost model wrapped through model_to_river.
#xgb_model = model_to_river.Multiflow2RiverClassifier(AdaptiveXGBoostClassifier(update_strategy='push'), classes=[0,1,2])

In [None]:
# Splits in the order river_pipeline expects: training split(s) first, test split last.
X, y = [xtrain, xtest], [ytrain, ytest]

In [None]:
# Run the SGD experiment 50 times on 5 joblib workers and save every score.
# tqdm must wrap an iterable: tqdm(50) raises "'int' object is not iterable".
sgd_results = Parallel(n_jobs=5)(delayed(river_pipeline)(X, y, sgd_model) for _ in tqdm(range(50)))
sgd_score = np.nanmean(sgd_results)
with open('sgd_results.txt', 'w') as f:
    f.write('average score: ' + str(sgd_score) + '\n')
    for run_score in sgd_results:
        f.write(str(run_score) + '\n')

In [None]:
# Run the decision-tree experiment 50 times on 5 joblib workers and save every score.
# tqdm must wrap an iterable: tqdm(50) raises "'int' object is not iterable".
tree_results = Parallel(n_jobs=5)(delayed(river_pipeline)(X, y, tree_model) for _ in tqdm(range(50)))
tree_score = np.nanmean(tree_results)
with open('tree_results.txt', 'w') as f:
    f.write('average score: ' + str(tree_score) + '\n')
    for run_score in tree_results:
        f.write(str(run_score) + '\n')

In [None]:
# Run the random-forest experiment 50 times on 5 joblib workers and save every score.
# tqdm must wrap an iterable: tqdm(50) raises "'int' object is not iterable".
forest_results = Parallel(n_jobs=5)(delayed(river_pipeline)(X, y, forest_model) for _ in tqdm(range(50)))
forest_score = np.nanmean(forest_results)
with open('forest_results.txt', 'w') as f:
    f.write('average score: ' + str(forest_score) + '\n')
    for run_score in forest_results:
        f.write(str(run_score) + '\n')

In [None]:
# Run the Hoeffding-tree experiment 50 times on 5 joblib workers and save every score.
# This cell must evaluate hoeffding_model; it previously re-ran forest_model.
# tqdm must wrap an iterable: tqdm(50) raises "'int' object is not iterable".
hoeffding_results = Parallel(n_jobs=5)(delayed(river_pipeline)(X, y, hoeffding_model) for _ in tqdm(range(50)))
hoeffding_score = np.nanmean(hoeffding_results)
with open('hoeffding_results.txt', 'w') as f:
    f.write('average score: ' + str(hoeffding_score) + '\n')
    for run_score in hoeffding_results:
        f.write(str(run_score) + '\n')

In [None]:
# Combine the four base models, weighting each by the mean score computed in
# the cells above (the last weight is hoeffding_score, lower-case).
ensemble_model = EnsembleModel(
    models=[sgd_model, tree_model, forest_model, hoeffding_model],
    classes=classes,
    weights=[sgd_score, tree_score, forest_score, hoeffding_score],
)
# tqdm must wrap an iterable: tqdm(50) raises "'int' object is not iterable".
ensemble_results = Parallel(n_jobs=5)(delayed(river_pipeline)(X, y, ensemble_model) for _ in tqdm(range(50)))
# Average the ensemble's own runs, not the Hoeffding-tree runs.
ensemble_score = np.nanmean(ensemble_results)
with open('ensemble_results.txt', 'w') as f:
    f.write('average score: ' + str(ensemble_score) + '\n')
    for run_score in ensemble_results:
        f.write(str(run_score) + '\n')