In [None]:
! pip install nbimporter
! pip install stree

In [None]:
import nbimporter
import time
import joblib

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from Package import co2_forest

# import your notebook with your datasets

In [None]:
def Run_RF(data, name, param_grid=None):
    """Fit a RandomForestClassifier (optionally through a grid search) and score it.

    Parameters
    ----------
    data : sequence of four items
        Unpacked as ``(x_train, y_train, x_test, y_test)``.
    name : str
        Prefix of the output file; the fitted model is written to
        ``name + "_rf.joblib"``.
    param_grid : dict, optional
        Parameter grid handed to ``GridSearchCV`` (accuracy scoring, 3-fold
        stratified CV). When it is empty or ``None`` a plain
        ``RandomForestClassifier`` is fitted instead.

    Returns
    -------
    tuple
        ``(accuracy, fit_seconds)``: the accuracy of the predictions on
        ``x_test`` and the elapsed time of the ``fit`` call alone (prediction
        and model dumping are not included).
    """
    x_train, y_train, x_test, y_test = data

    forest = RandomForestClassifier(random_state=0, n_jobs=-1)
    if param_grid:
        model = GridSearchCV(forest,
                             param_grid,
                             scoring="accuracy",
                             n_jobs=-1,
                             cv=StratifiedKFold(n_splits=3, random_state=0, shuffle=True))
    else:
        model = forest

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    model.fit(x_train, y_train)
    elapsed = time.perf_counter() - start

    # Persist the model before predicting: if predict() fails, the (possibly
    # long) training run is still saved on disk.
    joblib.dump(model, name + "_rf.joblib")

    y_pred = model.predict(x_test)
    return accuracy_score(y_test, y_pred), elapsed

def Run_CO2_forest(data, name, params_co2, param_grid=None):
    """Fit a CO2_forest (optionally through a grid search) and score it.

    Parameters
    ----------
    data : sequence of four items
        Unpacked as ``(x_train, y_train, x_test, y_test)``.
    name : str
        Prefix of the output file; the fitted model is written to
        ``name + "_co2.joblib"``.
    params_co2 : dict
        Keyword arguments for ``co2_forest.CO2_forest``. ``n_jobs`` defaults
        to ``-1`` but may be overridden by including it here.
    param_grid : dict, optional
        Parameter grid handed to ``GridSearchCV`` (accuracy scoring, 3-fold
        stratified CV). When it is empty or ``None`` the forest is fitted
        directly.

    Returns
    -------
    tuple
        ``(accuracy, fit_seconds)``: the accuracy of the predictions on
        ``x_test`` and the elapsed time of the ``fit`` call alone.
    """
    x_train, y_train, x_test, y_test = data

    # Merge instead of passing n_jobs=-1 next to **params_co2: a caller-supplied
    # "n_jobs" then overrides the default rather than raising
    # "got multiple values for keyword argument".
    forest = co2_forest.CO2_forest(**{"n_jobs": -1, **params_co2})
    if param_grid:
        model = GridSearchCV(forest,
                             param_grid,
                             scoring="accuracy",
                             n_jobs=-1,
                             cv=StratifiedKFold(n_splits=3, random_state=0, shuffle=True))
    else:
        model = forest

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    model.fit(x_train, y_train)
    elapsed = time.perf_counter() - start

    # Saved before predicting so the trained model survives a predict() failure.
    joblib.dump(model, name + "_co2.joblib")

    y_pred = model.predict(x_test)
    return accuracy_score(y_test, y_pred), elapsed


def BenchMarks(DatasetName, name, mode="all", *,
               params_co2={"n_estimators": 10,
                           "max_features": "log2",
                           "max_depth": 20,
                           "min_samples_split": 100},
               param_grid_rf={
                   'max_depth': [5, 20, 50],
                   'min_samples_leaf': [10, 30, 50, 100],
                   'min_samples_split': [40, 70, 100]
               },
               param_grid_co2={}):
    """
    Benchmark RandomForest and/or CO2_Forest on one dataset.

    DatasetName: object whose ``load().prepare()`` chain yields the data that
        is forwarded unchanged to Run_RF / Run_CO2_forest.
    name: prefix forwarded to both runners (they use it for the dumped files).
    mode: "all", "rf" or "co2" - which model(s) to run. Any other value makes
        the function return None without loading the data.
    params_co2: constructor parameters forwarded to Run_CO2_forest.
    param_grid_rf / param_grid_co2: parameter grids forwarded to the runners.

    Returns a dict {"rf": ..., "co2": ...} holding each runner's return value,
    or None for a model that was not requested. CO2 runs before RF.
    """
    valid_modes = ("all", "rf", "co2")
    if mode not in valid_modes:
        return None

    data = DatasetName.load().prepare()
    rf_outcome = co2_outcome = None

    if mode in ("all", "co2"):
        print("Run co2")
        co2_outcome = Run_CO2_forest(data, name, params_co2, param_grid_co2)

    if mode in ("all", "rf"):
        print("Run rf")
        rf_outcome = Run_RF(data, name, param_grid_rf)

    # Drop the local reference to the prepared dataset before returning.
    del data
    return {"rf": rf_outcome, "co2": co2_outcome}

In [None]:
def Test_dataset(name=None, dataSet=None):
    """Run both benchmarks on one dataset and print a small report.

    name: label printed in the report and forwarded to BenchMarks.
    dataSet: object exposing a ``DatasetName`` attribute, which is what gets
        passed to BenchMarks (always with mode "all"). The default of None
        is not usable: the attribute access fails on it.

    Prints, between two dashed rules, the accuracy and fit time reported for
    RandomForest and then for CO2_Forest. Returns nothing.
    """
    rule = '-' * 30
    print(rule, name, "", sep="\n")

    result = BenchMarks(dataSet.DatasetName, name, "all")

    report_rows = (
        ("RandomForest accuracy: ", "rf"),
        ("CO2_Forest accuracy: ", "co2"),
    )
    for label, key in report_rows:
        outcome = result[key]  # (accuracy, fit time) pair for this model
        print(label, outcome[0], "  time: ", outcome[1])

    print("", rule, "", sep="\n")

In [None]:
# Example: suppose you have a notebook named My_notebook that exposes a dataset
# object as `DatasetName` (for instance a "weather" dataset). Import it and call:
# Test_dataset("weather", My_notebook)
# After it runs, the cell output shows the accuracy score and fit time of each model.
# The fitted rf and co2 models are also saved as joblib files in the working directory.

In [None]:
# import Santander

# Test_dataset("Santander", Santander)