In [None]:
# In order to use the lccv package, don't forget to execute:
# export PYTHONPATH=/home/janvanrijn/projects/lccv (adapt to your local directory)

import lccv
import sklearn.datasets
import sklearn.linear_model
import sklearn.tree

In [None]:
# OpenML id of the dataset used by the LCCV demo cells that follow.
openmlid = 41161

# return_X_y=True makes fetch_openml hand back the (X, y) pair directly.
X, y = sklearn.datasets.fetch_openml(
    data_id=openmlid,
    return_X_y=True,
)

# Running without Timeout

In [None]:
%%time
lccv.lccv(sklearn.linear_model.LogisticRegression(), X, y)

# Running with Timeout

In [None]:
%%time
lccv.lccv(sklearn.linear_model.LogisticRegression(), X, y, timeout=5)

# Running with Threshold

In [None]:
%%time
lccv.lccv(sklearn.tree.DecisionTreeClassifier(), X, y, r = 1.0, enforce_all_anchor_evaluations=True, timeout=500, verbose=True)

# Making Recommendations on Whether or Not to Collect More Data

In [None]:
# NOTE(review): the first cell binds `lccv` via `import lccv`; this line rebinds the
# same name, and every `lccv.lccv(...)` call executed afterwards is resolved against
# this binding — confirm that it still resolves as intended.
from lccv import lccv
# Star import: names such as `get_dataset` that no visible cell defines or imports
# presumably come from here — verify against evalutils.
from evalutils import *

def check_possible_improvement(learner, X, y, min_improvement=0.01, timeout=500, num_seeds=10):
    """Recommend whether doubling the data is worthwhile, then check that advice.

    The function runs ``lccv.lccv`` with a target of half of 90% of the rows
    of ``X``, asks the returned learning-curve model for its estimate at the
    full 90% size, and recommends collecting more data when the estimated
    improvement reaches ``min_improvement``. It then evaluates the learner at
    the full target size ``num_seeds`` times and reports whether the
    recommendation agreed with the measured improvement.

    Improvement is computed as ``score at half size - score at target``, so
    the arithmetic treats lower scores as better (presumably error rates —
    confirm against the lccv documentation).

    Parameters
    ----------
    learner : callable
        Zero-argument factory for the learner; it is called as ``learner()``
        (e.g. an estimator class rather than an instance).
    X : array-like
        Feature data; only ``X.shape[0]`` is read here, the object itself is
        forwarded to ``lccv.lccv``.
    y : array-like
        Labels, forwarded unchanged to ``lccv.lccv``.
    min_improvement : float, default 0.01
        Smallest improvement that counts as "worth doubling the data". Used
        both for the recommendation and for judging it afterwards.
    timeout : int or float, default 500
        Forwarded to ``lccv.lccv`` as its ``timeout`` argument.
    num_seeds : int, default 10
        Number of evaluations (seeds ``0 .. num_seeds - 1``) performed at the
        full target size to measure the true performance.

    Returns
    -------
    None
        All results are reported via ``print``.
    """
    # Imported locally so the function does not depend on `np` leaking into the
    # notebook namespace through a star import.
    import numpy as np

    n = X.shape[0]
    target = int(0.9 * n)
    half = target // 2
    print(f"Conducting validation on {half}/{target} instances.")

    # Only the score and the learning-curve model are needed from the result.
    validation_score, _mean_estimates, _estimates, elm = lccv.lccv(
        learner(),
        X,
        y,
        r=1.0,
        enforce_all_anchor_evaluations=True,
        target=half,
        timeout=timeout,
        verbose=False,
    )

    # Extrapolate the learning curve from the half-size run to the full target.
    prediction_on_full = elm.get_ipl_estimate_at_target(target)
    expected_improvement = validation_score - prediction_on_full
    print(f"Score at half size ({half}):", validation_score)
    print(f"Estimated score on full data ({target} instances):", prediction_on_full)
    print("This would be an improvement of:", expected_improvement)
    recommendation = expected_improvement >= min_improvement
    print("Recommending to double the number of instances:", recommendation)

    # Now measure the true performance at the target to judge the recommendation.
    print("Now computing the TRUE performance at the target.")
    for seed in range(num_seeds):
        elm.compute_and_add_sample(target, seed=seed)
    true_performance_at_target = np.mean(elm.get_values_at_anchor(target))
    true_improvement_at_target = validation_score - true_performance_at_target
    print("True performance at target:", true_performance_at_target)
    print("The true improvement is:", true_improvement_at_target)

    # The advice was good exactly when it matches what the measurement says.
    truly_worthwhile = true_improvement_at_target >= min_improvement
    verdict = "good" if bool(recommendation) == bool(truly_worthwhile) else "bad"
    print("Recommendation was " + verdict)

    
# Learner class (not an instance); it is handed to check_possible_improvement as-is.
learner = sklearn.tree.DecisionTreeClassifier
print("Conducting recommendation analysis now only for " + learner.__name__)

# One entry per experiment, in execution order:
# (OpenML dataset id, commentary printed before the analysis runs).
recommendation_cases = [
    (23512, "Doing this on higgs. Here we *will* recognize saturation and the recommendation for more data will be good."),
    (1485, "Doing this on Madelon will be overly optimistic, but the recommendation for more data would still be correct."),
    (40498, "We are also good in recognizing it on wine-quality-white!"),
    (1457, "On amazon, it works also well!"),
]

# Same four steps for every dataset: banner, commentary, load, analyse.
for case_number, (openmlid, commentary) in enumerate(recommendation_cases, start=1):
    print(f"\n----------------- {case_number} -----------------")
    print(commentary)
    X, y = get_dataset(openmlid)
    check_possible_improvement(learner, X, y)