In [1]:
import warnings

import skrub
from sklearn.ensemble import HistGradientBoostingClassifier
from skrub import TableVectorizer

from sempipes import sem_choose

# Suppress every warning category for the rest of the notebook (presumably to
# keep the recorded cell output readable).
warnings.filterwarnings("ignore")

# Number of leading survey rows used in this demo. Features and labels must be
# truncated to the same length, so the limit is defined once and shared.
N_SAMPLES = 500

dataset = skrub.datasets.fetch_midwest_survey()
X = dataset.X.head(n=N_SAMPLES)
y = dataset.y.head(n=N_SAMPLES)

# Wrap the features as a skrub variable, attach the dataset description from
# the metadata, and mark it as the design matrix of the pipeline.
responses = (
    skrub.var("response", X)
    .skb.set_description(dataset.metadata["description"])
    .skb.mark_as_X()
)

# Same for the target: name it after the metadata's target entry and mark it
# as the prediction target.
labels = (
    skrub.var("labels", y)
    .skb.set_name(dataset.metadata["target"])
    .skb.mark_as_y()
)

# Vectorise the responses. The `high_cardinality` encoder is not fixed here;
# it is described in natural language instead. In the recorded run this
# results in a query to 'openai/gpt-4.1' that proposes candidate encoders.
feature_encoder = TableVectorizer()
encoded_responses = responses.skb.apply_with_sem_choose(
    feature_encoder,
    choices=sem_choose("tv", high_cardinality="Two encoders appropriate for categorical survey data"),
)

# Fit a gradient-boosting classifier on the encoded responses, again leaving
# two hyperparameters open as natural-language choices.
# NOTE(review): the `min_samples_leaf` prompt asks for a single value, but the
# recorded run suggested three ([5, 10, 20]) - confirm whether that is intended.
learner = HistGradientBoostingClassifier()
model = encoded_responses.skb.apply_with_sem_choose(
    learner,
    y=labels,
    choices=sem_choose(
        "hgb",
        learning_rate="Three learning rates to try",
        min_samples_leaf="A single value appropriate for the rather small data at hand",
    ),
)

--- sempipes.apply_with_sem_choose(TableVectorizer(), SemChoices(name='tv', params_and_prompts={'high_cardinality': 'Two encoders appropriate for categorical survey data'}))
	> Querying 'openai/gpt-4.1' with 2 messages...'
	Suggested choices for high_cardinality: choose_from([StringEncoder(), GapEncoder()], name='__sempipes__...h_cardinality')
--- sempipes.apply_with_sem_choose(HistGradientBoostingClassifier(), SemChoices(name='hgb', params_and_prompts={'learning_rate': 'Three learning rates to try', 'min_samples_leaf': 'A single value appropriate for the rather small data at hand'}))
	> Querying 'openai/gpt-4.1' with 2 messages...'
	Suggested choices for learning_rate: choose_from([0.1, 0.05, 0.2], name='__sempipes__...learning_rate')
	> Querying 'openai/gpt-4.1' with 2 messages...'
	Suggested choices for min_samples_leaf: choose_from([5, 10, 20], name='__sempipes__..._samples_leaf')


In [2]:
from sempipes.optimisers.colopro import optimise_sem_choices

# Settings forwarded to the optimiser as keyword arguments.
# NOTE(review): `budget` looks like the number of search rounds and
# `candidates_to_evaluate_per_trial` the configurations scored per round (the
# recorded results table has 3 x 3 = 9 rows) - confirm against the sempipes docs.
search_settings = {
    "budget": 3,
    "scoring": "accuracy",
    "cv": 2,
    "candidates_to_evaluate_per_trial": 3,
}

# Run the search over the semantic choices declared on `model`; the returned
# per-trial results are concatenated into one table in the next cell.
results = optimise_sem_choices(model, **search_settings)

	OLOPRO> Running initial search for sem_choose ---
	OLOPRO> Running search 1 for sem_choose ---
--- sempipes.apply_with_sem_choose(TableVectorizer(high_cardinality=choose_from([StringEncoder(), GapEncoder()], name='__sempipes__...h_cardinality')), SemChoices(name='tv', params_and_prompts={'high_cardinality': 'Two encoders appropriate for categorical survey data'}))
	> Querying 'openai/gpt-4.1' with 2 messages...'
	Suggested choices for high_cardinality: choose_from([GapEncoder(), MinHashEncoder()], name='__sempipes__...h_cardinality')
--- sempipes.apply_with_sem_choose(HistGradientBoostingClassifier(learning_rate=choose_from([0.1, 0.05, 0.2], name='__sempipes__...learning_rate'),
                               min_samples_leaf=choose_from([5, 10, 20], name='__sempipes__..._samples_leaf')), SemChoices(name='hgb', params_and_prompts={'learning_rate': 'Three learning rates to try', 'min_samples_leaf': 'A single value appropriate for the rather small data at hand'}))
	> Querying 'openai/gp

In [3]:
import pandas as pd

# Stack the per-trial result frames into one table. Keying each frame by its
# position yields a unique two-level index (trial, candidate); a plain concat
# would repeat the row labels 0, 1, 2 for every trial and drop the information
# about which search round produced a row.
# NOTE(review): trial 0 is presumably the initial search - confirm.
pd.concat(dict(enumerate(results)), names=["trial", "candidate"])

Unnamed: 0,__sempipes__tv__high_cardinality,__sempipes__hgb__learning_rate,__sempipes__hgb__min_samples_leaf,mean_test_score
0,StringEncoder(),0.2,10,0.802
1,StringEncoder(),0.1,10,0.798
2,StringEncoder(),0.1,20,0.778
0,MinHashEncoder(),0.15,5,0.756
1,GapEncoder(),0.15,15,0.682
2,GapEncoder(),0.3,5,0.386
0,GapEncoder(),0.07,10,0.686
1,GapEncoder(),0.07,8,0.66
2,GapEncoder(),0.07,20,0.658
