In [1]:
import warnings

import skrub
from sklearn.ensemble import HistGradientBoostingClassifier
from skrub import TableVectorizer

import sempipes
from sempipes import sem_choose

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# LLM handle that sempipes is configured to use for code generation.
# NOTE(review): temperature 2.0 is a very high sampling temperature —
# presumably intentional to diversify generated candidates; confirm.
code_generation_llm = sempipes.LLM(
    name="gemini/gemini-2.5-flash",
    parameters={"temperature": 2.0},
)
sempipes.update_config(llm_for_code_generation=code_generation_llm)

# Fetch the Midwest survey dataset bundled with skrub.
dataset = skrub.datasets.fetch_midwest_survey()

# First 500 rows of the features and of the target.
# NOTE(review): X and y are not referenced again in the visible cells; the
# pipeline variables below are built from the full dataset.X / dataset.y
# (the recorded log reports a dataframe of shape (2494, 28)). Confirm whether
# the 500-row subsample was meant to be used instead.
X, y = dataset.X.head(n=500), dataset.y.head(n=500)

# Feature variable: described with the dataset's own description text and
# marked as the X of the pipeline.
responses = (
    skrub.var("response", dataset.X)
    .skb.set_description(dataset.metadata["description"])
    .skb.mark_as_X()
)

# Target variable: named after the dataset's target column and marked as y.
labels = (
    skrub.var("labels", dataset.y)
    .skb.set_name(dataset.metadata["target"])
    .skb.mark_as_y()
)

# Ask the LLM to generate additional demographic feature columns from the
# survey responses. The operator is registered as "demographic_features",
# which is the name the optimisation cell further down refers to.
# how_many=5 is forwarded as given; the recorded run reports 13 new columns,
# so it is evidently not a hard cap on the number of columns produced.
# The prompt previously contained a mis-encoded em dash ("â€”"); it is
# restored to a real em dash so the LLM does not receive garbage characters.
responses_with_additional_features = responses.sem_gen_features(
    nl_prompt="""
        Compute additional demographics-related features, use your intrinsic knowledge about the US. 
        Take into account how the identification with the country or regions of it changed over the generations.         
        Also think about how the identification differs per class and education. The midwest is generally associated 
        with "Midwestern values" — friendliness, modesty, hard work, and community-mindedness.
    """,
    name="demographic_features",
    how_many=5,
)

# Estimators used by the pipeline: a default TableVectorizer for encoding and
# a default HistGradientBoostingClassifier as the learner.
feature_encoder = TableVectorizer()
learner = HistGradientBoostingClassifier()

# Encode the responses (including the generated feature columns).
encoded_responses = responses_with_additional_features.skb.apply(feature_encoder)

# Hyperparameter choices described in natural language; the recorded output
# shows sempipes turning this into a concrete choose_from([...]) list for
# learning_rate.
hgb_choices = sem_choose(
    "hgb",
    learning_rate="A promising set of learning rates to try",
)

# Attach the learner to the encoded features, with `labels` as the target.
model = encoded_responses.skb.apply_with_sem_choose(
    learner,
    y=labels,
    choices=hgb_choices,
)

2026-02-13 09:34:20,530 - INFO - SEMPIPES> Fitting sempipes.sem_gen_features('Compute additional demographics...', 5) on dataframe of shape (2494, 28) in mode 'preview'.
2026-02-13 09:34:20,541 - INFO - SEMPIPES> Querying 'gemini/gemini-2.5-flash' with 2 messages...'
[92m09:34:20 - LiteLLM:INFO[0m: utils.py:3416 - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
2026-02-13 09:34:20,551 - INFO - SEMPIPES> 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
[92m09:34:34 - LiteLLM:INFO[0m: utils.py:1301 - Wrapper: Completed Call, calling success_handler
2026-02-13 09:34:34,029 - INFO - SEMPIPES> Wrapper: Completed Call, calling success_handler
2026-02-13 09:34:34,083 - INFO - SEMPIPES> Computed 13 new feature columns: ['age_group_numeric', 'age_x_midwest_identity_interaction', 'education_level_numeric', 'education_midwest_identity_level', 'gender_midwest_identity_level', 'gender_numeric', 'household_income_midwest_identity_level', 'identification_coherence

--- sempipes.apply_with_sem_choose(HistGradientBoostingClassifier(), SemChoices(name='hgb', params_and_prompts={'learning_rate': 'A promising set of learning rates to try'}))


2026-02-13 09:34:36,507 - INFO - SEMPIPES> Querying 'gemini/gemini-2.5-flash' with 2 messages...'
[92m09:34:36 - LiteLLM:INFO[0m: utils.py:3416 - 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
2026-02-13 09:34:36,510 - INFO - SEMPIPES> 
LiteLLM completion() model= gemini-2.5-flash; provider = gemini
[92m09:34:39 - LiteLLM:INFO[0m: utils.py:1301 - Wrapper: Completed Call, calling success_handler
2026-02-13 09:34:39,456 - INFO - SEMPIPES> Wrapper: Completed Call, calling success_handler


	Suggested choices for learning_rate: choose_from([0.1, 0.05, 0.01, 0.2], name='__sempipes__...learning_rate')


In [2]:
from sempipes.optimisers import MonteCarloTreeSearch, optimise_colopro

# Caveat: in a real-world use case this optimisation should be run against a
# separate validation set.
search_strategy = MonteCarloTreeSearch()

# Optimise the "demographic_features" operator of `model`: 12 trials, each
# scored by accuracy under 5-fold cross-validation.
outcomes = optimise_colopro(
    model,
    "demographic_features",
    search=search_strategy,
    scoring="accuracy",
    cv=5,
    num_trials=12,
    # Alternative value noted by the author: 0 (presumably skips HPO — confirm).
    num_hpo_iterations_per_trial=1,
)

2026-02-13 09:34:44,363 - INFO - SEMPIPES> COLOPRO> Computing pipeline summary for context-aware optimisation
2026-02-13 09:34:44,659 - INFO - SEMPIPES> COLOPRO> Processing trial 0
2026-02-13 09:34:44,660 - INFO - SEMPIPES> COLOPRO> Initialising optimisation of demographic_features via OPRO
2026-02-13 09:34:44,661 - INFO - SEMPIPES> MCT_SEARCH> Creating root node
2026-02-13 09:34:44,662 - INFO - SEMPIPES> COLOPRO> Evaluating pipeline via 5-fold cross-validation and random search HPO
2026-02-13 09:35:00,279 - INFO - SEMPIPES> COLOPRO> Pipeline evaluation took 15.62 seconds
2026-02-13 09:35:00,280 - INFO - SEMPIPES> COLOPRO> Score changed from None to 0.798713088828259
2026-02-13 09:35:00,281 - INFO - SEMPIPES> COLOPRO> Processing trial 1
2026-02-13 09:35:00,282 - INFO - SEMPIPES> Expanding childless node 0
2026-02-13 09:35:00,283 - INFO - SEMPIPES> MCT_SEARCH> Trying to improve node with score 0.798713088828259
2026-02-13 09:35:00,284 - INFO - SEMPIPES> COLOPRO> Generating new search no

In [5]:
# Keep the outcome with the highest score and display that score as the
# cell's output.
best_outcome = max(outcomes, key=lambda outcome: outcome.score)
best_outcome.score

0.926228360335128

In [6]:
# Print the code entries stored in the best outcome's state, one after another,
# separated by newlines.
generated_code_blocks = best_outcome.state["generated_code"]
print("\n".join(generated_code_blocks))


# (Personal identification score as a Midwesterner - RE-CREATED)
# Usefulness: This feature converts the ordinal categorical response for self-identification as a Midwesterner into a numerical score. This allows the model to leverage the strength of a respondent's personal regional identity, which is highly relevant to "Census_Region" prediction. This column is re-created because it was missing in the previous execution context.
# Input samples: 'How_much_do_you_personally_identify_as_a_Midwesterner': ['Not much', 'Not much', 'A lot']
identity_mapping = {'Not at all': 0, 'Not much': 1, 'Some': 2, 'A lot': 3}
df['midwesterner_identity_score'] = df['How_much_do_you_personally_identify_as_a_Midwesterner'].map(identity_mapping)

# (Ordinal encoding of age groups - RE-CREATED)
# Usefulness: This feature converts the categorical 'Age' column into an ordinal numerical scale. This transformation enables the model to capture generational differences in regional identification and the evolution