In [1]:
import warnings

import skrub
from sklearn.ensemble import HistGradientBoostingClassifier
from skrub import TableVectorizer

import sempipes
from sempipes import sem_choose

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Build the LLM handle first, then register it as the model sempipes uses
# for code generation.
code_generation_llm = sempipes.LLM(
    name="gemini/gemini-2.5-flash",
    parameters={"temperature": 0.3},
)
sempipes.update_config(llm_for_code_generation=code_generation_llm)

# Fetch the Midwest survey dataset bundled with skrub.
dataset = skrub.datasets.fetch_midwest_survey()

# First 500 rows of the features and the target.
# NOTE(review): X and y are not referenced by the pipeline below, which wraps
# the full dataset.X / dataset.y -- confirm whether the subsample was meant
# to be used instead.
X = dataset.X.head(n=500)
y = dataset.y.head(n=500)

# Feature variable: attach the dataset description and mark it as the
# design matrix of the pipeline.
responses = (
    skrub.var("response", dataset.X)
    .skb.set_description(dataset.metadata["description"])
    .skb.mark_as_X()
)

# Target variable: name it after the dataset's target and mark it as y.
labels = (
    skrub.var("labels", dataset.y)
    .skb.set_name(dataset.metadata["target"])
    .skb.mark_as_y()
)

# Ask the configured LLM to generate five additional demographic feature
# columns on top of the survey responses. The operator is registered under
# the name "demographic_features" so it can be referred to by name later.
# The em dash in the prompt is a real U+2014 character: a mis-decoded
# "â€”" sequence would otherwise be sent verbatim to the LLM.
responses_with_additional_features = responses.sem_gen_features(
    nl_prompt="""
        Compute additional demographics-related features, use your intrinsic knowledge about the US. 
        Take into account how the identification with the country or regions of it changed over the generations.         
        Also think about how the identification differs per class and education. The midwest is generally associated 
        with "Midwestern values" — friendliness, modesty, hard work, and community-mindedness.
    """,
    name="demographic_features",
    how_many=5,
)

# Encode the (augmented) survey table into numeric features.
feature_encoder = TableVectorizer()
encoded_responses = responses_with_additional_features.skb.apply(feature_encoder)

# Gradient-boosted classifier whose learning rate is left for the LLM to
# propose candidate values for, via the "hgb" choice specification.
learner = HistGradientBoostingClassifier()
hgb_choices = sem_choose(
    "hgb",
    learning_rate="A promising set of learning rates to try",
)
model = encoded_responses.skb.apply_with_sem_choose(
    learner,
    y=labels,
    choices=hgb_choices,
)

--- Fitting sempipes.with_sem_features('Compute additional demographics...', 5) on dataframe of shape (2494, 28) in mode 'preview'.
	> Querying 'gemini/gemini-2.5-flash' with 2 messages...'
	> Computed 5 new feature columns: ['Age_Ordinal', 'Education_Ordinal', 'Household_Income_Ordinal', 'Midwest_States_Count', 'Midwesterner_Identification_Score'], removed 0 feature columns: []
--- sempipes.apply_with_sem_choose(HistGradientBoostingClassifier(), SemChoices(name='hgb', params_and_prompts={'learning_rate': 'A promising set of learning rates to try'}))
	> Querying 'gemini/gemini-2.5-flash' with 2 messages...'
	Suggested choices for learning_rate: choose_from([0.1, 0.05, 0.2, 0.01, 0.5], name='__sempipes__...learning_rate')


In [2]:
from sempipes.optimisers import MonteCarloTreeSearch, optimise_colopro

# NOTE: in a real-world use case this optimisation should be run against a
# separate validation set.

# Monte Carlo tree search over candidate operator states, expanding two nodes
# at a time. NOTE(review): c=1.41 is presumably the exploration constant
# (~sqrt(2)) -- confirm against the sempipes docs.
tree_search = MonteCarloTreeSearch(nodes_per_expansion=2, c=1.41)

# Optimise the "demographic_features" operator of the pipeline for 12 trials,
# scoring each candidate by 5-fold cross-validated accuracy with a single
# hyperparameter-search iteration per trial.
outcomes = optimise_colopro(
    model,
    "demographic_features",
    num_trials=12,
    scoring="accuracy",
    search=tree_search,
    cv=5,
    num_hpo_iterations_per_trial=1,
)

	OLOPRO> Computing pipeline summary for context-aware optimisation ---
	OLOPRO> Processing trial 0
	OLOPRO> Initialising optimisation of demographic_features via OPRO
	MCT_SEARCH> Creating root node
	OLOPRO> Evaluating pipeline via 5-fold cross-validation and random search HPO
--- Using provided state for sempipes.with_sem_features('Compute additional demographics...', 5)
	OLOPRO>    __sempipes__hgb__learning_rate  mean_test_score
	OLOPRO> 0                             0.2         0.796316
	OLOPRO> Pipeline evaluation took 8.95 seconds
	OLOPRO> Score changed from None to 0.796315522611488
	OLOPRO> Processing trial 1
	MCT_SEARCH> Trying to improve node with score 0.796315522611488
	OLOPRO> Generating 2 new search node(s)
	OLOPRO> Evolving operator "demographic_features" via OPRO
--- Fitting sempipes.with_sem_features('Compute additional demographics...', 5) on dataframe of shape (2494, 28) in mode 'fit_transform'.
	> Querying 'gemini/gemini-2.5-flash' with 4 messages...'
	> Computed 5 n

In [3]:
# Select the trial with the highest score and display that score.
best_outcome = max(outcomes, key=lambda outcome: outcome.score)
best_outcome.score

np.float64(0.9258251442644324)

In [4]:
# Show the code the LLM generated for the best-scoring trial, one snippet per line.
generated_snippets = best_outcome.state["generated_code"]
print("\n".join(generated_snippets))


import pandas as pd
import numpy as np

# (midwestern_identification_score)
# Usefulness: This numerical feature quantifies the strength of a respondent's personal identification as a Midwesterner, directly addressing the core survey theme. It provides an ordinal measure of self-identity, crucial for understanding regional affiliation and "Midwestern values" across different demographics.
# Input samples: 'How_much_do_you_personally_identify_as_a_Midwesterner': ['Not much', 'Not much', 'A lot']
identification_mapping = {'Not at all': 0, 'Not much': 1, 'Some': 2, 'A lot': 3}
df['midwestern_identification_score'] = df['How_much_do_you_personally_identify_as_a_Midwesterner'].map(identification_mapping)

# (age_group_numeric)
# Usefulness: This numerical feature converts age ranges into an ordinal scale, allowing the model to capture generational differences in regional identification. This directly addresses the prompt's focus on how identification changes over generations, which can inf