In [1]:
import skrub
from sklearn.ensemble import HistGradientBoostingClassifier
from skrub import TableVectorizer
import gyyre

# Load the Midwest survey dataset that ships with skrub.
dataset = skrub.datasets.fetch_midwest_survey()

# Feature table: wrap it as a skrub variable, subsample to 100 rows, attach the
# dataset description from the metadata, and mark the result as the design
# matrix X.
responses = (
    skrub.var("response", dataset.X)
    .skb.subsample(n=100)
    .skb.set_description(dataset.metadata["description"])
    .skb.mark_as_X()
)

# Target: same treatment, named after the target recorded in the dataset
# metadata and marked as y.
labels = (
    skrub.var("labels", dataset.y)
    .skb.subsample(n=100)
    .skb.set_name(dataset.metadata["target"])
    .skb.mark_as_y()
)

# Derive extra feature columns from the survey responses with gyyre's semantic
# operator, steered by a natural-language prompt. The operator is registered
# under the name "demographic_features" and asked for five new features.
# NOTE(review): the prompt is passed through verbatim (presumably to the LLM
# backend), so it must contain clean Unicode text.
responses_with_additional_features = responses.with_sem_features(
    nl_prompt="""
        Compute additional demographics-related features, use your intrinsic knowledge about the US. 
        Take into account how the identification with the country or regions of it changed over the generations.         
        Also think about how the identification differs per class and education. The midwest is generally associated 
        with "Midwestern values" — friendliness, modesty, hard work, and community-mindedness.
    """,
    name="demographic_features",
    how_many=5,
)

# Estimators used by the pipeline: a table vectorizer for the features and a
# histogram-based gradient boosting classifier as the final learner.
feature_encoder = TableVectorizer()
learner = HistGradientBoostingClassifier()

# Step 1: encode the responses (including the generated features).
encoded_responses = responses_with_additional_features.skb.apply(feature_encoder)

# Step 2: fit the classifier on the encoded table against the labels.
model = encoded_responses.skb.apply(learner, y=labels)

--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5)
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Computed 5 new feature columns: ['Age_Group_Num', 'Education_Level_Num', 'Income_Bracket_Num', 'Is_Midwesterner_by_Region', 'Midwest_States_Identified'], removed 0 feature columns: []


In [3]:
# Settings for the greedy search over the "demographic_features" operator:
# five iterations, each scored by accuracy with cv=3.
# NOTE(review): presumably cv=3 means 3-fold cross-validation — confirm
# against the gyyre documentation.
search_settings = {
    "num_iterations": 5,
    "scoring": "accuracy",
    "cv": 3,
}

# The call returns two values, unpacked as `memory` and `states`.
memory, states = gyyre.greedy_optimise_semantic_operator(
    model, "demographic_features", **search_settings
)

--- COMPUTING DAG SUMMARY for context-aware optimisation ---
---ITERATION 0 -> Fitting with memory ---
--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5) for a classification task, predicting `Census_Region` with sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Computed 5 new feature columns: ['age_group_numeric', 'education_level_numeric', 'household_income_numeric', 'is_midwest_state_majority', 'midwest_identity_score'], removed 0 feature columns: []
---ITERATION 0 -> Evaluation ---
--- Using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
--- Using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
--- Using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
---ITERATION 0 -> accuracy: 0.7137523527415225
---ITERATION 1 -> Fitting with memory ---
--- Fitting gyyre.wi

In [4]:
# Select the optimisation iteration with the best score and print the feature
# code stored in its state. `memory` and `states` are walked in lockstep, so
# entry i of `states` is taken to belong to entry i of `memory`.
highest_accuracy = float("-inf")  # any real score, including 0.0, beats this
corresponding_state = None

for memory_line, state in zip(memory, states):
    # Strict ">" keeps the earliest iteration when several share the top score.
    if memory_line["score"] > highest_accuracy:
        highest_accuracy = memory_line["score"]
        corresponding_state = state

# Fail with a clear message instead of a TypeError on `None[...]` below when
# the optimiser returned nothing usable (empty results or only NaN scores).
if corresponding_state is None:
    raise ValueError(
        "No optimisation result with a comparable score was found in `memory`/`states`."
    )

print(f"Highest accuracy: {highest_accuracy}\n\n")
print("\n".join(corresponding_state["generated_code"]))

Highest accuracy: 0.7470219499213182


# (Feature name and description)
# midwest_identity_score: Numeric score encoding how strongly a respondent identifies as a Midwesterner.
# Usefulness: Quantifies the ordinal self-identification with the Midwest, which is likely to correlate with actual residence in the Midwest and thus Census_Region.
# Input samples: 'How_much_do_you_personally_identify_as_a_Midwesterner': ['Not much', 'A lot', 'Some']
midwest_identity_map = {
    'Not at all': 0,
    'Not much': 1,
    'Some': 2,
    'A lot': 3
}
df['midwest_identity_score'] = df['How_much_do_you_personally_identify_as_a_Midwesterner'].map(midwest_identity_map).fillna(-1)

# (Feature name and description)
# age_group_numeric: Ordinal encoding of age group.
# Usefulness: Age group is related to generational differences in regional identification and migration patterns, which can help distinguish Census_Region.
# Input samples: 'Age': ['18-29', '30-44', '45-60']
age_map = {
    '18-29': 0,
    '30