In [1]:
import numpy as np
import skrub 
from skrub import TableVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier
from gyyre import *

# Load the Midwest survey dataset bundled with skrub.
dataset = skrub.datasets.fetch_midwest_survey()

# Feature table: wrapped as a skrub variable, configured with a 100-row
# subsample, annotated with the dataset description, and marked as X.
responses = (
    skrub.var("response", dataset.X)
    .skb.subsample(n=100)
    .skb.set_description(dataset.metadata['description'])
    .skb.mark_as_X()
)

# Target: same 100-row subsample setting, named after the dataset's target
# column, and marked as y.
labels = (
    skrub.var("labels", dataset.y)
    .skb.subsample(n=100)
    .skb.set_name(dataset.metadata['target'])
    .skb.mark_as_y()
)

# Request 5 additional features from the semantic operator. The operator is
# named so that it can be referenced later by "demographic_features".
# NOTE: the em dash in the prompt was previously stored as mojibake ("â€”").
responses_with_additional_features = responses.with_sem_features(
    nl_prompt="""
        Compute additional demographics-related features, use your intrinsic knowledge about the US. 
        Take into account how the identification with the country or regions of it changed over the generations.         
        Also think about how the identification differs per class and education. The midwest is generally associated 
        with "Midwestern values" — friendliness, modesty, hard work, and community-mindedness.
    """, 
    name="demographic_features", 
    how_many=5
)

# Encode the (augmented) table into features with skrub's TableVectorizer.
feature_encoder = TableVectorizer()
encoded_responses = responses_with_additional_features.skb.apply(feature_encoder)

# Final step: gradient-boosted classifier trained against the marked labels.
learner = HistGradientBoostingClassifier()
model = encoded_responses.skb.apply(learner, y=labels)

--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5)
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Computed 5 new feature columns: ['Age_Group_Num', 'Education_Level_Num', 'Income_Bracket_Num', 'Is_Midwest_Resident', 'Midwest_Identity_Strong'], removed 0 feature columns: []


In [2]:
# Tuning knobs for the greedy optimisation of the "demographic_features"
# operator, gathered in one place so they are easy to adjust.
# NOTE(review): presumably `cv` is the number of cross-validation folds and
# `scoring` an sklearn-style scorer name — confirm against gyyre's docs.
optimisation_options = {
    "num_iterations": 5,
    "scoring": "accuracy",
    "cv": 3,
}

# Returns two parallel sequences: `memory` (records that carry a 'score') and
# `states` (operator states that carry the 'generated_code'), as consumed by
# the selection cell below.
memory, states = greedy_optimise_semantic_operator(
    model,
    "demographic_features",
    **optimisation_options,
)

--- COMPUTING DAG SUMMARY for context-aware optimisation ---
---ITERATION 0 -> Fitting with memory ---
--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5) for a classification task, predicting `Census_Region` with sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Computed 5 new feature columns: ['age_group_num', 'education_level_num', 'household_income_num', 'midwest_identity_score', 'region_self_label'], removed 0 feature columns: []
---ITERATION 0 -> Evaluation ---
--- Using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
--- Using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
--- Using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
---ITERATION 0 -> accuracy: 0.7462288638958313
---ITERATION 1 -> Fitting with memory ---
--- Fitting gyyre.with_sem_features('Com

In [3]:
# Pick the optimisation iteration with the highest score and show its code.
#
# The running maximum starts at -inf rather than 0.0 so that an iteration is
# still selected when every score is zero or negative (e.g. negated-loss
# scorers). Ties keep the earliest iteration because the comparison is strict.
highest_accuracy = float("-inf")
corresponding_state = None

for memory_line, state in zip(memory, states):
    if memory_line['score'] > highest_accuracy:
        highest_accuracy = memory_line['score']
        corresponding_state = state

# Fail with a clear message instead of a TypeError on `None[...]` below.
if corresponding_state is None:
    raise ValueError(
        "No optimisation result could be selected: `memory`/`states` are empty "
        "or contain no comparable scores."
    )

print(f"Highest accuracy: {highest_accuracy}\n\n")
print("\n".join(corresponding_state['generated_code']))


Highest accuracy: 0.7999287427874973


# (Feature name and description)
# region_self_label: Standardized region label from self-reported region, mapped to Census regions.
# Usefulness: Many respondents use non-standard or misspelled region names. Mapping these to Census regions provides a strong prior for classification, especially when self-identification is clear.
# Input samples: 'What_would_you_call_the_part_of_the_country_you_live_in_now': ['Southern', 'Midwest', 'Mid-west']
def map_region_label(val):
    """Map a free-text region name to a coarse region label.

    Matching is case-insensitive and substring-based.

    Args:
        val: A single cell value; may be missing or a non-string.

    Returns:
        'Unknown' for missing values, otherwise the first matching label among
        'Midwest', 'South', 'Northeast' and 'West', or 'Other' if none match.
    """
    if pd.isnull(val):
        return 'Unknown'
    # Coerce to str so non-string cells (e.g. numeric codes) fall through to
    # 'Other' instead of raising AttributeError on .lower().
    v = str(val).lower()
    # Rule order matters: the Midwest spellings must be tested before 'west'
    # (e.g. 'midewest' contains 'west'), and 'south' before 'east'
    # ('southeast' is labelled 'South').
    rules = (
        ('Midwest', ('midwest', 'mid-west', 'midewest', 'mid west')),
        ('South', ('south',)),
        ('Northeast', ('east', 'atlantic')),
        ('West', ('west', 'pacific', 'mountain')),
    )
    for label, keywords in rules:
        if any(keyword in v for keyword in keywords):
            return label
    return 'Other'
df['region_self_label'] = df['What_would_you_call_the_part_of_the_country_you_live