In [1]:
import numpy as np
import skrub 
from skrub import TableVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier
from gyyre import *

# Build a skrub pipeline on the Midwest survey data: subsampled inputs and
# labels, LLM-generated extra features, table vectorisation, then a gradient
# boosting classifier.
# NOTE(review): the optimisation log echoes the source text of the statements
# that define the model and the target, so rewording those statements may
# change what the optimiser sees - confirm before refactoring.
dataset = skrub.datasets.fetch_midwest_survey()

# Wrap the survey answers in a skrub variable named "response", restrict it to
# a subsample of 100 and attach the dataset's description from its metadata.
responses = skrub.var("response", dataset.X) 
responses = responses.skb.subsample(n=100).skb.set_description(dataset.metadata['description'])

# Same treatment for the targets; the node is named after the metadata's
# 'target' entry.
labels = skrub.var("labels", dataset.y)
labels = labels.skb.subsample(n=100).skb.set_name(dataset.metadata['target'])

# Declare which nodes play the role of features (X) and of target (y).
responses = responses.skb.mark_as_X()
labels = labels.skb.mark_as_y()

# Semantic operator (presumably provided by the gyyre star import - verify):
# request 5 additional features described by the natural-language prompt below.
# The name "demographic_features" is the handle used to optimise this operator later.
responses_with_additional_features = responses.with_sem_features("""
        Compute additional demographics-related features, use your intrinsic knowledge about the US. 
        Take into account how the identification with the country or regions of it changed over the generations. 
        Also think about how the identification differs per class and education.""", 
    name="demographic_features", how_many=5)

# Encode the augmented table with a default TableVectorizer.
feature_encoder = TableVectorizer()
encoded_responses = responses_with_additional_features.skb.apply(feature_encoder)

# Apply the classifier to the encoded features, supervised by the labels.
model = HistGradientBoostingClassifier()
predictions = encoded_responses.skb.apply(model, y=labels)

--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5)
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Computed 5 new feature columns: ['Age_Group_Num', 'Education_Level_Num', 'Household_Income_Num', 'Midwest_Identification_Score', 'Midwest_State_Affirm_Count'], removed 0 feature columns: []


In [2]:
# Optimise the semantic operator registered as "demographic_features" inside the
# `predictions` pipeline, asking for 5 iterations. The two results are unpacked
# into `memory` and `states`.
memory, states = optimise_semantic_operator(
    predictions,
    "demographic_features",
    num_iterations=5,
)

--- COMPUTING DAG SUMMARY for context-aware optimisation ---
	> task_type: classification
	> model: sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier
	> model_definition:   File "/var/folders/ln/t4s9x03j5nz8x1r5tc40xzr00000gn/T/ipykernel_34733/1696717159.py", line 28, in <module>     predictions = encoded_responses.skb.apply(model, y=labels) 
	> model_steps: Var 'response' > SubsamplePreviews > Var 'gyyre_dag_summary__demographic_features' > Var 'gyyre_memory__demographic_features' > Var 'gyyre_prefitted_state__demographic_features' > demographic_features | Apply LLMFeatureGenerator > Apply TableVectorizer > Var 'labels' > Census_Region | SubsamplePreviews > Apply HistGradientBoostingClassifier
	> target_name: Census_Region
	> target_definition:   File "/var/folders/ln/t4s9x03j5nz8x1r5tc40xzr00000gn/T/ipykernel_34733/1696717159.py", line 13, in <module>     labels = labels.skb.subsample(n=100).skb.set_name(dataset.metadata['target']) 
	> target_s

In [3]:
# Select the optimisation iteration with the highest accuracy and print the
# feature-generation code stored in its state.
# Start from -inf rather than 0.0 so that an iteration scoring exactly 0.0 can
# still be selected instead of leaving the state unset.
highest_accuracy = float("-inf")
corresponding_state = None

# `memory` and `states` are walked in lockstep (presumably entry i of each
# belongs to the same iteration - verify against optimise_semantic_operator).
for memory_line, state in zip(memory, states):
    # Strict '>' keeps the earliest iteration when several share the top accuracy.
    if memory_line['accuracy'] > highest_accuracy:
        highest_accuracy = memory_line['accuracy']
        corresponding_state = state

# Fail with a clear message instead of a TypeError from subscripting None below
# (happens when `memory`/`states` are empty or no accuracy is comparable).
if corresponding_state is None:
    raise ValueError("No optimisation iteration with a usable accuracy found in `memory`/`states`.")

print(f"Highest accuracy: {highest_accuracy}\n\n")
print("\n".join(corresponding_state['generated_code']))


Highest accuracy: 0.79832473232744


# (Age_Numeric: Map age bands to numeric group for generational effects)
# Usefulness: Generation cohorts have different identification patterns with regions ("Midwesterner") due to historical, social, and migration factors. Converts 'Age' string buckets into ordinal/continuous representation—useful for regressors/tree-based splits.
# Input samples: {'Age': ['18-29', '30-44', '45-60']}
df["Age_Numeric"] = df["Age"].map({
    '18-29': 1,
    '30-44': 2,
    '45-60': 3,
    '61+': 4
})

# (Income_Numeric: Convert income bucket to ordinal scale)
# Usefulness: Economic status is correlated with regional identity and location patterns; this captures ordinal effect of income on Census_Region and allows splits for classifier.
# Input samples: {'Household_Income': ['$50,000 - $99,999', '$0 - $24,999', '$150,000+']}
df["Income_Numeric"] = df["Household_Income"].map({
    '$0 - $24,999': 1,
    '$25,000 - $49,999': 2,
    '$50,000 - $99,999': 3,
    '$100,000