In [5]:
import numpy as np
import skrub 
from skrub import TableVectorizer
from sklearn.ensemble import HistGradientBoostingClassifier
from gyyre import *

# Fetch the Midwest survey dataset through skrub's dataset helpers.
dataset = skrub.datasets.fetch_midwest_survey()

# Features: wrap dataset.X in a skrub variable, request a subsample with
# n=100 and mark the result as the X of the pipeline.
responses = (
    skrub.var("response", dataset.X)
    .skb.subsample(n=100)
    .skb.mark_as_X()
)

# Targets: same treatment for dataset.y, marked as the y of the pipeline.
labels = (
    skrub.var("labels", dataset.y)
    .skb.subsample(n=100)
    .skb.mark_as_y()
)

# Semantic feature generation: the natural-language instruction below is
# passed verbatim to gyyre's `with_sem_features`, asking for 5 additional
# columns. The operator is registered under the name "demographic_features".
responses_with_additional_features = responses.with_sem_features(
    """
        Compute additional demographics-related features, use your intrinsic knowledge about the US. 
        Take into account how the identification with the country or regions of it changed over the generations. 
        Also think about how the identification differs per class and education.""",
    name="demographic_features",
    how_many=5,
)

# Estimators used by the two remaining pipeline steps.
feature_encoder = TableVectorizer()
model = HistGradientBoostingClassifier()

# Encode the augmented table, then apply the classifier with the labels as y.
encoded_responses = responses_with_additional_features.skb.apply(feature_encoder)
predictions = encoded_responses.skb.apply(model, y=labels)

--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5)
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> Computed 5 new feature columns: ['Education_Level_Code', 'Estimated_Generation', 'Household_Income_Bin', 'Is_Midwestern_Resident', 'Midwestern_Identity_Strength'], removed 0 feature columns: []


In [6]:
# Run gyyre's optimiser on the `predictions` pipeline, targeting the semantic
# operator named "demographic_features", for 5 iterations. It returns two
# collections, `memory` and `states`
# (presumably one entry per iteration -- confirm against gyyre).
memory, states = optimise_semantic_operator(
    predictions,
    "demographic_features",
    num_iterations=5,
)

---ITERATION 0 -> Fitting with memory ---
--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5)
	> Querying 'openai/gpt-4.1' with 2 messages...'
	> An error occurred in attempt 1: name 'enumerate' is not defined
	> Querying 'openai/gpt-4.1' with 4 messages...'
	> Computed 1 new feature columns: ['Education_Level'], removed 0 feature columns: []
---ITERATION 0 -> Evaluation ---
--- Skipping fit, using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
--- Skipping fit, using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
--- Skipping fit, using provided state for gyyre.with_sem_features('Compute additional demographics...', 5)
---ITERATION 0 -> Accuracy: 0.7345967748156377
---ITERATION 1 -> Fitting with memory ---
--- Fitting gyyre.with_sem_features('Compute additional demographics...', 5)
	> Querying 'openai/gpt-4.1' with 4 messages...'
	> Computed 6 new feature columns: ['Age_Group', 'Educatio

In [7]:
# Select the optimisation run with the highest accuracy and print the code
# generated for it.
#
# The search starts from -inf rather than 0.0 so that a run is still selected
# when every recorded accuracy is 0.0 or negative.
highest_accuracy = float("-inf")
corresponding_state = None

# `memory` and `states` are walked pairwise; each memory entry is read via its
# 'accuracy' key.
for memory_line, state in zip(memory, states):
    # Strict '>' keeps the earliest run on ties and never selects a NaN score.
    if memory_line['accuracy'] > highest_accuracy:
        highest_accuracy = memory_line['accuracy']
        corresponding_state = state

if corresponding_state is None:
    # Nothing was selected: fail with an explicit message instead of the
    # opaque "'NoneType' object is not subscriptable" the print below would raise.
    raise ValueError(
        "No optimisation state could be selected: `memory`/`states` are empty "
        "or contain no comparable 'accuracy' value."
    )

print(f"Highest accuracy: {highest_accuracy}\n\n")
print("\n".join(corresponding_state['generated_code']))


Highest accuracy: 0.7590445555246998


# (Education_Level: Ordinal encoding for Education, capturing differences in identification by education)
# Input samples: {'Education': ['High school degree', 'Some college', 'Associate or bachelor degree', 'Graduate degree']}
education_order = [
    'Less than high school', 
    'High school degree', 
    'Some college', 
    'Associate or bachelor degree', 
    'Graduate degree', 
    'PhD or professional degree'
]
education_map = {
    'Less than high school': 0,
    'High school degree': 1,
    'Some college': 2,
    'Associate or bachelor degree': 3,
    'Graduate degree': 4,
    'PhD or professional degree': 5
}
df['Education_Level'] = df['Education'].map(education_map).fillna(-1).astype(int)
# (Age_Group: Ordinal encoding for Age, capturing generational differences in regional identification)
# Input samples: {'Age': ['18-29', '30-44', '45-64', '65+']}
age_map = {
    '18-29': 0,
    '30-44': 1,
    '45-64': 2,
    '65+': 3
}
df['Age_Group