## Global arguments

In [12]:
num_people = 5 # Number of respondents sampled from the GSS file (passed as n_sample when loading) and then simulated.
num_workers = 4 # Thread-pool size (max_workers) used by the parallel simulation cell.

## How to preprocess the data

### Loading the GSS dataset

In [13]:
import pyreadstat
import pandas as pd

from typing import Any

def from_gss(file_path: str, n_sample: int | None = None, random_state: int = 42) -> tuple[pd.DataFrame, Any]:
    """Read an SPSS ``.sav`` file and optionally keep a random subset of its rows.

    Args:
        file_path: Location of the ``.sav`` file handed to ``pyreadstat.read_sav``.
        n_sample: Number of rows to draw; ``None`` keeps every row.
        random_state: Seed forwarded to ``DataFrame.sample`` so the draw is repeatable.

    Returns:
        ``(frame, metadata)`` as produced by ``pyreadstat.read_sav``. When a sample
        is drawn, the frame's index is reset to ``0..n_sample-1``.
    """
    frame, metadata = pyreadstat.read_sav(file_path)
    if n_sample is None:
        return frame, metadata
    subset = frame.sample(n=n_sample, random_state=random_state)
    return subset.reset_index(drop=True), metadata

# Load the 2024 GSS SPSS file and keep a seeded random sample of `num_people` respondents.
data, meta = from_gss("./datasets/2024_spss/2024/GSS2024.sav", n_sample=num_people)
print(f"data.shape = {data.shape}")
# Last expression of the cell, so the first rows are rendered as the cell output.
data.head()

data.shape = (5, 813)


Unnamed: 0,fileversion,year,id,abany,abanyg,abdefect,abdefectg,abhlth,abhlthg,abnomore,...,wrkstat,wrkwayup,wrldgovt,wtssnrps,wtssps,xmarsex,xmovie,xmoviey,xnorcsiz,yousup
0,7224.2,2024.0,53.0,1.0,,1.0,,1.0,,1.0,...,1.0,4.0,,0.44941,0.359231,1.0,,,1.0,
1,7224.2,2024.0,681.0,,,,,,,,...,1.0,3.0,,0.500233,0.65602,,1.0,,1.0,
2,7224.2,2024.0,1377.0,1.0,1.0,1.0,,1.0,,1.0,...,5.0,,,1.11206,1.108583,1.0,2.0,,3.0,
3,7224.2,2024.0,1238.0,,,,,,,,...,1.0,,,2.569419,2.363848,,2.0,,6.0,
4,7224.2,2024.0,204.0,,,,,,,,...,5.0,1.0,1.0,0.602956,0.551818,,,1.0,3.0,


### Integer, floating, or nominal/ordinal?

In [14]:
from typing import Literal

def infer_resp_type(col: str, df: pd.DataFrame, meta: Any) -> Literal["int", "float", "nominal/ordinal", "unknown"]:
    """Decide how answers to column ``col`` should be elicited.

    A column is categorical ("nominal/ordinal") when every value observed in
    ``df`` has a text label in ``meta.variable_value_labels``. As soon as one
    observed value is unlabelled the column is treated as numeric: "int" when
    all observed values are whole numbers, "float" otherwise.

    Checking the values themselves, rather than comparing the number of labels
    with the number of distinct values, keeps the result stable on small
    samples. A numeric column with a single top-coded label (e.g. ``89 ->
    "89+"``) and only one distinct observed value is still numeric.

    Args:
        col: Column name in ``df``.
        df: Survey responses; missing answers are NaN.
        meta: Metadata object exposing ``variable_value_labels`` (a mapping
            ``column -> {code: label}``).

    Returns:
        "int", "float" or "nominal/ordinal"; "unknown" when the column has
        neither usable labels nor any observed value to inspect.
    """
    observed = df[col].dropna().unique()
    # Labels attached to negative codes are ignored (presumably reserved
    # missing-data codes - TODO confirm against the codebook).
    val2txt = {k: v for k, v in meta.variable_value_labels.get(col, {}).items() if 0 <= k}

    if not val2txt and len(observed) == 0:
        # Nothing to go on: no options to offer and no range to ask about.
        return "unknown"

    if any(value not in val2txt for value in observed):
        if all(float(value).is_integer() for value in observed):
            return "int"
        return "float"
    return "nominal/ordinal"

### Exploring the GSS dataset

In [15]:
from ipywidgets import interact, IntRangeSlider

def inspect_column(variable: str, from_to: tuple[int, int]) -> None:
    """Print one column's question text, inferred response type and a slice of raw answers.

    Reads the notebook-level ``data`` and ``meta`` objects.

    Args:
        variable: Column of ``data`` to inspect.
        from_to: ``(start, end)`` row positions; rows ``start`` up to but not
            including ``end`` are printed.
    """
    ctype = infer_resp_type(variable, data, meta)
    print(f"Question: {meta.column_names_to_labels.get(variable)} ({ctype})")

    # Looked up once; the per-row loop below reuses it.
    all_labels = meta.variable_value_labels.get(variable, {})
    if ctype == "nominal/ordinal":
        # Joined outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        listing = "\n  ".join(f"{k}: {v}" for k, v in all_labels.items() if 0 <= k)
        print(f"Value labels:\n  {listing}")

    start, end = from_to
    for offset, value in enumerate(data[variable].iloc[start:end].tolist()):
        # The per-row lookup uses the unfiltered labels, so negative codes still get their text.
        label = all_labels.get(value)
        suffix = f", i.e., {label}" if label is not None else ""
        print(f"{start + offset}: {value}{suffix}")

# Browse any column interactively: the dropdown picks the variable, the range slider picks the rows.
interact(
    inspect_column,
    # Plain list of names, matching how the later interact_manual cell passes its column options.
    variable=list(data.columns),
    from_to=IntRangeSlider(
        min=0,
        max=len(data),
        step=1,
        # Default to the first ten rows, but never past the end of a sample smaller than ten.
        value=[0, min(10, len(data))],
        description='Select Rows'
    )
);

interactive(children=(Dropdown(description='variable', options=('fileversion', 'year', 'id', 'abany', 'abanyg'…

## How to make predictions

### Which race is this persona?

In [16]:
from collections.abc import Mapping

def race_from_gss(row: pd.Series, df: pd.DataFrame, meta: Any) -> str | None:
    relevant = [
        'raceacs1',
        'raceacs2', 
        'raceacs3',
        'raceacs4',
        'raceacs5',
        'raceacs6',
        'raceacs7',
        'raceacs8',
        'raceacs9',
        'raceacs10',
        'raceacs14',
        'raceacs16'
    ]
    races = df[relevant].columns.tolist()

    stack: list[str] = []
    for code in races:
        label = meta.column_names_to_labels.get(code)
        flag = meta.variable_value_labels.get(code, {}).get(row[code])
        if flag == "yes":
            stack.append(label)
    
    if not stack:
        return None
    elif len(stack) == 1:
        return stack[0]
    else:
        return ", ".join(stack[:-1]) + f", and {stack[-1]}"

### Making persona descriptions

In [17]:

def persona_desc(row: pd.Series, df: pd.DataFrame, meta: Any) -> Mapping[str, str]:
    """Turn one respondent's row into a dict of readable persona attributes.

    Coded answers are translated through ``meta.variable_value_labels``; age is
    rendered as a whole-number string; race comes from ``race_from_gss``.
    Attributes that resolve to ``None`` (missing answer or unlabelled code) are
    left out of the result.

    Args:
        row: One respondent's answers.
        df: Full response frame, forwarded to ``race_from_gss``.
        meta: Metadata exposing ``variable_value_labels``.

    Returns:
        Mapping from attribute name to its text value.
    """
    value_labels = meta.variable_value_labels

    def label_of(column: str) -> str | None:
        # Text label of this respondent's code in `column`, or None if there is none.
        return value_labels.get(column, {}).get(row[column])

    candidates = {
        "age": None if pd.isna(row["age"]) else str(int(row["age"])),
        "sex": label_of("sex"),
        "race": race_from_gss(row, df, meta),
        "religion": label_of("relig"),
        "marital_status": label_of("marital"),
        "employment_status": label_of("wrkstat"),
        "political_views": label_of("polviews"),
        "born_in_usa": label_of("born"),
        "education_level": label_of("educ"),
    }
    return {field: text for field, text in candidates.items() if text is not None}

### Exploring persona descriptions

In [18]:
def inspect_persona(idx: int) -> None:
    """Print the persona attributes of the respondent at row position ``idx``.

    Reads the notebook-level ``data`` and ``meta`` objects.
    """
    description = persona_desc(data.iloc[idx], data, meta)
    print(f"Persona #{idx}:")
    for field, text in description.items():
        print(f"  {field}: {text}")

# Integer slider over row positions 0..len(data)-1; the trailing semicolon keeps the call's return value out of the cell output.
interact(inspect_persona, idx=(0, len(data)-1, 1));

interactive(children=(IntSlider(value=2, description='idx', max=4), Output()), _dom_classes=('widget-interact'…

### Making predictions

In [19]:
from genagents.genagents import GenerativeAgent

def _single_answer(payload: Any) -> Any | None:
    """Return the lone item of ``payload["responses"]``; None if absent or not exactly one item."""
    if payload is None or "responses" not in payload:
        return None
    answers = payload["responses"]
    if answers and len(answers) == 1:
        return answers[0]
    return None


def respond(col: str, idx: int, df: pd.DataFrame, meta: Any) -> float | None:
    """Ask a generative agent, primed with respondent ``idx``'s persona, to answer question ``col``.

    Numeric columns ("int"/"float") are asked for a number within the range
    observed in ``df[col]``. Categorical columns are asked to pick one of the
    non-negative value labels, and the matching code is returned.

    Args:
        col: Column (question) to answer.
        idx: Row position of the respondent in ``df``.
        df: Survey responses.
        meta: Metadata exposing ``column_names_to_labels`` and
            ``variable_value_labels``.

    Returns:
        The predicted value, or ``None`` when any of these holds:
        - the column type is not supported;
        - there are no options to offer;
        - the agent did not return exactly one response;
        - the response cannot be mapped back to a value.
    """
    row = df.iloc[idx]
    resp_type = infer_resp_type(col, df, meta)
    labels = {k: v for k, v in meta.variable_value_labels.get(col, {}).items() if 0 <= k}

    is_numeric = resp_type in ("int", "float")
    # A dict is never None, so the meaningful check is "are there any options to offer".
    if not is_numeric and not (resp_type == "nominal/ordinal" and labels):
        return None

    agent = GenerativeAgent()
    agent.update_scratch(persona_desc(row, df, meta))
    question = meta.column_names_to_labels.get(col)

    if is_numeric:
        observed = df[col].dropna()
        # Named lowest/highest so the min/max builtins are not shadowed.
        lowest, highest = float(observed.min()), float(observed.max())
        answer = _single_answer(
            agent.numerical_resp({question: [lowest, highest]}, float_resp=(resp_type == "float"))
        )
        if answer is None:
            return None
        try:
            return float(answer)
        except (TypeError, ValueError):
            # A non-numeric reply counts as "no usable answer", like the other failure paths.
            return None

    # Hand over a real list rather than a dict_values view
    # (assumes the agent expects a list of option strings - TODO confirm).
    answer = _single_answer(agent.categorical_resp({question: list(labels.values())}))
    if answer is None:
        return None
    for code, text in labels.items():
        if text == answer:
            return code
    return None

### Exploring predictions

In [20]:
from ipywidgets import interact_manual

def inspect_response(col: str, idx: int) -> None:
    """Print the simulated answer next to the recorded answer for one respondent and question.

    Reads the notebook-level ``data`` and ``meta`` objects and calls
    ``respond``, so every invocation issues an agent request.

    Args:
        col: Column (question) to predict.
        idx: Row position of the respondent in ``data``.
    """
    column = data[col]
    ctype = infer_resp_type(col, data, meta)
    print(f"Question: {meta.column_names_to_labels.get(col)} ({ctype})")

    # Always bound: stays empty for numeric columns, which switches off the ", i.e., ..." suffix below.
    val2txt = {}
    if ctype == "nominal/ordinal":
        val2txt = {k: v for k, v in meta.variable_value_labels.get(col, {}).items() if 0 <= k}
        # Joined outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        listing = "\n  ".join(f"{k}: {v}" for k, v in val2txt.items())
        print(f"Value labels:\n  {listing}")

    resp = respond(col, idx, data, meta)
    for caption, value in (("Response", resp), ("Target", column.iloc[idx])):
        suffix = f", i.e., {val2txt.get(value)}" if value in val2txt else ""
        print(f"{caption}: {value}{suffix}")

# interact_manual is used here instead of interact, so inspect_response (and its respond() call) runs only on request.
interact_manual(inspect_response, col=list(data.columns), idx=(0, len(data)-1, 1));

interactive(children=(Dropdown(description='col', options=('fileversion', 'year', 'id', 'abany', 'abanyg', 'ab…

### Running simulations

In [21]:
# Columns (survey questions) the simulation cell asks the agent to answer for every sampled respondent.
cols_to_predict = [
    'natspacy',
    'natenviy',
    'nathealy',
    'natcityy',
    'natdrugy',
    'nateducy',
    'natracey',
    'natarmsy',
    'nataidy',
    'natfarey',
    'natroad',
    'natsoc',
    'natspac',
    'natenvir',
    'natheal',
    'natcity',
    'natdrug',
    'nateduc',
    'natrace',
    'natarms',
    'nataid',
    'natfare',
    'natchld',
    'natsci',
    'natenrgy',
    'prayer',
    'courts',
    'discaffw',
    'discaffm',
    'fehire',
    'fechld',
    'fepresch',
    'fefam',
    'fepol',
    'reg16',
    'mobile16',
    'famdif16',
    'incom16',
    'dwelown16',
    'paeduc',
    'padeg',
    'maeduc',
    'madeg',
    'mawrkgrw',
    'marital',
    'widowed',
    'divorce',
    'martype',
    'posslqy',
    'wrkstat',
    'evwork',
    'wrkgovt1',
    'wrkgovt2',
    'partfull',
    'wksub1',
    'wksup1',
    'conarmy',
    'conbus',
    'conclerg',
    'coneduc',
    'confed',
    'confinan',
    'conjudge',
    'conlabor',
    'conlegis',
    'conmedic',
    'conpress',
    'consci',
    'contv',
    'vetyears',
    'joblose',
    'jobfind',
    'happy',
    'hapmar',
    'satjob',
    'speduc',
    'spdeg',
    'spwrksta',
    'spfund',
    'unemp',
    'union1',
    'spkathy',
    'libathy',
    'colath',
    'spkracy',
    'libracy',
    'spkcomy',
    'libcomy',
    'colcomy',
    'colrac',
    'spkmslmy',
    'libmslmy',
    'cappun',
    'polhitoky',
    'polabusey',
    'polattaky',
    'grass',
    'gunlaw',
    'owngun',
    'hunt1',
    'class',
    'satfin',
    'finalter',
    'finrela',
    'race',
    'racdif1',
    'racdif2',
    'racdif3',
    'racdif4',
    'wlthwhts',
    'wlthblks',
    'wlthhsps',
    'racwork',
    'letin1a',
    'getahead',
    'parsol',
    'kidssol',
    'spanking',
    'divlaw',
    'sexeduc',
    'pillok',
    'xmarsex',
    'homosex',
    'discaff',
    'abdefect',
    'abnomore',
    'abhlth',
    'abpoor',
    'abrape',
    'absingle',
    'abany',
    'letdie1',
    'suicide1',
    'suicide2',
    'suicide4',
    'pornlaw',
    'fair',
    'helpful',
    'trust',
    'tax',
    'vote16',
    'pres16',
    'if16who',
    'polviews',
    'partyid',
    'news',
    'relig',
    'relig16',
    'attend',
    'pray',
    'postlife',
    'bible',
    'reborn',
    'relpersn',
    'sprtprsn',
    'born',
    'granborn',
    'uscitzn',
    'educ',
    'degree',
    'income',
    'visitors',
    'dwelown',
    'othlang',
    'sex',
    'hispanic',
    'health',
    'compuse',
    'webmob',
    'xmovie',
    'life',
    'richwork'
]

# Sanity check: the second count is over distinct names that exist as columns of `data`,
# so it equals the first only when every entry is present and the list has no duplicates.
print(f"# of columns to predict: {len(cols_to_predict)}")
print(f"# of columns also in GSS: {len({k for k in cols_to_predict if k in data})}")

# of columns to predict: 172
# of columns also in GSS: 172


In [22]:
# Deliberate stop: this always raises AssertionError, so the expensive simulation cell after it has to be started by hand.
assert False, "This is put here for safety; the following cell is expensive to run."

AssertionError: This is put here for safety; the following cell is expensive to run.

In [None]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Requested columns that exist in the loaded extract (order kept, duplicates dropped).
target_cols = list(dict.fromkeys(col for col in cols_to_predict if col in data.columns))

pred = data.copy()
if target_cols:
    # Blank the target columns before filling them. Without this, every cell whose
    # prediction fails or comes back as None would silently keep the respondent's
    # real answer and later look like a perfect prediction.
    pred[target_cols] = float("nan")

# NOTE(review): persona_desc feeds the agent sex, relig, marital, wrkstat, polviews,
# born and educ, all of which are also in cols_to_predict - confirm this leakage is intended.
tasks = [(col, idx) for idx in range(len(data)) for col in target_cols]
with ThreadPoolExecutor(max_workers=num_workers) as exe:
    futures = {exe.submit(respond, col, idx, data, meta): (col, idx) for col, idx in tasks}
    progress = tqdm(
        as_completed(futures),
        total=len(futures),
        desc=f"{len(data)} respondents x {len(target_cols)} questions",
    )
    for future in progress:
        col, idx = futures[future]
        try:
            result = future.result()
            if result is not None:
                # idx is a row position; it doubles as the row label here because
                # from_gss resets the index after sampling.
                pred.at[idx, col] = result
        except Exception as err:
            # Best effort: one failed request must not abort the whole run.
            print(f"[ERROR] in col {col}, idx {idx}: {err}")