## Preamble

In [1]:
import json
import pyreadstat
import pandas as pd

from typing import Any
from ipywidgets import interact
from collections import OrderedDict
from tqdm import tqdm
from colorama import Fore, Style


def spss_to_dict(file_path: str, verbose: bool = False) -> OrderedDict[str, dict[str, Any]]:
    df, meta = pyreadstat.read_sav(file_path)
    df_labeled = pyreadstat.set_value_labels(df, meta)
    gss_resp: OrderedDict[str, dict[str, Any]] = OrderedDict()
    for col in tqdm(df_labeled.columns, disable=not verbose, desc=r"[cols processed]"):
        clean_resp: list[str | None] = []
        for val in df_labeled[col].tolist():
            if pd.isna(val):
                clean_resp.append(None)
            else:
                clean_resp.append(str(val))
        support: OrderedDict[int, str] | None = None
        if col in meta.variable_value_labels:
            support = OrderedDict((code, label) for code, label in sorted(
                meta.variable_value_labels[col].items(), key=lambda x: x[0]
            ) if 0 <= code)
        gss_resp[col] = {
            "question": _no_qst_abbr(meta.column_names_to_labels.get(col, None)),
            "type": _infer_type(clean_resp, support),
            "responses": clean_resp,
            "support": support
        }
    return gss_resp

def _no_qst_abbr(text: str | None) -> str | None:
    if text is None:
        return None
    text = text.strip()
    if text.endswith(" r"):
        text = text[:-1] + "respondent"
    if text.startswith("r'") or text.startswith("r "):
        text = "respondent" + text[1:]
    text = text.replace(" r'", " respondent'")
    text = text.replace(" r ", " respondent ")
    return text

def _infer_type(resps: list[str | None], support: OrderedDict[int, str] | None) -> str:
    if _is_number(resps, support):
        return "scalar"
    return "categorical"

def _is_number(resps: list[str | None], support: OrderedDict[int, str] | None) -> bool:
    support = OrderedDict() if support is None else support
    resps = [r for r in resps if r is not None]
    if len(support) < len({r for r in resps}):
        return True
    return False

In [8]:
def col(txt: str, col: str) -> str:
    return f"{col}{txt}{Style.RESET_ALL}"

In [29]:
core_vars_superset = [
    # Demographic and life-course data
    'AGE', 'ZODIAC', 'COHORT',
    'SEX', 'SEXBIRTH1', 'SEXNOW1', 'SEXNOW2',
    'RACE', 'RACECEN1', 'RACECEN2', 'RACECEN3',
    'RACEACS1', 'RACEACS2', 'RACEACS3', 'RACEACS4', 'RACEACS5',
    'RACEACS6', 'RACEACS7', 'RACEACS8', 'RACEACS9', 'RACEACS10',
    'RACEACS11', 'RACEACS12', 'RACEACS13', 'RACEACS14', 'RACEACS15', 'RACEACS16',
    'ETHNIC', 'ETH1', 'ETH2', 'ETH3',
    'ETHWORLD1', 'ETHWORLD2', 'ETHWORLD3', 'ETHWORLD4', 'ETHWORLD5',
    'ETHWORLD6', 'ETHWORLD7', 'ETHWORLD8', 'ETHWORLD9',
    'ETHREGION1', 'ETHREGION2', 'ETHREGION3', 'ETHREGION4', 'ETHREGION5',
    'ETHREGION6', 'ETHREGION7', 'ETHREGION8', 'ETHREGION9', 'ETHREGION10',
    'ETHREGION11', 'ETHREGION12', 'ETHREGION13', 'ETHREGION14', 'ETHREGION15',
    'ETHREGION16', 'ETHREGION17', 'ETHREGION18', 'ETHREGION19', 'ETHREGION20',
    'ETHREGION21', 'ETHREGION22', 'ETHREGION23', 'ETHREGION24', 'ETHREGION25',
    'ETHREGION26', 'ETHREGION27', 'ETHREGION28', 'ETHREGION29', 'ETHREGION30',
    'ETHREGION31', 'ETHREGION32', 'ETHREGION33', 'ETHREGION34', 'ETHREGION35',
    'ETHREGION36', 'ETHREGION37', 'ETHREGION38', 'ETHREGION39', 'ETHREGION40',
    'ETHREGION41', 'ETHREGION42', 'ETHREGION43', 'ETHREGION44', 'ETHREGION45',
    'ETHREGION46', 'ETHREGION47', 'ETHREGION48', 'ETHREGION49', 'ETHREGION50',
    'ETHREGION51', 'ETHREGION52', 'ETHREGION53', 'ETHREGION54', 'ETHREGION55',
    'ETHREGION56', 'ETHREGION57', 'ETHREGION58', 'ETHREGION59', 'ETHREGION60',
    'ETHREGION61', 'ETHREGION62', 'ETHREGION63', 'ETHREGION64', 'ETHREGION65',
    'ETHREGION66', 'ETHREGION67', 'ETHREGION68', 'ETHREGION69', 'ETHREGION70',
    'ETHREGION71', 'ETHREGION72', 'ETHREGION73', 'ETHREGION74', 'ETHREGION75',
    'ETHREGION76', 'ETHREGION77', 'ETHREGION78', 'ETHREGION79', 'ETHREGION80',
    'ETHREGION81', 'ETHREGION82', 'ETHREGION83', 'ETHREGION84', 'ETHREGION85',
    'ETHREGION86', 'ETHREGION87', 'ETHREGION88', 'ETHREGION89', 'ETHREGION90',
    'ETHREGION91', 'ETHREGION92', 'ETHREGION93', 'ETHREGION94', 'ETHREGION95',
    'HISPANIC', 'SIBS',
    'MARITAL', 'WIDOWED', 'DIVORCE', 'MARTYPE', 'POSSLQ', 'POSSLQY', 'MARCOHAB',
    'CHILDS', 'AGEKDBRN',
    'REGION', 'XNORCSIZ', 'SRCBELT', 'SIZE',
    'EDUC', 'DEGREE', 'MAJOR1', 'MAJOR2', 'DIPGED',
    'RELIG', 'DENOM', 'OTHER', 'JEW', 'RELITEN', 'RELITENV', 'RELITENNV',
    'FUND', 'RELTRAD', 'NORELGSP',
    'INCOME', 'INCOME16', 'CONINC', 'REALINC',
    'USCITZN', 'USCITZNV', 'USCITZNNV', 'FUCITZN', 'FUCITZNV', 'FUCITZNNV',
    'MNTHSUSA', 'YEARSUSA',
    'OTHLANG', 'OTHLANG1', 'OTHLANG2', 'BETRLANG', 'SPKLANG',
    'SEXORNT',
    
    # Work/employment data
    'WRKSTAT', 'EVWORK', 'WRKSLF', 'WRKSLFFAM', 'WHATSLF2',
    'INDUS10', 'WRKGOVT1', 'WRKGOVT2', 'NUMEMPS',
    'HRS1', 'HRS2', 'WEEKSWRK', 'PARTFULL',
    'OCC10', 'PRESTG10', 'PRESTG105PLUS', 'SEI10', 'SEI10EDUC', 'SEI10INC',
    'RINCOME', 'RINCOM16', 'CONRINC', 'REALRINC',
    'LOCALNUM',
    'WKSUB', 'WKSUBS', 'WKSUP', 'WKSUPS', 'WKSUB1', 'WKSUBS1',
    'WKSUP1', 'WKSUPS1', 'YOUSUP', 'SPWKSUP', 'COWKSUP',
    
    # Military data
    'VETYEARS', 'VIETDRAFT',
    
    # Spouse/partner data
    'SPEDUC', 'SPDEG', 'COEDUC', 'CODEG',
    'SPWRKSTA', 'SPEVWORK', 'SPWRKSLF', 'SPWRKSLFFAM', 'SPWRKSLF2', 'WHATSP2',
    'COWRKSTA', 'COEVWORK', 'COWRKSLF', 'COWRKSLFFAM', 'COWRKSLF2', 'WHATCO2',
    'SPIND10', 'COIND10',
    'SPHRS1', 'SPHRS2', 'COHRS1', 'COHRS2',
    'SPOCC10', 'COOCC10',
    'SPPRES10', 'SPPRES105PLUS', 'SPSEI10', 'SPSEI10EDUC', 'SPSEI10INC',
    'COPRES10', 'COPRES105PLUS', 'COSEI10', 'COSEI10EDUC', 'COSEI10INC',
    'SPREL', 'SPDEN', 'SPOTHER', 'SPJEW', 'SPFUND', 'SPNORELGSP',
    'COREL', 'CODEN', 'COOTHER', 'COJEW', 'COFUND', 'CONORELGSP',
    
    # Parental/social origin data
    'BORN', 'INCOM16', 'FAMILY16', 'FAMILY16SEX', 'FAMDIF16', 'FAMDIF16Y',
    'PAEDUC', 'PADEG', 'MAEDUC', 'MADEG',
    'PAWRKSLF', 'PAWRKSLF2', 'PAWRKSLFFAM', 'WHATPA2', 'PAWRKGRW',
    'PAOCC10', 'PAIND10',
    'PAPRES10', 'PAPRES105PLUS', 'PASEI10', 'PASEI10EDUC', 'PASEI10INC',
    'MAWRKGRW', 'MAWRKSLF', 'MAWRKSLF2', 'MAWRKSLFFAM', 'WHATMA2',
    'MAOCC10', 'MAIND10',
    'MAPRES10', 'MAPRES105PLUS', 'MASEI10', 'MASEI10EDUC', 'MASEI10INC',
    'MABORN', 'PABORN', 'PARBORN', 'GRANBORN',
    'REG16', 'RES16', 'MOBILE16',
    'RELIG16', 'DENOM16', 'OTH16', 'JEW16', 'FUND16', 'NORELGSP16',
    
    # Household variables
    'HOMPOP', 'BABIES', 'PRETEEN', 'TEENS', 'ADULTS', 'UNRELAT',
    'ADULTSINHH', 'CHILDSINHH',
    'RELATE1', 'RELATE2', 'RELATE3', 'RELATE4', 'RELATE5', 'RELATE6',
    'RELATE7', 'RELATE8', 'RELATE9', 'RELATE10', 'RELATE11', 'RELATE12',
    'RELATE13', 'RELATE14',
    'RELHHD1', 'RELHHD2', 'RELHHD3', 'RELHHD4', 'RELHHD5', 'RELHHD6',
    'RELHHD7', 'RELHHD8', 'RELHHD9', 'RELHHD10', 'RELHHD11', 'RELHHD12',
    'RELHHD13', 'RELHHD14',
    'RELHH1', 'RELHH2', 'RELHH3', 'RELHH4', 'RELHH5', 'RELHH6',
    'RELHH7', 'RELHH8', 'RELHH9', 'RELHH10', 'RELHH11', 'RELHH12',
    'RELHH13', 'RELHH14',
    'RELSP1', 'RELSP2', 'RELSP3', 'RELSP4', 'RELSP5', 'RELSP6',
    'RELSP7', 'RELSP8', 'RELSP9', 'RELSP10', 'RELSP11', 'RELSP12',
    'RELSP13', 'RELSP14',
    'GENDER1', 'GENDER2', 'GENDER3', 'GENDER4', 'GENDER5', 'GENDER6',
    'GENDER7', 'GENDER8', 'GENDER9', 'GENDER10', 'GENDER11', 'GENDER12',
    'GENDER13', 'GENDER14',
    'OLD1', 'OLD2', 'OLD3', 'OLD4', 'OLD5', 'OLD6',
    'OLD7', 'OLD8', 'OLD9', 'OLD10', 'OLD11', 'OLD12',
    'OLD13', 'OLD14',
    'MAR1', 'MAR2', 'MAR3', 'MAR4', 'MAR5', 'MAR6',
    'MAR7', 'MAR8', 'MAR9', 'MAR10', 'MAR11', 'MAR12',
    'MAR13', 'MAR14',
    'AWAY1', 'AWAY2', 'AWAY3', 'AWAY4', 'AWAY5', 'AWAY6',
    'AWAY7', 'AWAY8', 'AWAY9', 'AWAY10', 'AWAY11', 'AWAY12',
    'AWAY13', 'AWAY14',
    'WHERE1', 'WHERE2', 'WHERE3', 'WHERE4', 'WHERE5', 'WHERE6',
    'WHERE7', 'WHERE8', 'WHERE9', 'WHERE10', 'WHERE11', 'WHERE12',
    'WHERE13', 'WHERE14',
    'HHRACE', 'HHTYPE', 'HHTYPE1', 'FAMGEN', 'RPLACE',
    'VISITORS', 'RVISITOR', 'EARNRS', 'DWELLING',
    'HOMPOP_EXP', 'BABIES_EXP', 'PRETEEN_EXP', 'TEENS_EXP', 'ADULTS_EXP', 'CHILDS_EXP',
    'HHTYPE1_EXP', 'FAMGEN_EXP',
    
    # Racial composition of neighborhood
    'RACLIVE',
    
    # Technical data and paradata
    'HLTHSTRT', 'HUADD', 'HUADDWHY', 'DWELLPRE', 'KIDSINHH', 'RESPOND',
    'INCUSPOP', 'NEISAFE',
    'RLOOKS', 'RGROOMED', 'RWEIGHT', 'RHLTHEND', 'HUCLEAN', 'COOP', 'COMPREND',
    'CONSENT', 'ADMINCONSENT',
    'WHOELSE1', 'WHOELSE2', 'WHOELSE3', 'WHOELSE4', 'WHOELSE5', 'WHOELSE6',
    'LNGTHINV', 'INTAGE', 'INTETHN', 'INTSEX', 'INTYRS', 'INTHISP',
    'INTRACE1', 'INTRACE2', 'INTRACE3',
    'RATETONE', 'PERLA',
    'MODE', 'MODESEQUENCE',
    'FEEUSED', 'FEELEVEL', 'TOTALINCENTIVE',
    'SPANENG', 'SPANINT', 'SPANSELF',
    'FORMWT', 'OVERSAMP', 'WTSS', 'WTSSALL', 'WTSSNR', 'WTSSPS', 'WTSSNRPS',
    'BALLOTFORMWT', 'BALLOTFORMWTNR', 'WTSSPS_AS', 'WTSSNRPS_AS',
    'ID', 'ANESID', 'HEFINFO', 'HEFINFO1',
    'RESPNUM', 'RESPNUMH',
    'YEAR', 'DATEINTV', 'BALLOT', 'VERSION', 'FORM',
    'ISSP', 'PHONE', 'SAMPLE', 'SAMPCODE', 'PHASE', 'SUBSAMPRATE', 'BATCH',
    'KISH', 'VPSU', 'VSTRAT', 'FILEVERSION', 'DEVTYPE',
    
    # Attitudes and Opinions - Priorities for national spending
    'NATSPAC', 'NATSPACY', 'NATENVIR', 'NATENVIRY', 'NATHEAL', 'NATHEALY',
    'NATCITY', 'NATCITYY', 'NATCRIME', 'NATCRIMEY', 'NATDRUG', 'NATDRUGY',
    'NATEDUC', 'NATEDUCY', 'NATRACE', 'NATRACEY', 'NATARMS', 'NATARMSY',
    'NATAID', 'NATAIDY', 'NATFARE', 'NATFAREY', 'NATROAD', 'NATSOC',
    'NATMASS', 'NATPARK', 'NATCHLD', 'NATSCI', 'NATENRGY',
    
    # Voting and political preferences
    'VOTE16', 'VOTE20', 'PRES16', 'PRES20', 'IF16WHO', 'IF20WHO',
    'POLVIEWS', 'PARTYID',
    
    # Religious attitudes and behavior
    'ATTEND', 'PRAY'
]
core_vars_superset = [v.lower() for v in core_vars_superset]


In [4]:
data = spss_to_dict("./2024_spss/2024/GSS2024.sav", verbose=True)

[cols processed]: 100%|██████████| 813/813 [00:00<00:00, 853.42it/s]


In [51]:
core_vars = {var for var in data.keys() if var.lower() in core_vars_superset}

In [23]:
print(f"{len(data)} questions.")
for k, v in data.items():
    print(f"{col(len(v['responses']), Fore.LIGHTWHITE_EX)} responses for {col(k, Fore.LIGHTRED_EX)}: {col(v['question'], Fore.LIGHTGREEN_EX)}")
    print(f"\tSupport: {" ".join([f"{col(c, Fore.LIGHTRED_EX)}) {col(w, Fore.LIGHTGREEN_EX)};" for c, w in v['support'].items()]) if v['support'] is not None and len(v['support']) > 0 else col('No support', Fore.LIGHTYELLOW_EX)}")
    print(f"\tType: {col(v['type'], Fore.LIGHTBLUE_EX)}")
    print("")

813 questions.
[97m3309[0m responses for [91mfileversion[0m: [92mgss data file version[0m
	Support: [91m7222.1[0m) [92mGSS 1972-2022 Release 1 (May 2023)[0m; [91m7222.2[0m) [92mGSS 1972-2022 Release 2 (November 2023)[0m; [91m7222.21[0m) [92mGSS 1972-2022 Release 2a (January 2024)[0m; [91m7222.3[0m) [92mGSS 1972-2022 Release 3 (April 2024)[0m; [91m7222.31[0m) [92mGSS 1972-2022 Release 3a (April 2024)[0m; [91m7222.4[0m) [92mGSS 1972-2022 Release 4 (November 2024)[0m; [91m7224.1[0m) [92mGSS 1972-2024 Release 1 (May 2025)[0m; [91m7224.2[0m) [92mGSS 1972-2024 Release 2 (October 2025)[0m;
	Type: [94mcategorical[0m

[97m3309[0m responses for [91myear[0m: [92mgss year for this respondent[0m
	Support: [93mNo support[0m
	Type: [94mscalar[0m

[97m3309[0m responses for [91mid[0m: [92mrespondent id number[0m
	Support: [93mNo support[0m
	Type: [94mscalar[0m

[97m3309[0m responses for [91mabany[0m: [92mabortion if woman wants for any re

In [31]:
def inspect_question(code: str, resp_id: int) -> None:
    assert code in data, f"Question code {code} not found."
    qst_data = data[code]
    assert 0 <= resp_id < len(qst_data["responses"]), f"Response ID {resp_id} out of range."
    print(f"Question: {col(qst_data['question'], Fore.LIGHTGREEN_EX)}")
    print(f"Response: {col(qst_data['responses'][resp_id], Fore.LIGHTGREEN_EX)} ({col(qst_data['type'], Fore.LIGHTBLUE_EX)})")
    print(f"In core: {col(code in core_vars, Fore.LIGHTGREEN_EX)}")
    if qst_data['support']:
        print(f"Support:")
        for code, label in qst_data['support'].items():
            print(f"\t{col(code, Fore.LIGHTRED_EX)}: {col(label, Fore.LIGHTGREEN_EX)}")

interact(inspect_question, code=data.keys(), resp_id=(0, 1000, 1));

interactive(children=(Dropdown(description='code', options=('fileversion', 'year', 'id', 'abany', 'abanyg', 'a…

In [32]:
def inspect_person(resp_id: int, verbose: bool, in_core: bool) -> None:
    assert 0 <= resp_id < len(next(iter(data.values()))["responses"]), f"Response ID {resp_id} out of range."
    count = 0
    for qst_code in data.keys():
        qst_data = data[qst_code]
        resp = qst_data['responses'][resp_id]
        if resp is not None and (not in_core or qst_code in core_vars):
            count += 1
            print(f"{count}) {col(qst_code, Fore.LIGHTRED_EX)}: {col(qst_data['question'], Fore.LIGHTGREEN_EX)}")
            print(f"\tResponse: {col(resp, Fore.LIGHTGREEN_EX)} ({col(qst_data['type'], Fore.LIGHTBLUE_EX)})")
            if qst_data['support'] and verbose:
                print(f"\tSupport:")
                for code, label in qst_data['support'].items():
                    print(f"\t\t{col(code, Fore.LIGHTRED_EX)}: {col(label, Fore.LIGHTGREEN_EX)}")
            print("")

    print(f"Included {count} questions.")

interact(inspect_person, resp_id=(0, 1000, 1), verbose=False, in_core=False);

interactive(children=(IntSlider(value=500, description='resp_id', max=1000), Checkbox(value=False, description…

In [56]:
def infer_race(row: dict[str, Any]) -> str | None:
    race_mapping = {
        'raceacs1': 'White',
        'raceacs2': 'Black or African American', 
        'raceacs3': 'American Indian or Alaska Native',
        'raceacs4': 'Asian Indian',
        'raceacs5': 'Chinese',
        'raceacs6': 'Filipino',
        'raceacs7': 'Japanese',
        'raceacs8': 'Korean',
        'raceacs9': 'Vietnamese',
        'raceacs10': 'Other Asian',
        'raceacs14': 'Native Hawaiian or Other Pacific Islander',
        'raceacs15': 'Some Other Race',
        'raceacs16': 'Hispanic or Latino'
    }
    
    identities = []
    for var, label in race_mapping.items():
        if var in row and row[var] == "yes":
            identities.append(label.lower())
    
    if not identities:
        return None
    elif len(identities) == 1:
        return identities[0]
    else:
        if len(identities) == 2:
            return f"{identities[0]}, and {identities[1]}"
        else:
            return ", ".join(identities[:-1]) + f", and {identities[-1]}"

In [59]:
demographic_vars = [
    'age',       # age of respondent
    'sex',       # respondent's sex  
    'race',      # race of respondent
    'relig',     # respondent's religious preference
    'marital',   # marital status
    'wrkstat',   # labor force status
    'polviews',  # think of self as liberal or conservative
    'born',
    'year',
    'educ'
]

def inspect_demographics(resp_id: int, verbose: bool) -> None:
    assert 0 <= resp_id < len(next(iter(data.values()))["responses"]), f"Response ID {resp_id} out of range."
    count = 0
    for qst_code in demographic_vars:
        qst_data = data[qst_code]
        resp = qst_data['responses'][resp_id]
        supp = qst_data['support']
        if qst_code == "race":
            resp = infer_race({k.lower(): v for k, v in zip(data.keys(), [data[k]['responses'][resp_id] for k in data.keys()])})
            supp = None
        if resp is None:
            resp = "not specified"
        print(f"{col(qst_code, Fore.LIGHTRED_EX)}: {col(qst_data['question'], Fore.LIGHTGREEN_EX)}")
        print(f"\tResponse: {col(resp, Fore.LIGHTGREEN_EX)} ({col(qst_data['type'], Fore.LIGHTBLUE_EX)})")
        if supp and verbose:
            print(f"\tSupport:")
            for code, label in supp.items():
                print(f"\t\t{col(code, Fore.LIGHTRED_EX)}: {col(label, Fore.LIGHTGREEN_EX)}")
        print("")

interact(inspect_demographics, resp_id=(0, 1000, 1), verbose=False);

interactive(children=(IntSlider(value=500, description='resp_id', max=1000), Checkbox(value=False, description…

*TODO*
1. Identify which questions belong to GSS Core module and which ones do not.
2. Automate production of a "scratch.json" for each respondent.