# Analysis on the imputed dataset

# Select and recode the variables

In [1]:
from __future__ import annotations

from pathlib import Path
from typing import Iterator, Tuple

import pandas as pd

# Rebuild the SDT subset from the imputed WVS extract so that every downstream
# model is fitted against one shared schema.
# All paths hang off the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
SOURCE_FILE = DATA_DIR.joinpath("WVS_imputed_median.csv")  # input: imputed extract
OUTPUT_FILE = DATA_DIR.joinpath("lifesat_sdt_subset_imputed.csv")  # output: subset

# Outcome variable: its column name in the source file and its name after renaming.
TARGET_COLUMN = "Q49"  # Life satisfaction item
TARGET_RENAMED = "LifeSat"

# Predictor columns to keep, organised by group.
# Outer key: short group label. Inner mapping: source column name in the
# input CSV -> readable name used in the output subset.
# NOTE(review): "com" / "aut" / "rel" presumably abbreviate the SDT needs
# (competence, autonomy, relatedness) — confirm with the analysis plan.
# Commented-out entries are currently excluded from the subset.
COLUMN_GROUPS: dict[str, dict[str, str]] = {
    "com": {
        "Q47": "SHealth",
        "Q50": "FinSat",
        "Q56": "FinSat_ComParent",
        "Q142": "RiskUnemployed",
        "Q143": "EducationNextGen",
        # "Q275": "Education",
        # "Q279": "PaidEmployment",
        # "Q281": "OccupationalGroup",
        # "Q285": "ChiefWageEarner",
        # "Q286": "FamFin",
        # "Q287": "SocialStatus",
        # "Q288": "HouseholdSocialStatus",
    },
    "aut": {
        "Q48": "FreeChoice",
        "Q131": "Security",
        "Q146": "PublicSecurity_War",
        "Q147": "PublicSecurity_Terrorism",
        "Q148": "PublicSecurity_CivilWar",
        "Q251": "Democracy",
        "Q253": "HumanRights",
    },
    "rel": {
        # Trust items
        "Q57": "Trust",
        "Q58": "Trust_Family",
        "Q59": "Trust_Neighbors",
        "Q60": "Trust_Acquaintances",
        "Q61": "Trust_Strangers",
        "Q62": "Trust_OtherReligion",
        "Q63": "Trust_OtherNationality",
        # Organisation membership items
        "Q94": "Membership_Religious",
        "Q95": "Membership_Sport",
        "Q96": "Membership_Art",
        "Q97": "Membership_LaborUnion",
        "Q98": "Membership_Political",
        "Q99": "Membership_Environmental",
        "Q100": "Membership_Professional",
        "Q101": "Membership_Charity",
        "Q102": "Membership_Consumer",
        "Q103": "Membership_SelfHelp",
        "Q104": "Membership_Women",
        "Q105": "Membership_Other",
        # Religion items
        "Q164": "GodImportance",
        "Q171": "ReligiousAttendance",
        "Q172": "Pray",
        # National pride and closeness items
        "Q254": "NationalPride",
        "Q255": "CloseToTown",
        "Q256": "CloseToRegion",
        "Q257": "CloseToCountry",
        "Q258": "CloseToContinent",
        "Q259": "CloseToWorld",
        # "Q269": "Citizenship",
        # "Q270": "NFamilyMembers",
        # "Q274": "NChildren",
    },
}

def column_pairs() -> Iterator[Tuple[str, str]]:
    """Yield ``(source_column, renamed)`` pairs from every group in ``COLUMN_GROUPS``.

    Pairs are produced in the insertion order of the groups and, within each
    group, in the insertion order of its entries.
    """
    for group in COLUMN_GROUPS.values():
        # Each item of the inner dict is already a (source, renamed) tuple.
        yield from group.items()

# Flatten the grouped mapping once; the column lists below reuse this ordering.
predictor_pairs = list(column_pairs())

# Columns to load from the source file: the target first, then each predictor.
required_columns = [TARGET_COLUMN] + [source for source, _ in predictor_pairs]

# Source name -> readable name, for the target and every predictor.
rename_map = {TARGET_COLUMN: TARGET_RENAMED, **dict(predictor_pairs)}

# Final column layout: renamed target first, then predictors in mapping order.
ordered_columns = [TARGET_RENAMED] + [renamed for _, renamed in predictor_pairs]

# Load only the needed columns, rename them, and put them in the final order.
frame = pd.read_csv(SOURCE_FILE, usecols=required_columns).rename(
    columns=rename_map
)[ordered_columns]

# Persist the subset without the index column and report the row count.
frame.to_csv(OUTPUT_FILE, index=False)
print(f"Saved imputed SDT subset with {frame.shape[0]} rows to {OUTPUT_FILE.name}.")

Saved imputed SDT subset with 95800 rows to lifesat_sdt_subset_imputed.csv.
