In [50]:
# Per-year loader: reads each yearly B27010 extract, maps its long header labels
# to short column names, keeps only rows whose area name is in `valid_states`,
# and collects one frame per usable year in `all_data`.
all_data = []

# Header-label suffix -> short name for every coverage category tracked per age
# band. The empty suffix is the band total. Dict order fixes output column order.
coverage_suffixes = {
    "": "all",
    "!!With one type of health insurance coverage!!With employer-based health insurance only": "employer",
    "!!With one type of health insurance coverage!!With direct-purchase health insurance only": "direct",
    "!!With one type of health insurance coverage!!With Medicare coverage only": "medicare",
    "!!With one type of health insurance coverage!!With Medicaid/means-tested public coverage only": "medicaid",
    "!!With one type of health insurance coverage!!With TRICARE/military health coverage only": "tricare",
    "!!With one type of health insurance coverage!!With VA Health Care only": "va",
    "!!No health insurance coverage": "none",
}

for year in years:
    file = f"ACSDT1Y{year}.B27010-Data.csv"
    path = os.path.join(data_dir, file)
    try:
        # skiprows=1 drops the first line, so the file's second line becomes the header.
        df = pd.read_csv(path, skiprows=1)
    except FileNotFoundError:
        # Same policy as the other checks below: skip the year, but say why.
        print(f"Skipping {year}: {path} not found")
        continue

    # Detect column format for 18/19/20 to 34 age group
    age_18_34_label = next((label for label in ["18 to 34 years", "19 to 34 years", "20 to 34 years"]
                            if any(label in col for col in df.columns)), None)
    if not age_18_34_label:
        print(f"Skipping {year}: no 18/19/20 to 34 age-group column found")
        continue  # skip this year if we can't find the age group

    # Check the metadata column up front so no work is done on an unusable file.
    if "Geographic Area Name" not in df.columns:
        print(f"Skipping {year}: no 'Geographic Area Name' column")
        continue

    # Build column map for this file: both age bands x every coverage category.
    column_map = {
        f"Estimate!!Total!!{age_label}{suffix}": f"{short}_{band}"
        for age_label, band in ((age_18_34_label, "18to34"), ("35 to 64 years", "35to64"))
        for suffix, short in coverage_suffixes.items()
    }

    # Only rename columns that are present. Colons are ignored when matching.
    # NOTE(review): this assumes some vintages write hierarchy labels with a
    # trailing ':' (e.g. "Total:"); without this they would silently become
    # all-NA columns. Confirm against the raw files; it is a no-op otherwise.
    normalized = {str(col).replace(":", ""): col for col in df.columns}
    df = df.rename(columns={normalized[k]: v for k, v in column_map.items() if k in normalized})

    # Add metadata
    df["State"] = df["Geographic Area Name"]
    df["year"] = year
    df = df[df["State"].isin(valid_states)]

    # Keep only the mapped columns plus metadata; reindex creates any mapped
    # column the file lacks and fills it with NaN.
    df = df.reindex(columns=[*column_map.values(), "State", "year"])
    all_data.append(df)

# Combine all years
if not all_data:
    # pd.concat raises an opaque "No objects to concatenate" on an empty list.
    # Keep the same exception type but explain the likely cause.
    raise ValueError(
        "No yearly tables were loaded into all_data; check the input directory, "
        "the list of years, and the column labels of the source files."
    )
df_all = pd.concat(all_data, ignore_index=True)

# Ensure all columns are numeric before summing.
# errors="coerce" turns any cell that cannot be parsed as a number into NaN.
for col in df_all.columns:
    if col not in ["State", "year"]:
        df_all[col] = pd.to_numeric(df_all[col], errors="coerce")

# Create derived columns: each total is the younger-band column plus the
# 35-to-64 column. `+` propagates NaN, so a total is NaN whenever either band
# is missing for that row.
derived_totals = {
    "adult_pop": ("all_18to34", "all_35to64"),
    "ins_employer": ("employer_18to34", "employer_35to64"),
    "ins_direct": ("direct_18to34", "direct_35to64"),
    "ins_medicare": ("medicare_18to34", "medicare_35to64"),
    "ins_medicaid": ("medicaid_18to34", "medicaid_35to64"),
    "uninsured": ("none_18to34", "none_35to64"),
}
for total_name, (younger_col, older_col) in derived_totals.items():
    df_all[total_name] = df_all[younger_col] + df_all[older_col]

# Final dataset: identifiers first, then the derived totals in definition order.
output_columns = ["State", "year"] + list(derived_totals)
df_final = df_all[output_columns]

# The file is written tab-separated even though its name ends in .csv.
df_final.to_csv(
    "/Users/ellenwu/homework5-1/data/output/final_insurance_cleaned.csv",
    sep="\t",
    index=False,
)