In [2]:
import os
import pandas as pd
import json
from pathlib import Path
from collections import defaultdict

# ==========================
# 1️⃣ AUTO-DETECT WORKING DIRECTORY
# ==========================
try:
    BASE_DIR = Path(os.path.abspath(__file__)).parent
except NameError:
    BASE_DIR = Path.cwd()  # for interactive use like Jupyter

INPUT_DIR = BASE_DIR / "extracted_excel"
LOOKUP_PATH = BASE_DIR / "question_to_filename_lookup.json"
OUTPUT_DIR = BASE_DIR / "county_excel"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==========================
# 2️⃣ LOAD LOOKUP JSON
# ==========================
with open(LOOKUP_PATH, "r", encoding="utf-8") as f:
    question_lookup = json.load(f)

# ==========================
# 3️⃣ COMBINE BY QUESTION
# ==========================
combined_data = defaultdict(list)
excel_files = list(INPUT_DIR.glob("*_gendered_enterprise.xlsx"))

for file in excel_files:
    try:
        df = pd.read_excel(file)
        county_name = file.stem.split("_")[0]

        for full_question, short_name in question_lookup.items():
            matches = df[df["Question"].str.strip() == full_question.strip()]
            if not matches.empty:
                matches = matches.copy()
                matches["County"] = county_name
                combined_data[short_name].append(matches)

    except Exception as e:
        print(f"❌ Error processing {file.name}: {e}")

# ==========================
# 4️⃣ SAVE OUTPUTS
# ==========================
for short_name, dataframes in combined_data.items():
    combined_df = pd.concat(dataframes, ignore_index=True)
    output_file = OUTPUT_DIR / f"{short_name}.xlsx"
    combined_df.to_excel(output_file, index=False)

print(f"✅ Finished combining. Output saved in: {OUTPUT_DIR}")


❌ Error processing ~$Bomet_gendered_enterprise.xlsx: Excel file format cannot be determined, you must specify an engine manually.
❌ Error processing ~$Bungoma_gendered_enterprise.xlsx: Excel file format cannot be determined, you must specify an engine manually.
❌ Error processing ~$Busia_gendered_enterprise.xlsx: Excel file format cannot be determined, you must specify an engine manually.
❌ Error processing ~$Elgeyo_gendered_enterprise.xlsx: Excel file format cannot be determined, you must specify an engine manually.
✅ Finished combining. Output saved in: d:\AAAA_Data\GENDER\county_excel
