In [12]:
import pandas as pd
import os
import json
from rapidfuzz import process

# === Step 1: Paths ===
# Every location is resolved relative to the notebook's working directory.
base_dir = os.getcwd()
input_excel = os.path.join(base_dir, "KNBS_2019_Census_Downloads", "2019-Kenya-population-and-Housing-Census-Population-households-density-by-administrative-units_V3.xlsx")
json_path = os.path.join(base_dir, "Kenya_Admin_Updated_March 2024", "county_subcounty_ward_lookup.json")
output_dir = os.path.join(base_dir, "KNBS_2019_Census_Downloads", "extracted_levels")
os.makedirs(output_dir, exist_ok=True)

# === Step 2: Load data ===
df = pd.read_excel(input_excel)
# Trim padding from the header cells so the "ITEM" check below and the exported
# column names do not depend on stray spaces in the spreadsheet (the per-level
# export cell of this notebook applies the same normalisation).
df.columns = [str(col).strip() for col in df.columns]

if "ITEM" not in df.columns:
    raise ValueError("❌ 'ITEM' column not found.")

# County -> sub-county -> ward lookup used to standardise the parsed names.
with open(json_path, "r", encoding="utf-8") as f:
    lookup_json = json.load(f)

# === Step 3: Prepare loop ===
rows = []                  # one dict per parsed census row
current_county = None      # most recent county seen while walking the sheet
current_subcounty = None   # most recent sub-county under current_county

def match_name(name, options, threshold=85):
    """Return the entry of ``options`` that best fuzzy-matches ``name``.

    Args:
        name: Label to look up (callers in this notebook pass a title-cased
            census label).
        options: Candidate names to match against.
        threshold: Minimum rapidfuzz score a candidate must reach; forwarded
            as ``score_cutoff``.

    Returns:
        The best candidate exactly as it is spelled in ``options``, or
        ``name`` unchanged when there are no candidates or none reaches
        ``threshold``.
    """
    if not options:
        return name  # nothing to compare against

    # rapidfuzz 3.x applies no preprocessing by default, so the comparison
    # would be case-sensitive. Callers pass str.title() output, which can
    # differ from the lookup spelling only in casing (e.g. "Murang'A" vs
    # "Murang'a"); lower-casing both sides keeps that from costing score.
    result = process.extractOne(
        name,
        options,
        processor=str.lower,
        score_cutoff=threshold,
    )
    return result[0] if result else name

# === Step 4: Parse rows based on indent level ===
# The ITEM label encodes the hierarchy through its leading spaces:
#    4 spaces  -> county
#    8 spaces  -> sub-county (needs a county above it)
#   12+ spaces -> handled as ward level (needs a county and a sub-county above it)
# Rows are walked in sheet order; current_county / current_subcounty carry the
# most recent parents down to the rows beneath them.
# NOTE(review): every indent of 12 or more is labelled "Ward" — confirm the
# sheet has no deeper administrative levels that should be kept apart.
county_options = list(lookup_json.keys())  # loop-invariant, so built once
skipped_items = []  # (indent, name) of labelled rows that could not be placed

for _, row in df.iterrows():
    item = row["ITEM"]
    if pd.isna(item):
        continue  # blank spreadsheet line; it never matched a level anyway

    raw = str(item).rstrip()
    # NOTE(review): str.title() also capitalises the letter after an apostrophe
    # ("MURANG'A" -> "Murang'A"); the fuzzy match is relied on to absorb that.
    name = raw.strip().title()

    if name.upper() == "KENYA":
        continue  # Skip country row

    indent = len(raw) - len(raw.lstrip())
    data = row.drop("ITEM").to_dict()  # remove original label column

    if indent == 4:
        # County level: a new county resets the sub-county context.
        current_county = match_name(name, county_options)
        current_subcounty = None
        data.update({
            "County": current_county,
            "SubCounty": None,
            "Ward": None
        })

    elif indent == 8 and current_county:
        # SubCounty level
        subcounty_opts = list(lookup_json.get(current_county, {}).keys())
        current_subcounty = match_name(name, subcounty_opts)
        data.update({
            "County": current_county,
            "SubCounty": current_subcounty,
            "Ward": None
        })

    elif indent >= 12 and current_county and current_subcounty:
        # Ward level
        ward_opts = lookup_json.get(current_county, {}).get(current_subcounty, [])
        ward_name = match_name(name, ward_opts)
        data.update({
            "County": current_county,
            "SubCounty": current_subcounty,
            "Ward": ward_name
        })

    else:
        # Unexpected indent, or a child row with no parent above it. These rows
        # are still left out of the result, but no longer silently.
        skipped_items.append((indent, name))
        continue

    rows.append(data)

if skipped_items:
    print(f"⚠️ Skipped {len(skipped_items)} row(s) with an unexpected indent or missing parent, e.g. {skipped_items[:5]}")

# === Step 5: Save output ===
# Without this guard an empty parse surfaces as an opaque KeyError from the
# column selection below, because an empty frame has no "County" column.
if not rows:
    raise ValueError("❌ No county, subcounty, or ward rows were parsed; nothing to save.")

df_out = pd.DataFrame(rows)

# Reorder for clarity: hierarchy columns first, census measures after them.
order = ["County", "SubCounty", "Ward"]
df_out = df_out[order + [col for col in df_out.columns if col not in order]]

output_path = os.path.join(output_dir, "KNBS_2019_census_all_levels.xlsx")
df_out.to_excel(output_path, index=False)

print(f"✅ Parsed {len(df_out)} rows across county, subcounty, and ward levels.")
print(f"📄 Output saved to: {output_path}")


✅ Parsed 14168 rows across county, subcounty, and ward levels.
📄 Output saved to: c:\Users\Rono\Downloads\AAAA Hackathon\Datasets\KNBS_2019_Census_Downloads\extracted_levels\KNBS_2019_census_all_levels.xlsx


In [13]:
import pandas as pd
import os
import json
from rapidfuzz import process

# === Step 1: Paths ===
# Input, lookup and output locations all hang off the working directory.
base_dir = os.getcwd()
census_dir = os.path.join(base_dir, "KNBS_2019_Census_Downloads")
input_excel = os.path.join(
    census_dir,
    "2019-Kenya-population-and-Housing-Census-Population-households-density-by-administrative-units_V3.xlsx",
)
json_path = os.path.join(
    base_dir,
    "Kenya_Admin_Updated_March 2024",
    "county_subcounty_ward_lookup.json",
)
output_dir = os.path.join(census_dir, "extracted_levels")
os.makedirs(output_dir, exist_ok=True)

# === Step 2: Load data ===
df = pd.read_excel(input_excel)
# Header cells are coerced to text and stripped of surrounding whitespace.
df.columns = [str(label).strip() for label in df.columns]

if "ITEM" not in df.columns:
    raise ValueError("❌ 'ITEM' column not found.")

# Nested lookup used to standardise county / sub-county / ward names.
with open(json_path, "r", encoding="utf-8") as lookup_file:
    lookup_json = json.load(lookup_file)

# === Step 3: Prepare loop ===
# Accumulator for parsed rows plus the parent context tracked by the parser.
rows = []
current_county = current_subcounty = None

def match_name(name, options, threshold=85):
    """Return the entry of ``options`` that best fuzzy-matches ``name``.

    Args:
        name: Label to look up (callers in this notebook pass a title-cased
            census label).
        options: Candidate names to match against.
        threshold: Minimum rapidfuzz score a candidate must reach; forwarded
            as ``score_cutoff``.

    Returns:
        The best candidate exactly as it is spelled in ``options``, or
        ``name`` unchanged when there are no candidates or none reaches
        ``threshold``.
    """
    if not options:
        return name  # nothing to compare against

    # rapidfuzz 3.x applies no preprocessing by default, so the comparison
    # would be case-sensitive. Callers pass str.title() output, which can
    # differ from the lookup spelling only in casing (e.g. "Murang'A" vs
    # "Murang'a"); lower-casing both sides keeps that from costing score.
    result = process.extractOne(
        name,
        options,
        processor=str.lower,
        score_cutoff=threshold,
    )
    return result[0] if result else name

# === Step 4: Parse rows based on indent level ===
# The ITEM label encodes the hierarchy through its leading spaces:
#    4 spaces  -> county
#    8 spaces  -> sub-county (needs a county above it)
#   12+ spaces -> handled as ward level (needs a county and a sub-county above it)
# Rows are walked in sheet order; current_county / current_subcounty carry the
# most recent parents down to the rows beneath them.
# NOTE(review): every indent of 12 or more is labelled "Ward" — confirm the
# sheet has no deeper administrative levels that should be kept apart.
county_options = list(lookup_json.keys())  # loop-invariant, so built once
skipped_items = []  # (indent, name) of labelled rows that could not be placed

for _, row in df.iterrows():
    item = row["ITEM"]
    if pd.isna(item):
        continue  # blank spreadsheet line; it never matched a level anyway

    raw = str(item).rstrip()
    # NOTE(review): str.title() also capitalises the letter after an apostrophe
    # ("MURANG'A" -> "Murang'A"); the fuzzy match is relied on to absorb that.
    name = raw.strip().title()

    if name.upper() == "KENYA":
        continue  # Skip national row

    indent = len(raw) - len(raw.lstrip())
    data = row.drop("ITEM").to_dict()

    if indent == 4:
        # County level: a new county resets the sub-county context.
        current_county = match_name(name, county_options)
        current_subcounty = None
        data.update({
            "County": current_county,
            "SubCounty": None,
            "Ward": None
        })

    elif indent == 8 and current_county:
        # SubCounty level
        subcounty_opts = list(lookup_json.get(current_county, {}).keys())
        current_subcounty = match_name(name, subcounty_opts)
        data.update({
            "County": current_county,
            "SubCounty": current_subcounty,
            "Ward": None
        })

    elif indent >= 12 and current_county and current_subcounty:
        # Ward level
        ward_opts = lookup_json.get(current_county, {}).get(current_subcounty, [])
        ward_name = match_name(name, ward_opts)
        data.update({
            "County": current_county,
            "SubCounty": current_subcounty,
            "Ward": ward_name
        })

    else:
        # Unexpected indent, or a child row with no parent above it. These rows
        # are still left out of the result, but no longer silently.
        skipped_items.append((indent, name))
        continue

    rows.append(data)

if skipped_items:
    print(f"⚠️ Skipped {len(skipped_items)} row(s) with an unexpected indent or missing parent, e.g. {skipped_items[:5]}")

# === Step 5: Create DataFrame ===
# Without this guard an empty parse surfaces as an opaque KeyError from the
# "SubCounty" lookup below, because an empty frame has no such column.
if not rows:
    raise ValueError("❌ No county, subcounty, or ward rows were parsed; nothing to save.")

df_out = pd.DataFrame(rows)

# === Step 6: Split and save by level ===
# Define standard column order: hierarchy columns first, census measures after.
order = ["County", "SubCounty", "Ward"]
column_order = order + [col for col in df_out.columns if col not in order]

# Split levels by which hierarchy columns are filled in:
#   county     -> SubCounty and Ward both empty
#   sub-county -> SubCounty set, Ward empty
#   ward       -> Ward set
no_subcounty = df_out["SubCounty"].isna()
no_ward = df_out["Ward"].isna()

df_county = df_out.loc[no_subcounty & no_ward, column_order]
df_subcounty = df_out.loc[~no_subcounty & no_ward, column_order]
df_ward = df_out.loc[~no_ward, column_order]

# Save files
level_files = {
    "KNBS_2019_county_level.xlsx": df_county,
    "KNBS_2019_subcounty_level.xlsx": df_subcounty,
    "KNBS_2019_ward_level.xlsx": df_ward,
}
for filename, level_frame in level_files.items():
    level_frame.to_excel(os.path.join(output_dir, filename), index=False)

# === Step 7: Print Summary ===
print(f"✅ Parsed {len(df_out)} total rows:")
print(f"   🏛️ County level: {len(df_county)} rows → KNBS_2019_county_level.xlsx")
print(f"   🏙️ SubCounty level: {len(df_subcounty)} rows → KNBS_2019_subcounty_level.xlsx")
print(f"   🌍 Ward level: {len(df_ward)} rows → KNBS_2019_ward_level.xlsx")
print(f"📁 All files saved to: {output_dir}")


✅ Parsed 14168 total rows:
   🏛️ County level: 47 rows → KNBS_2019_county_level.xlsx
   🏙️ SubCounty level: 349 rows → KNBS_2019_subcounty_level.xlsx
   🌍 Ward level: 13772 rows → KNBS_2019_ward_level.xlsx
📁 All files saved to: c:\Users\Rono\Downloads\AAAA Hackathon\Datasets\KNBS_2019_Census_Downloads\extracted_levels
