In [1]:
import numpy as np
import pandas as pd

# --- 1) Read the Data ---
# No dtype is passed to read_csv, so pandas infers every column type from the file.
census_df = pd.read_csv("census_data.csv", sep=",")

# Count distinct ids in the first column (missing values are not counted).
first_column = census_df.iloc[:, 0]
n_distinct = first_column.nunique(dropna=True)
print("Number of distinct comune ids:", n_distinct)
print("In total, the population for Oberbayern, Niederbayern and Schwaben is:", census_df['weight'].sum())

# Select the Munich rows by id prefix.
# NOTE(review): the prefix "9162" has no leading zero, while the id written to
# the result table is '091620000000' - presumably the ids are parsed as
# integers here; confirm before forcing a string dtype on read.
commune_id_text = census_df["commune_id"].astype(str)
mask_munich = commune_id_text.str.startswith("9162")
munich_rows = census_df.loc[mask_munich]

# min_count=1 makes the sum NaN (instead of 0) when no row matches the prefix.
total_weight = munich_rows["weight"].sum(min_count=1)
print("There are ", total_weight, " people in Munich")

# Inputs for the following cells: Munich-only rows, the sorted distinct age
# classes found in them, and the 2045 projection (file read without a header row).
sex_weights_df = census_df[mask_munich]
age_classes_target_classes = sorted(munich_rows["age_class"].unique())
population_2045 = pd.read_csv("population_munich_2045.csv", sep=",", header=None)

Number of distinct comune ids: 1098
In total, the population for Oberbayern, Niederbayern and Schwaben is: 8021702
There are  1512491  people in Munich


In [2]:
# ----------------------------
# REQUIRED INPUTS YOU ALREADY HAVE
# population_2045 : Series or DataFrame with totals per age-group start (0,3,6,10,16,19,25,40,65,75)
# age_classes_target_classes : list of ints, e.g. [0,3,6,10,15,18,20,25,30,40,50,65,75]
# sex_weights_df : DataFrame with columns ['commune_id','sex','age_class','weight'] for Munich only
# ----------------------------

# ----- CONFIG -----
# Finite upper age used to close the open-ended last class (75+).
CAP_75PLUS = 99

# (first age, last age) of every projection age group, both inclusive.
# NOTE(review): the 75+ group ends at 76 here, not at CAP_75PLUS - confirm
# that spreading the 75+ total over ages 75-76 only is intended.
_AGE_GROUP_BOUNDS = [
    (0, 2),
    (3, 5),
    (6, 9),
    (10, 15),
    (16, 18),
    (19, 24),
    (25, 39),
    (40, 64),
    (65, 74),
    (75, 76),
]
# Lookup: group start age -> inclusive group end age.
INTERVAL_ENDS = dict(_AGE_GROUP_BOUNDS)

# ----- 1) Normalize population_2045 to ['age_start','total_2045'] -----
def normalize_groups(obj):
    """Return a two-column frame ['age_start', 'total_2045'] built from ``obj``.

    Parameters
    ----------
    obj : pandas.Series or pandas.DataFrame
        * Series: the index supplies ``age_start``, the values ``total_2045``.
        * DataFrame with exactly one column: same as a Series, using the
          frame's index and its only column.
        * Any other DataFrame: the first two columns are taken, in order, as
          ``age_start`` and ``total_2045``.

    Returns
    -------
    pandas.DataFrame
        Columns ``age_start`` (cast to int) and ``total_2045`` (cast to float).

    Raises
    ------
    TypeError
        If ``obj`` is neither a Series nor a DataFrame.
    """
    if not isinstance(obj, (pd.Series, pd.DataFrame)):
        raise TypeError("population_2045 must be a pandas Series or DataFrame.")

    is_wide_frame = isinstance(obj, pd.DataFrame) and obj.shape[1] != 1
    if is_wide_frame:
        # Only the first two columns are used; copy so the caller's frame is untouched.
        names = list(obj.columns)
        table = obj[[names[0], names[1]]].copy()
    else:
        # Series, or the single column of a one-column frame: the index carries the ages.
        values = obj if isinstance(obj, pd.Series) else obj.iloc[:, 0]
        table = values.rename("total_2045").reset_index()

    table.columns = ["age_start", "total_2045"]
    return table.astype({"age_start": int, "total_2045": float})

# Bring the 2045 projection into the ['age_start', 'total_2045'] layout.
groups = normalize_groups(population_2045)

# Attach interval ends: look up the inclusive last age of each group by its start age.
groups["age_end"] = groups["age_start"].map(INTERVAL_ENDS)

# A start age without an entry in INTERVAL_ENDS maps to NaN; stop and list those starts.
without_end = groups["age_end"].isna()
if without_end.any():
    missing = groups.loc[without_end, "age_start"].tolist()
    raise ValueError(f"Missing INTERVAL_ENDS for starts: {missing}")

In [3]:
# --- inputs you already have: -----------------------------------------------
# groups : DataFrame with columns ['age_start','age_end','total_2045']  (or created via INTERVAL_ENDS mapping)
# age_classes_target_classes : list[int] like [0,3,6,10,15,18,20,25,30,40,50,65,75]
# CAP_75PLUS : int (e.g., 99)
# sex_weights_df : DataFrame with ['sex','age_class','weight'] for Munich only
# ---------------------------------------------------------------------------

# ----- 2) Per-age totals using '/(width)' rule -----
# Every single age inside a group receives an equal share of the group total:
# total / (number of ages in the group, both ends inclusive).
pop_per_age = {}
for start, end, amount in zip(groups["age_start"], groups["age_end"], groups["total_2045"]):
    lo, hi, total = int(start), int(end), float(amount)
    width = hi - lo + 1
    if width <= 0:
        raise ValueError(f"Invalid bin: [{lo},{hi}] (width={width})")
    # Later groups overwrite earlier ones on overlapping ages; the check below catches that.
    pop_per_age.update(dict.fromkeys(range(lo, hi + 1), total / width))

# sanity: bins reconcile
# Summing the per-age values of a group must give back the group total.
for start, end, amount in zip(groups["age_start"], groups["age_end"], groups["total_2045"]):
    lo, hi, total = int(start), int(end), float(amount)
    recon = sum(pop_per_age[a] for a in range(lo, hi + 1))
    if abs(recon - total) > 1e-6:
        raise AssertionError(f"Bin [{lo},{hi}] does not reconcile: {recon} vs {total}")

# 2) Build contiguous class intervals from target ages
# Each class runs from its own start age to the age just before the next class
# start; the last class is closed at CAP_75PLUS.
targets_sorted = sorted(set(age_classes_target_classes))
if not targets_sorted:
    # Fail with a clear message instead of a KeyError from sort_values("age") below.
    raise ValueError("age_classes_target_classes is empty; no age classes to build.")
intervals = []
for i, a_start in enumerate(targets_sorted):
    a_end = targets_sorted[i + 1] - 1 if i < len(targets_sorted) - 1 else CAP_75PLUS
    intervals.append((a_start, a_end))

# 3) Sum per-age totals within each class
rows = []
class_totals_exact = []  # unrounded class totals, kept only for the coverage check
for lo, hi in intervals:
    # Ages without a per-age value (beyond the last projection group) contribute nothing.
    ages_in_range = [a for a in range(lo, hi + 1) if a in pop_per_age]
    total = sum(pop_per_age[a] for a in ages_in_range)
    class_totals_exact.append(total)
    rows.append({"age": lo, "age_end": hi, "total_2045": int(round(total))})

targets_total = pd.DataFrame(rows).sort_values("age").reset_index(drop=True)

# integrity checks
# Explicit raises (not bare asserts) so the checks also run under `python -O`.
for i in range(1, len(targets_total)):
    if targets_total.loc[i, "age"] != targets_total.loc[i - 1, "age_end"] + 1:
        raise AssertionError("Class gaps/overlap detected.")

# a) Coverage: before rounding, the classes must account for the whole
#    per-age population (only float noise is tolerated).
per_age_exact = sum(pop_per_age.values())
classes_exact = sum(class_totals_exact)
if abs(classes_exact - per_age_exact) > 1e-9 * max(1.0, abs(per_age_exact)):
    raise AssertionError(
        f"Totals mismatch before rounding: classes={classes_exact}, per_age_sum={per_age_exact}"
    )

# b) Rounding drift: every class is rounded on its own (error <= 0.5 each) and
#    the grand total is rounded once more, so the integer sums may legitimately
#    differ by up to (number of classes + 1) / 2 - not just by 1.
sum_classes = targets_total["total_2045"].sum()
sum_per_age = int(round(per_age_exact))
max_rounding_drift = (len(targets_total) + 1) / 2
if abs(sum_classes - sum_per_age) > max_rounding_drift:
    raise AssertionError(f"Totals mismatch: classes={sum_classes}, per_age_sum={sum_per_age}")

# 4) Build sex_dis_per_age (female share per single age) from sex_weights_df
#    a) compute female share at ANCHOR ages (0,3,6,...)
# Normalise the sex labels (lower-case, surrounding whitespace removed) first.
sex_labels_clean = sex_weights_df[["sex", "age_class", "weight"]].assign(
    sex=lambda d: d["sex"].str.lower().str.strip()
)
# Total weight per (age_class, sex), then one row per age_class and one column per sex.
weight_by_class_sex = sex_labels_clean.groupby(["age_class", "sex"], as_index=False)["weight"].sum()
shares_anchor = weight_by_class_sex.pivot(index="age_class", columns="sex", values="weight").fillna(0.0)

# A sex label that never occurs gets an all-zero column so the sums below work.
for col in ("female", "male"):
    if col not in shares_anchor.columns:
        shares_anchor[col] = 0.0

shares_anchor["total_w"] = shares_anchor["female"] + shares_anchor["male"]
# Age classes whose female + male weight is not positive cannot yield a share.
bad = shares_anchor.index[shares_anchor["total_w"] <= 0].tolist()
if bad:
    raise ValueError(f"No weight for anchor age_class(es): {bad}")

shares_anchor["female_share"] = shares_anchor["female"] / shares_anchor["total_w"]
shares_anchor = shares_anchor.reset_index().rename(columns={"age_class": "age"})
anchor_ages = sorted(shares_anchor["age"].tolist())

#    b) expand to every age by piece-wise constant fill between anchors
# Each anchor's share applies up to the age before the next anchor;
# the last anchor's share applies up to CAP_75PLUS.
share_at_anchor = {
    age: float(share)
    for age, share in zip(shares_anchor["age"].tolist(), shares_anchor["female_share"].tolist())
}
last_age_per_anchor = [next_anchor - 1 for next_anchor in anchor_ages[1:]] + [CAP_75PLUS]

sex_dis_per_age = {}
for a0, a1 in zip(anchor_ages, last_age_per_anchor):
    fshare = share_at_anchor[a0]
    sex_dis_per_age.update(dict.fromkeys(range(a0, a1 + 1), fshare))

# ensure coverage for all ages used in pop_per_age
missing = [a for a in pop_per_age if a not in sex_dis_per_age]
if missing:
    raise AssertionError(f"sex_dis_per_age missing ages: {missing[:10]} ... (total missing {len(missing)})")

# 5) Add predicted_female_share_2045 to targets_total:
#    population-weighted average over ages in the class
def _class_female_share(lo, hi):
    """Population-weighted mean female share over ages lo..hi (0.0 if the class is empty)."""
    ages_in_range = [a for a in range(lo, hi + 1) if a in pop_per_age]
    num = sum(pop_per_age[a] * sex_dis_per_age[a] for a in ages_in_range)
    den = sum(pop_per_age[a] for a in ages_in_range)
    if den > 0:
        return num / den
    return 0.0


pred_shares = [
    _class_female_share(int(class_start), int(class_end))
    for class_start, class_end in zip(targets_total["age"], targets_total["age_end"])
]
targets_total["predicted_female_share_2045"] = pred_shares

# Optional: compute female/male counts consistent with shares & totals
# Females are rounded to whole persons; males take the remainder of the class total.
female_counts = targets_total["total_2045"] * targets_total["predicted_female_share_2045"]
targets_total["female_2045"] = female_counts.round().astype(int)
targets_total["male_2045"] = targets_total["total_2045"] - targets_total["female_2045"]

# final integrity
counts_add_up = targets_total["female_2045"] + targets_total["male_2045"] == targets_total["total_2045"]
assert counts_add_up.all(), "Female + male != total in some rows"

# Result: targets_total now has:
# ['age','age_end','total_2045','predicted_female_share_2045','female_2045','male_2045']
# The 'age' column is renamed to 'age_start' before the table is written to disk.
targets_total = targets_total.rename(columns={"age": "age_start"})
targets_total.to_csv("processed_pop_munich_2045.csv", header=True, index=False)

In [None]:
# Add the commune id as the first column (kept as a string so the leading zero survives).
# DataFrame.insert raises ValueError if the column already exists, so guard the
# call to keep this cell safe to re-run.
if "commune_id" not in targets_total.columns:
    targets_total.insert(0, 'commune_id', '091620000000')  # add as first column

In [6]:
# Display the class table as the cell output (last expression of the cell).
targets_total

Unnamed: 0,commune_id,age_start,age_end,total_2045,predicted_female_share_2045,female_2045,male_2045
0,91620000000,0,2,56184,0.487231,27375,28809
1,91620000000,3,5,50839,0.494434,25137,25702
2,91620000000,6,9,62569,0.490813,30710,31859
3,91620000000,10,14,74144,0.488693,36234,37910
4,91620000000,15,17,44982,0.483929,21768,23214
5,91620000000,18,19,38256,0.488594,18692,19564
6,91620000000,20,24,115899,0.493614,57209,58690
7,91620000000,25,29,169305,0.502164,85019,84286
8,91620000000,30,39,338610,0.498094,168660,169950
9,91620000000,40,49,228748,0.512241,117174,111574


In [9]:
# Show the Python type of the commune_id value stored under index label 0.
type(targets_total["commune_id"][0])

str