In [1]:
import pandas as pd

# Step 1: Load the Excel file using the openpyxl engine
df = pd.read_excel(
    "Updated Population Data.xlsx",
    sheet_name="Bevölkerung historics_per BL",
)

year_rows = df[df['Altersjahre'].str.contains("31.12", na=False)].index

all_years = []

for i in range(len(year_rows)):
    year_row_idx = year_rows[i]
    start_idx = year_row_idx + 1
    end_idx = year_rows[i + 1] if i + 1 < len(year_rows) else df.shape[0]

    year = int(df.loc[year_row_idx, 'Altersjahre'][-4:])

    year_df = df.iloc[start_idx:end_idx].copy()
    year_df['Year'] = year

    # Separate the "Insgesamt" row (total) for this year
    total_row = year_df[year_df['Altersjahre'].str.contains("Insgesamt", na=False)]

    # Filter out total row for age processing
    age_rows = year_df[~year_df['Altersjahre'].str.contains("Insgesamt", na=False)]

    # Parse Age function
    def parse_age(val):
        if pd.isna(val):
            return None
        if 'unter 1 Jahr' in val:
            return 0
        if 'mehr' in val:  # "90 Jahre und mehr"
            return 90
        if 'Insgesamt' in val:
            return 'Total'
        try:
            # Extract leading digits for age
            age_num = int(''.join(filter(str.isdigit, val.split(' ')[0])))
            return age_num
        except:
            return None

    age_rows['Age'] = age_rows['Altersjahre'].apply(parse_age)
    age_rows = age_rows[age_rows['Age'].notnull()]

    # Melt to long format for ages
    age_long = age_rows.drop(columns=['Altersjahre']).melt(
        id_vars=['Year', 'Age'], var_name='State', value_name='Population'
    )

    # Melt total row to long format for totals
    total_long = total_row.drop(columns=['Altersjahre']).melt(
        id_vars=['Year'], var_name='State', value_name='Total_Population'
    )
    # Add Age column with 'Total' label
    total_long['Age'] = 'Total'

    # Calculate total population by year and age (excluding total rows)
    total_by_age_year = (
        age_long.groupby(['Year', 'Age'])['Population'].sum().reset_index()
    ).rename(columns={'Population': 'Total_Pop'})

    # Merge to calculate shares for age groups
    age_long = age_long.merge(total_by_age_year, on=['Year', 'Age'])
    age_long['State_Share'] = age_long['Population'] / age_long['Total_Pop']

    # Calculate total population by year (all ages) for all states
    total_by_year = total_long.groupby(['Year'])['Total_Population'].sum().reset_index()

    # Merge total_long with total_by_year to get total share per state/year
    total_long = total_long.merge(total_by_year, on='Year', suffixes=('', '_Total'))
    total_long['State_Share'] = total_long['Total_Population'] / total_long['Total_Population_Total']

    # Rename columns to match for concat
    total_long = total_long.rename(columns={'Total_Population': 'Population'})

    # Reorder columns for total_long to match age_long
    total_long = total_long[['Year', 'Age', 'State', 'Population', 'State_Share']]

    # Concatenate age_long and total_long
    combined_df = pd.concat([age_long[['Year', 'Age', 'State', 'Population', 'State_Share']], total_long], ignore_index=True)

    all_years.append(combined_df)

# Combine all years
final_df = pd.concat(all_years, ignore_index=True)

print(final_df.head(20))

    Year Age              State  Population  State_Share
0   2010   0  Baden-Württemberg     90868.0     0.133978
1   2010   1  Baden-Württemberg     90592.0     0.135088
2   2010   2  Baden-Württemberg     93067.0     0.134857
3   2010   3  Baden-Württemberg     94108.0     0.136154
4   2010   4  Baden-Württemberg     93277.0     0.137381
5   2010   5  Baden-Württemberg     95374.0     0.138123
6   2010   6  Baden-Württemberg     97832.0     0.138226
7   2010   7  Baden-Württemberg     98381.0     0.138841
8   2010   8  Baden-Württemberg    100629.0     0.139379
9   2010   9  Baden-Württemberg    103341.0     0.139742
10  2010  10  Baden-Württemberg    108675.0     0.140529
11  2010  11  Baden-Württemberg    110591.0     0.142626
12  2010  12  Baden-Württemberg    113667.0     0.143459
13  2010  13  Baden-Württemberg    118377.0     0.144694
14  2010  14  Baden-Württemberg    116614.0     0.144939
15  2010  15  Baden-Württemberg    114999.0     0.147233
16  2010  16  Baden-Württemberg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_rows['Age'] = age_rows['Altersjahre'].apply(parse_age)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_rows['Age'] = age_rows['Altersjahre'].apply(parse_age)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_rows['Age'] = age_rows['Altersjahre'].apply(parse_age)
A value is trying to be set 

In [4]:
def age_sort_key(age):
    return 999 if age == 'Total' else int(age)

final_df['Age_sort'] = final_df['Age'].apply(age_sort_key)
final_df = final_df.sort_values(['Year', 'Age_sort']).drop(columns='Age_sort')

# Pivot: wide format for population counts
population_df = final_df.pivot_table(
    index=['Year', 'Age'],
    columns='State',
    values='Population'
).reset_index()

# Pivot: wide format for state shares
state_share_df = final_df.pivot_table(
    index=['Year', 'Age'],
    columns='State',
    values='State_Share'
).reset_index()

In [5]:
print(state_share_df.head())

State  Year Age  Baden-Württemberg    Bayern    Berlin  Brandenburg    Bremen  \
0      2010   0           0.133978  0.156392  0.048909     0.028173  0.008134   
1      2010   1           0.135088  0.156870  0.047958     0.028475  0.008042   
2      2010   2           0.134857  0.156379  0.046163     0.028663  0.007839   
3      2010   3           0.136154  0.157333  0.044671     0.028463  0.007863   
4      2010   4           0.137381  0.157556  0.043026     0.028079  0.007862   

State   Hamburg    Hessen  Mecklenburg-Vorpommern  Niedersachsen  \
0      0.025192  0.076282                0.019647       0.093172   
1      0.024267  0.076124                0.019467       0.094011   
2      0.023483  0.075686                0.019068       0.095372   
3      0.023392  0.076661                0.018565       0.096080   
4      0.022606  0.075979                0.018527       0.097709   

State  Nordrhein-Westfalen  Rheinland-Pfalz  Saarland   Sachsen  \
0                 0.216653         0.

In [6]:
state_share_df.to_csv("cleaned_population_data_long.csv", index=False , encoding='utf-8-sig') 

In [None]:
import pandas as pd

# Load your Excel file
df = pd.read_excel('final_with_insgesamt_per_block.xlsx')  # Replace with your filename

# Define a function to convert the values
def convert_value(val):
    if val == "unter 1 Jahr":
        return 0
    elif val == "Insgesamt":
        return "Total"
    elif val == "90 Jahre und mehr":
        return 90
    else:
        # For entries like "1-Jährige", "2-Jährige", etc.
        # Extract the number part and convert to int
        # Remove "-Jährige"
        if isinstance(val, str) and val.endswith("-Jährige"):
            num_str = val.replace("-Jährige", "")
            try:
                return int(num_str)
            except ValueError:
                return val  # If conversion fails, keep original
        else:
            return val  # If format is unexpected, keep original

# Apply the function to the third column (index 2)
df.iloc[:, 2] = df.iloc[:, 2].apply(convert_value)



In [9]:
df.columns = ['Variant', 'Variant Description', 'Age'] + list(df.columns[3:])
# Save the updated dataframe back to Excel
df.to_excel('Forecast.xlsx', index=False)
