In [4]:
import unicodedata

def normalize_chars(text):
    """Return ``text`` reduced to plain ASCII.

    The input is first decomposed with Unicode NFKD, which splits an accented
    letter into its base letter followed by combining marks. Every character
    outside the ASCII range is then discarded, so the combining marks vanish
    while the base letters stay. Characters that have no ASCII decomposition
    are dropped entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Keep only code points in the 7-bit ASCII range.
    return ''.join(char for char in decomposed if ord(char) < 128)

In [14]:
def clean_up(text):
    """Strip a fixed set of parenthetical qualifiers from ``text``.

    Each qualifier is matched together with its leading space and removed
    wherever it occurs; text that contains none of them is returned unchanged.
    """
    qualifiers = (
        " (Islamic Republic of)",
        " (under UNSC res. 1244)",
        " (Plurinational State of)",
        " (Fed. States of)",
        " (Bolivarian Republic of)",
        " (Malvinas)",
        " (French part)",
        " (Dutch part)",
    )
    for qualifier in qualifiers:
        text = text.replace(qualifier, "")
    return text

In [15]:
import pandas as pd

# Load the raw CSV and convert its headers to lower snake_case.
df = pd.read_csv("life_expectancy.csv")
df = df.rename(columns=lambda column: column.lower().replace(" ", "_"))

# Country names: strip accents / non-ASCII characters first, then drop the
# parenthetical qualifiers.
df.loc[:, "country"] = df.country.apply(lambda name: clean_up(normalize_chars(name)))
df.head()

Unnamed: 0,country,birth_year,male_expectancy,female_expectancy
0,Burundi,1950,39.6,42.2
1,Burundi,1951,39.8,42.6
2,Burundi,1952,40.0,42.9
3,Burundi,1953,40.3,43.1
4,Burundi,1954,40.5,43.4


In [16]:
# Display the (rows, columns) dimensions of the cleaned frame.
df.shape

(19388, 4)

In [23]:
# Export the cleaned life-expectancy frame as Lua constants:
#   MIN_BIRTH_YEAR / MAX_BIRTH_YEAR - bounds of the birth_year column
#   COUNTRIES                       - country names in order of first appearance
#   LIFE_EXPECTANCY[country][year]  - {["Male"] = ..., ["Female"] = ...}


def _lua_quote(text):
    """Return ``text`` as a double-quoted Lua string literal.

    Backslashes and double quotes are escaped so a name containing either one
    cannot end the literal early and break the generated file.
    """
    return '"' + str(text).replace("\\", "\\\\").replace('"', '\\"') + '"'


# Extract min and max birth years
birth_min = df.birth_year.min()
birth_max = df.birth_year.max()

# Get unique countries for the COUNTRIES table
countries = df.country.unique().tolist()

# Collect the output as a list of fragments and join once at the end instead
# of growing one string with += inside the nested loops.
parts = [
    f"MIN_BIRTH_YEAR = {birth_min}\n",
    f"MAX_BIRTH_YEAR = {birth_max}\n",
    "COUNTRIES = {\n",
    ",\n".join(f"  {_lua_quote(country)}" for country in countries),
    "\n}\n",
    "LIFE_EXPECTANCY = {\n",
]

# One pass over the frame: groupby(sort=False) yields each country's rows in
# order of first appearance (the same order as `countries`), replacing a
# full boolean-mask scan of the frame per country.
for country, country_data in df.groupby("country", sort=False):
    parts.append(f"  [{_lua_quote(country)}] = {{\n")

    # Zip only the three needed columns rather than materialising a Series
    # per row with iterrows().
    rows = zip(
        country_data.birth_year,
        country_data.male_expectancy,
        country_data.female_expectancy,
    )
    for birth_year, male_exp, female_exp in rows:
        parts.append(
            f"    [{int(birth_year)}] = {{\n"
            f'      ["Male"] = {male_exp},\n'
            f'      ["Female"] = {female_exp}\n'
            "    },\n"
        )

    parts.append("  },\n")

parts.append("}\n")
content = "".join(parts)

# Write to file; the encoding is explicit so the result does not depend on
# the platform default.
with open("../source/constants.lua", "w", encoding="utf-8") as file:
    file.write(content)