In [76]:
from pathlib import Path

import polars as pl

In [77]:
# Read a fixed subset of columns from the Ethiopia CSV into `data`.
raw_csv = Path("~/Data/ethiopia/ethiopia_data.csv")
keep_columns = [
    "age",
    "sex",
    "dist_road",
    "dist_market",
    "dist_border",
    "dist_popcenter",
    "dist_admhq",
    "denomination",
    "total_cons_ann",
    "nom_totcons_aeq",
    "walls_material",
    "roof_material",
    "floor_material",
]
data = pl.read_csv(raw_csv, columns=keep_columns)
# Note: the "hh" (presumably household -- TODO confirm) structure of the
# data is ignored here.

# Impute numeric gaps: every null in a Float64 column is replaced by that
# column's mean. `means` maps column name -> mean and is kept for inspection.
means = {
    series.name: series.mean()
    for series in data.select(pl.col(pl.Float64)).get_columns()
}
data = data.with_columns(
    [pl.col(name).fill_null(fill) for name, fill in means.items()]
)

# Fill missing categorical (string) values with the column mode.
#
# `Series.mode()` may return several values when there is a tie, and taking
# element 0 of that result is then arbitrary. To keep the notebook
# reproducible across reruns:
#   * nulls are dropped first so a null can never be chosen as the fill value
#     (`fill_null(None)` would raise);
#   * the candidate modes are sorted, so ties resolve to the lexicographically
#     smallest value every time.
# Columns with no non-null values have no mode to fill with and are skipped
# (their nulls are left in place).
modes = {
    col: data[col].drop_nulls().mode().sort()[0]
    for col in data.select(pl.col(pl.Utf8)).columns
    if data[col].null_count() < data.height
}
data = data.with_columns(
    [pl.col(col).fill_null(fill) for col, fill in modes.items()]
)

# Preview the first rows of the imputed frame.
data.head()

dist_road,dist_market,dist_border,dist_popcenter,dist_admhq,age,sex,denomination,total_cons_ann,nom_totcons_aeq,walls_material,roof_material,floor_material
f64,f64,f64,f64,f64,str,str,str,f64,f64,str,str,str
7.7,162.3,82.9,0.4,0.0,"""65+""","""Female""","""urban""",226020.0,144884.625,"""Plastered hallow blocks""","""Corrugated iron sheet""","""Plastic tiles"""
7.7,162.3,82.9,0.4,0.0,"""31-50""","""Female""","""urban""",226020.0,144884.625,"""Plastered hallow blocks""","""Corrugated iron sheet""","""Plastic tiles"""
7.7,162.3,82.9,0.4,0.0,"""0-17""","""Female""","""urban""",226020.0,144884.625,"""Plastered hallow blocks""","""Corrugated iron sheet""","""Plastic tiles"""
7.7,162.3,82.9,0.4,0.0,"""31-50""","""Female""","""urban""",248090.0,62967.003906,"""Wood and mud""","""Corrugated iron sheet""","""Plastic tiles"""
7.7,162.3,82.9,0.4,0.0,"""0-17""","""Female""","""urban""",248090.0,62967.003906,"""Wood and mud""","""Corrugated iron sheet""","""Plastic tiles"""


In [9]:
# Write `data` to CSV, creating the destination directory first if needed.
write_path = Path("../data/ethiopia/ethiopia_data_cleaned.csv")
write_path.parent.mkdir(parents=True, exist_ok=True)
# Reuse `write_path` (instead of repeating the literal) so the directory that
# was created and the file that is written can never drift apart.
data.write_csv(write_path)

In [78]:
# Display each column's name and dtype for the current `data` frame.
data.schema

Schema([('dist_road', Float64),
        ('dist_market', Float64),
        ('dist_border', Float64),
        ('dist_popcenter', Float64),
        ('dist_admhq', Float64),
        ('age', String),
        ('sex', String),
        ('denomination', String),
        ('total_cons_ann', Float64),
        ('nom_totcons_aeq', Float64),
        ('walls_material', String),
        ('roof_material', String),
        ('floor_material', String)])

In [79]:
# Per-column summary statistics (count, null_count, mean, std, min,
# quartiles, max); the null_count row is a quick check on the imputation.
data.describe()

statistic,dist_road,dist_market,dist_border,dist_popcenter,dist_admhq,age,sex,denomination,total_cons_ann,nom_totcons_aeq,walls_material,roof_material,floor_material
str,f64,f64,f64,f64,f64,str,str,str,f64,f64,str,str,str
"""count""",25374.0,25374.0,25374.0,25374.0,25374.0,"""25374""","""25374""","""25374""",25374.0,25374.0,"""25374""","""25374""","""25374"""
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0""","""0""","""0""",0.0,0.0,"""0""","""0""","""0"""
"""mean""",7.882257,64.39417,257.565847,30.543598,0.130145,,,,152359.935066,37146.14977,,,
"""std""",12.41916,73.426574,131.77146,37.724576,0.137734,,,,131843.995774,34137.510762,,,
"""min""",0.0,0.4,3.2,0.4,0.0,"""0-17""","""Female""","""rural""",2400.666504,1854.054077,"""Bricks""","""Asbestos""","""Bamboo/Reed"""
"""25%""",0.5,7.4,148.8,5.4,0.0,,,,75397.8125,17490.724609,,,
"""50%""",1.9,39.3,250.3,17.5,0.1,,,,119608.0,27666.363281,,,
"""75%""",9.2,89.6,373.9,38.2,0.2,,,,188906.734375,45183.707031,,,
"""max""",72.8,448.7,496.3,285.1,0.6,"""65+""","""Male""","""urban""",2798544.0,748273.8125,"""Wood and thatch/Wood only""","""Wood and mud""","""Wood planks"""
