# Data analysis for the Adult data

In [1]:
from pathlib import Path

import pandas as pd
from helpers.utils import bin_hours_per_week, bar_chart

In [2]:
artifacts_dir = Path("artifacts")
data_dir = artifacts_dir / "data" / "adult"

train = pd.read_csv(data_dir / "processed" / "train.csv")
val = pd.read_csv(data_dir / "processed" / "val.csv")
test = pd.read_csv(data_dir / "processed" / "test.csv")

train_oh = pd.read_csv(data_dir / "processed" / "train-one-hot.csv")
val_oh = pd.read_csv(data_dir / "processed" / "val-one-hot.csv")
test_oh = pd.read_csv(data_dir / "processed" / "test-one-hot.csv")

In [3]:
train_oh

Unnamed: 0,age,sex,capital_gain,capital_loss,hours_per_week,salary,workclass_federal_gov,workclass_local_gov,workclass_private,workclass_self_emp_inc,...,marital_status_divorced,marital_status_married_af_spouse,marital_status_married_civ_spouse,marital_status_married_spouse_absent,marital_status_never_married,marital_status_separated,marital_status_widowed,native_country_mexico,native_country_other,native_country_united_states
0,1.108936,1,-0.147741,-0.218133,2.416833,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0.805386,1,-0.147741,-0.218133,2.416833,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,-0.788255,1,-0.147741,-0.218133,-0.079269,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
3,1.640150,0,-0.147741,-0.218133,-0.079269,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,1.108936,1,-0.147741,-0.218133,-0.079269,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24124,0.122397,1,-0.147741,-0.218133,-0.495286,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
24125,0.198285,1,0.263907,-0.218133,-0.079269,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
24126,-0.105266,1,-0.147741,-0.218133,4.829731,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
24127,1.336599,0,-0.147741,-0.218133,-0.079269,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [4]:
salary_by_sex = train[["sex", "salary"]].groupby("sex").mean()

fig_salary_by_sex = bar_chart(
    x=["Female", "Male"],
    y=salary_by_sex.salary,
    title="Proportion of high earners by sex",
    xlabel="Sex",
    ylabel="Proportion of high earners",
)
fig_salary_by_sex

In [5]:
salary_by_race = train[["race", "salary"]].groupby("race").mean()

fig_salary_by_race = bar_chart(
    x=[
        "American Indian / Eskimo",
        "Asian / Pacific Islander",
        "Black",
        "Other",
        "White",
    ],
    y=salary_by_race.salary,
    title="Proportion of high earners by race",
    xlabel="Race",
    ylabel="Proportion of high earners",
)
fig_salary_by_race

In [6]:
salary_by_hours_per_week = (
    val.assign(hpw=val.hours_per_week.map(bin_hours_per_week))
    .loc[:, ["hpw", "salary"]]
    .groupby("hpw")
    .aggregate(["mean", "count"])
    .reset_index()
)

fig_salary_by_hours_per_week = bar_chart(
    x=salary_by_hours_per_week.hpw,
    y=salary_by_hours_per_week["salary"]["mean"],
    title="Proportion of high earners by hours worked per week",
    xlabel="Hours worked per week",
    ylabel="Proportion of high earners",
    xticks={
        "tickvals": [0, 1, 2, 3],
        "ticktext": ["0-30", "30-40", "40-50", "50+"],
    },
)
fig_salary_by_hours_per_week