In [1]:
from typing import Callable, Any

import arff_parser as ap

In [2]:
census = ap.parse_arff_file("adult_train.arff")

census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [3]:
def sum_attr_pred(func: Callable[[Any], bool], attr: str) -> int:
    return sum(func(x) for x in census[attr])

In [4]:
for attr in census.columns:
    print(f"# Of Instances with {attr} = '?': {sum_attr_pred(lambda x: x is None, attr)}")

# Of Instances with age = '?': 0
# Of Instances with workclass = '?': 2503
# Of Instances with fnlwgt = '?': 0
# Of Instances with education = '?': 0
# Of Instances with education-num = '?': 0
# Of Instances with marital-status = '?': 0
# Of Instances with occupation = '?': 2512
# Of Instances with relationship = '?': 0
# Of Instances with race = '?': 0
# Of Instances with sex = '?': 0
# Of Instances with capital-gain = '?': 0
# Of Instances with capital-loss = '?': 0
# Of Instances with hours-per-week = '?': 0
# Of Instances with native-country = '?': 766
# Of Instances with class = '?': 0


In [5]:
cnt = 0
total_rows = 0

for idx, row in census.iterrows():
    if any(x is None for x in row):
        cnt += 1
    total_rows += 1

print(f"{cnt} of total {total_rows} rows contain atleast one unknown (?) Attribute: {100 * (cnt / total_rows)}%")

3236 of total 43958 rows contain atleast one unknown (?) Attribute: 7.361572410027755%


In [12]:
def print_csv_line(file, data):
    file.write(", ".join(map(lambda x: '?' if x is None else str(x), data)))
    file.write('\n')

with open("processed.csv", "w+") as f:
    for idx, row in census.iterrows():
        print_csv_line(f, [
            row['age'],
            row['workclass'],
            row['education-num'],
            1 if row['marital-status'] in ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'] else 0,
            row['occupation'],
            row['relationship'],
            row['race'],
            row['sex'],
            row['capital-gain'],
            row['capital-loss'],
            row['hours-per-week'],
            1 if row['native-country'] == 'United-States' else 0,
            row['class']
        ])

In [14]:
usa_gt_50k = 0
usa_lt_50k = 0
for_gt_50k = 0
for_lt_50k = 0

for idx, row in census.iterrows():
    if row['native-country'] == 'United-States':
        if row['class'] == '>50K':
            usa_gt_50k += 1
        else:
            usa_lt_50k += 1
    else:
        if row['class'] == '>50K':
            for_gt_50k += 1
        else:
            for_lt_50k += 1

print(f"{100 * (usa_gt_50k / (usa_gt_50k + usa_lt_50k))}% of US. Citizens earn more than 50k / year")
print(f"{100 * (for_gt_50k / (for_gt_50k + for_lt_50k))}% of foreign Citizens earn more than 50k / year")

24.413597446679162% of US. Citizens earn more than 50k / year
19.754464285714285% of foreign Citizens earn more than 50k / year
