In [1]:
from typing import Callable, Any

import arff as ap

In [3]:
census, _ = ap.parse_arff_file("adult_train.arff")

census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [7]:
def sum_attr_pred(func: Callable[[Any], bool], attr: str) -> int:
    return sum(1 if func(x) else 0 for x in census[attr])

In [8]:
for attr in census.columns:
    print(f"# Of Instances with {attr} = '?': {sum_attr_pred(lambda x: x is None, attr)}")

# Of Instances with age = '?': 0
# Of Instances with workclass = '?': 2503
# Of Instances with fnlwgt = '?': 0
# Of Instances with education = '?': 0
# Of Instances with education-num = '?': 0
# Of Instances with marital-status = '?': 0
# Of Instances with occupation = '?': 2512
# Of Instances with relationship = '?': 0
# Of Instances with race = '?': 0
# Of Instances with sex = '?': 0
# Of Instances with capital-gain = '?': 0
# Of Instances with capital-loss = '?': 0
# Of Instances with hours-per-week = '?': 0
# Of Instances with native-country = '?': 766
# Of Instances with class = '?': 0


In [12]:
from statistics import mean, median

n_instances = len(census)
n_instances_capital_loss_zero = sum_attr_pred(lambda x: x == 0, "capital-loss")

print(f"# Of Instances with capital-loss = 0: {n_instances_capital_loss_zero} ({n_instances_capital_loss_zero / n_instances})")
print(f"Average capital-loss: {mean(census['capital-loss'])}")
print(f"Median capital-loss: {median(census['capital-loss'])}")

# Of Instances with capital-loss = 0: 41918 (0.9535920651531007)
Average capital-loss: 87.05452932344511
Median capital-loss: 0.0


In [13]:
n_instances_capital_gain_gt_5k = 0
n_instances_capital_gain_gt_5k_target = 0

for _, row in census.iterrows():
    if row['capital-gain'] > 5000:
        n_instances_capital_gain_gt_5k += 1
        if row['class'] == '>50K':
            n_instances_capital_gain_gt_5k_target += 1

print(f"# Of Instances with capital-gain > 5000: {n_instances_capital_gain_gt_5k} (Total perc: {n_instances_capital_gain_gt_5k / n_instances})")
print(f"# Of Instances with capital-gain > 5000 and class = >50K: {n_instances_capital_gain_gt_5k_target} ({n_instances_capital_gain_gt_5k_target / n_instances_capital_gain_gt_5k} of instances with capital-gain > 5000)")

# Of Instances with capital-gain > 5000: 2232 (Total perc: 0.05077574047954866)
# Of Instances with capital-gain > 5000 and class = >50K: 2010 (0.9005376344086021 of instances with capital-gain > 5000)


In [5]:
cnt = 0
total_rows = 0

for idx, row in census.iterrows():
    if any(x is None for x in row):
        cnt += 1
    total_rows += 1

print(f"{cnt} of total {total_rows} rows contain atleast one unknown (?) Attribute: {100 * (cnt / total_rows)}%")

3236 of total 43958 rows contain atleast one unknown (?) Attribute: 7.361572410027755%


In [12]:
def print_csv_line(file, data):
    file.write(", ".join(map(lambda x: '?' if x is None else str(x), data)))
    file.write('\n')

with open("processed.csv", "w+") as f:
    for idx, row in census.iterrows():
        print_csv_line(f, [
            row['age'],
            row['workclass'],
            row['education-num'],
            1 if row['marital-status'] in ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'] else 0,
            row['occupation'],
            row['relationship'],
            row['race'],
            row['sex'],
            row['capital-gain'],
            row['capital-loss'],
            row['hours-per-week'],
            1 if row['native-country'] == 'United-States' else 0,
            row['class']
        ])

In [14]:
usa_gt_50k = 0
usa_lt_50k = 0
for_gt_50k = 0
for_lt_50k = 0

for idx, row in census.iterrows():
    if row['native-country'] == 'United-States':
        if row['class'] == '>50K':
            usa_gt_50k += 1
        else:
            usa_lt_50k += 1
    else:
        if row['class'] == '>50K':
            for_gt_50k += 1
        else:
            for_lt_50k += 1

print(f"{100 * (usa_gt_50k / (usa_gt_50k + usa_lt_50k))}% of US. Citizens earn more than 50k / year")
print(f"{100 * (for_gt_50k / (for_gt_50k + for_lt_50k))}% of foreign Citizens earn more than 50k / year")

24.413597446679162% of US. Citizens earn more than 50k / year
19.754464285714285% of foreign Citizens earn more than 50k / year


In [None]:
race_data = {}
white = 0
total = 0

for _, row in census.iterrows():
    race = row['race']

    if race == 'White':
        white += row['fnlwgt']
    total += row['fnlwgt']

    if not race in race_data:
        race_data[race] = {
            '>50K': 0,
            '<=50K': 0,
            'total': 0
        }

    race_data[race][row['class']] += 1
    race_data[race]['total'] += 1

print(f"{(white / total) * 100}% white")

for race in race_data:
    perc = race_data[race]['>50K'] / race_data[race]['total']

    print(f"{race}: {race_data[race]['>50K']} out of {race_data[race]['total']} earn more than 50K / year ({perc})")

In [None]:
from preprocess import median

lt_50k_ages = []
gt_50k_ages = []

for _, row in census.iterrows():
    if row['class'] == '>50K':
        gt_50k_ages.append(row['age'])
    else:
        lt_50k_ages.append(row['age'])

lt_50k_ages = sorted(lt_50k_ages)
gt_50k_ages = sorted(gt_50k_ages)

print(f"Median Age of <=50K: {median(lt_50k_ages, lambda x: x)}")
print(f"Median Age of >50K: {median(gt_50k_ages, lambda x: x)}")

In [6]:
from preprocess import median

lt_50k_ages = []
gt_50k_ages = []

for _, row in census.iterrows():
    if row['class'] == '>50K':
        gt_50k_ages.append(row['age'])
    else:
        lt_50k_ages.append(row['age'])

lt_50k_ages = sorted(lt_50k_ages)
gt_50k_ages = sorted(gt_50k_ages)

print(f"Median Age of <=50K: {median(lt_50k_ages, lambda x: x)}")
print(f"Median Age of >50K: {median(gt_50k_ages, lambda x: x)}")

85.54984303198508% white
Black: 510 out of 4198 earn more than 50K / year (0.12148642210576464)
White: 9553 out of 37606 earn more than 50K / year (0.2540286124554592)
Asian-Pac-Islander: 367 out of 1370 earn more than 50K / year (0.2678832116788321)
Other: 44 out of 364 earn more than 50K / year (0.12087912087912088)
Amer-Indian-Eskimo: 49 out of 420 earn more than 50K / year (0.11666666666666667)


In [8]:
from preprocess import median

lt_50k_ages = []
gt_50k_ages = []

for _, row in census.iterrows():
    if row['class'] == '>50K':
        gt_50k_ages.append(row['age'])
    else:
        lt_50k_ages.append(row['age'])

lt_50k_ages = sorted(lt_50k_ages)
gt_50k_ages = sorted(gt_50k_ages)

print(f"Median Age of <=50K: {median(lt_50k_ages, lambda x: x)}")
print(f"Median Age of >50K: {median(gt_50k_ages, lambda x: x)}")

Median Age of <=50K: 34.0
Median Age of >50K: 44.0
