In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit,KFold
from sklearn.metrics import accuracy_score,fbeta_score,f1_score
import numpy as np

In [6]:
column_names = [
    "age", # continuous
    "workclass", # Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
    "fnlwgt", # continuous 
    "education", # Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
    "education-num", # continuous. 
    "marital-status", # Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
    "occupation", # Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
    "relationship", # Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried 
    "race", # White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black, 
    "sex", # Female, Male, 
    "capital-gain", # continuous, 
    "capital-loss", # continuous, 
    "hours-per-week", # continuous, 
    "native-country", # United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands 
    "target" # >50K, <=50K
]
df = pd.read_csv(filepath_or_buffer="adult.data.txt", sep=',', names=column_names, skipinitialspace=True)

In [7]:
print(df.shape)
print(100 * (df["target"].value_counts() / len(df["target"])))
print("")
print("missing values")
for col in column_names:
    if df[col].dtype == object:
        missing_count = (df[col] == "?").sum()
        print(f"{col}:{missing_count}")

(32561, 15)
<=50K    75.919044
>50K     24.080956
Name: target, dtype: float64

missing values
workclass:1836
education:0
marital-status:0
occupation:1843
relationship:0
race:0
sex:0
native-country:583
target:0


In [9]:
df.shape

(32561, 15)

In [22]:
df["age_bin"] = pd.cut(df['age'], [0, 20,29,39,49,59,200], labels=["below_20", "20s", "30s", "40s","50s","above_60"])

In [23]:
df[["age", "age_bin"]]

Unnamed: 0,age,age_bin
0,39,30s
1,50,50s
2,38,30s
3,53,50s
4,28,20s
5,37,30s
6,49,40s
7,52,50s
8,31,30s
9,42,40s


In [26]:
df["age_bin"].value_counts(normalize=True)

30s         0.264519
20s         0.224225
40s         0.220356
50s         0.135684
above_60    0.081201
below_20    0.074015
Name: age_bin, dtype: float64

In [35]:
df[["age_bin", "target"]].groupby("age_bin").size()

age_bin
below_20    2410
20s         7301
30s         8613
40s         7175
50s         4418
above_60    2644
dtype: int64

In [82]:
df["target"] = df["target"].str.strip()
less_50k = df[df["target"] == "<=50K"]
more_50k = df[df["target"] == '>50K']

In [83]:
df[df["target"].str.strip() == ">50K"]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,age_test,age_bin
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,50s,50s
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K,30s,30s
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K,40s,40s
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K,30s,30s
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K,30s,30s
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K,40s,40s
19,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K,40s,40s
20,40,Private,193524,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K,40s,40s
25,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K,50s,50s
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K,50s,50s


In [89]:
less_50k["age_bin"].value_counts(normalize=True)

20s         0.274757
30s         0.255016
40s         0.182565
50s         0.109668
below_20    0.097411
above_60    0.080583
Name: age_bin, dtype: float64

In [88]:
more_50k["age_bin"].value_counts(normalize=True)

40s         0.339498
30s         0.294478
50s         0.217702
above_60    0.083153
20s         0.064915
below_20    0.000255
Name: age_bin, dtype: float64

In [91]:
less_50k["marital-status"].value_counts(normalize=True)

Never-married            0.412298
Married-civ-spouse       0.335113
Divorced                 0.161003
Separated                0.038794
Widowed                  0.036731
Married-spouse-absent    0.015534
Married-AF-spouse        0.000526
Name: marital-status, dtype: float64

In [90]:
more_50k["marital-status"].value_counts(normalize=True)

Married-civ-spouse       0.853463
Never-married            0.062620
Divorced                 0.059049
Widowed                  0.010840
Separated                0.008417
Married-spouse-absent    0.004336
Married-AF-spouse        0.001275
Name: marital-status, dtype: float64

In [92]:
from sklearn.tree import DecisionTreeClassifier

In [95]:
df_dummies = pd.get_dummies(df.drop('target', 1))
df_x = df_dummies.copy().values
df_y = df["target"].copy().values
clf = DecisionTreeClassifier()
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.3,random_state=0)
clf.fit(X_train, y_train)
feature_importances = sorted(list(zip(df_dummies.columns.values, clf.feature_importances_)), key=lambda tup: tup[1], reverse=True)
print("feature importance")
for fi in feature_importances[:10]:
    print(fi)
print("")

feature importance
('marital-status_Married-civ-spouse', 0.19911998453564506)
('fnlwgt', 0.17540771920564502)
('education-num', 0.11263610239776166)
('capital-gain', 0.10299987665629544)
('age', 0.10038091291021688)
('hours-per-week', 0.06276243338810401)
('capital-loss', 0.03781330602550995)
('workclass_Private', 0.010661527235736209)
('workclass_Self-emp-not-inc', 0.0093041894218122)
('occupation_Exec-managerial', 0.009299318175953612)

