In [50]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit,KFold
from sklearn.metrics import accuracy_score,fbeta_score,f1_score
import numpy as np
import matplotlib.pyplot as plt
import re

ADULT INCOME

In [51]:
column_names = [
    "age", # continuous
    "workclass", # Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked
    "fnlwgt", # continuous 
    "education", # Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool
    "education-num", # continuous. 
    "marital-status", # Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
    "occupation", # Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
    "relationship", # Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried 
    "race", # White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black, 
    "sex", # Female, Male, 
    "capital-gain", # continuous, 
    "capital-loss", # continuous, 
    "hours-per-week", # continuous, 
    "native-country", # United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands 
    "target" # >50K, <=50K
]
df = pd.read_csv(filepath_or_buffer="adult.data.txt", sep=',', names=column_names, skipinitialspace=True)

In [52]:
df = df.drop(['fnlwgt','education-num'],1)

In [53]:
print(f"Shape: {df.shape}")
print("")
print("Missing values")
print("--------------")
for col in df.columns.value:
    if df[col].dtype == object:
        missing_count = (df[col] == "?").sum()
        print(f"{col}:{missing_count}")

Shape: (32561, 13)

Missing values
--------------
workclass:1836


KeyError: 'fnlwgt'

In [29]:
df["target"] = df["target"].map({"<=50K": 0, ">50K": 1})
df["workclass"] = df["workclass"].replace(to_replace='?', value=np.nan)
df["occupation"] = df["occupation"].replace(to_replace='?', value=np.nan)
df["native-country"] = df["native-country"].replace(to_replace='?', value=np.nan)
df = df.dropna()

In [37]:
replacements = { 'Cambodia':' SE-Asia',
                'Canada':' British-Commonwealth',
                'China':' China',
                'Columbia':' South-America',
                'Cuba':' Other',
                'Dominican-Republic':' Latin-America',
                'Ecuador':' South-America',
                'El-Salvador':' South-America ',
                'England':' British-Commonwealth',
                'France':' Euro_1',
                'Germany':' Euro_1',
                'Greece':' Euro_2',
                'Guatemala':' Latin-America',
                'Haiti':' Latin-America',
                'Holand-Netherlands':' Euro_1',
                'Honduras':' Latin-America',
                'Hong':' China',
                'Hungary':' Euro_2',
                'India':' British-Commonwealth',
                'Iran':' Other',
                'Ireland':' British-Commonwealth',
                'Italy':' Euro_1',
                'Jamaica':' Latin-America',
                'Japan':' Other',
                'Laos':' SE-Asia',
                'Mexico':' Latin-America',
                'Nicaragua':' Latin-America',
                'Outlying-US(Guam-USVI-etc)':' Latin-America',
                'Peru':' South-America',
                'Philippines':' SE-Asia',
                'Poland':' Euro_2',
                'Portugal':' Euro_2',
                'Puerto-Rico':' Latin-America',
                'Scotland':' British-Commonwealth',
                'South':' Euro_2',
                'Taiwan':' China',
                'Thailand':' SE-Asia',
                'Trinadad&Tobago':' Latin-America',
                'United-States':' United-States',
                'Vietnam':' SE-Asia',
                'Yugoslavia':' Euro_2'}
df['native-country'] = df['native-country'].str.strip()
df = df.replace(to_replace={'native-country': replacements, 'relationship': {' Husband': 'Spouse',' Wife':'Spouse'}})    
df['native-country'] = df['native-country'].str.strip()

KeyError: "labels ['fnlwt' 'edu'] not contained in axis"

In [36]:
pd.get_dummies(df.drop("target", axis=1)).shape

(30162, 72)

In [49]:
df.to_csv(path_or_buf='adult_cleaned.csv', index=False)