In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# Seed NumPy's global random generator once, up front.
np.random.seed(0)

# Column labels, in the order they are handed to the CSV reader.
col_names = [
    "age", "workclass", "fnlwgt",
    "education", "education-num",
    "marital-status", "occupation", "relationship",
    "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country",
    "income",
]

In [4]:
# Input file location, relative to the notebook's working directory.
DATA_DIR = "data"
fn = os.path.join(DATA_DIR, "adult.csv")

# The file is read without a header row, so the labels come from `col_names`;
# fields are separated by semicolons.
df = pd.read_csv(
    fn,
    sep=";",
    header=None,
    names=col_names,
)

In [8]:
# Peek at the column at position 9, kept as a one-column DataFrame.
df.iloc[:, 9:10]

Unnamed: 0,sex
0,Male
1,Male
2,Male
3,Male
4,Female
5,Female
6,Female
7,Male
8,Female
9,Male


In [6]:
# Show the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Rows whose age is 39, showing only the age and race columns.
# A boolean mask with .loc selects the rows directly. Masking with
# DataFrame.where() followed by dropna() would first blank every non-matching
# row to NaN (upcasting integer columns to float) and would also discard
# matching rows that have a missing value in any other column.
df.loc[df["age"] == 39, ["age", "race"]]

In [5]:
# Split the column names into numeric features and everything else.
numeric_cols = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]
categorical_cols = [name for name in col_names if name not in numeric_cols]

# The column to predict; every other column of df is an input.
target_col = 'native-country'
source_cols = [name for name in df.columns if name != target_col]

# The target is removed from whichever feature group it belongs to, so it is
# never passed to the model as an input.
numeric_features = [name for name in numeric_cols if name != target_col]
categorical_features = [name for name in categorical_cols if name != target_col]

# Numeric preprocessing: median imputation followed by standard scaling.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical preprocessing: fill gaps with the constant 'missing', then
# one-hot encode, ignoring categories not seen during fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full prediction pipeline: preprocessing followed by the classifier.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs')),
])

# Separate the target from the inputs.
y = df[target_col]
X = df.drop(columns=[target_col])

In [6]:
# Hold out 20% of the rows for evaluation. The seed is pinned on the call
# itself so the split does not depend on the global NumPy random state, i.e.
# on which cells happened to run (and in what order) before this one.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [7]:
# Fit preprocessing and classifier on the training rows, then report the
# pipeline's score on the held-out rows.
# NOTE(review): if the target is dominated by a single class, this number can
# look high without the model being useful; compare against a majority-class
# baseline before drawing conclusions.
clf.fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("model score: %.3f" % test_score)



model score: 0.899
