In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
%matplotlib inline

In [2]:
cols = ['age',
        'workclass',
        'fnlwgt',
        'education',
        'education-num',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
        'native-country',
        'outcome'
]
df = pd.read_csv('adult.data', names=cols, na_values='?' )

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.outcome.value_counts(dropna=False, normalize=True)

 <=50K    0.75919
 >50K     0.24081
Name: outcome, dtype: float64

In [5]:
df.workclass.value_counts(dropna=False)

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

# Cross-Validation

In [6]:
from sklearn import model_selection

# create new column kfold and fill in with -1
df['kfold'] = -1

# randomize the rows in dataframe
df.sample(frac=1).reset_index(drop=True)

# fetch labels
y = df.outcome.values

# initiate kafold
kf = model_selection.StratifiedKFold(n_splits=5)

# fill the new kfold column
for fold, (trn_, val_) in enumerate(kf.split(X=df, y=y)):
  df.loc[val_, 'kfold'] = fold

In [7]:
# save the new csv with kfold column
df.to_csv("adult_census_folds.csv", index=False)

In [13]:
df = pd.read_csv("adult_census_folds.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,outcome,kfold
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


# OHE + Logistic Regression Model

In [18]:
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

def run(fold):
  # load the full training data with folds
  df = pd.read_csv('adult_census_folds.csv')

  # drop education-num
  df.drop(columns=['education-num'], inplace=True)

  # list of numerical columns
  num_cols = [
        "fnlwgt",
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week"
  ]

  # map targets to 0s and 1s
  target_mapping = {
      ' <=50K': 0,
      ' >50K': 1,
  }

  df.loc[:, 'outcome'] = df.outcome.map(target_mapping)

  # all columns are features except kfold & outcome columns
  features = [f for f in df.columns if f not in ('kfold', 'outcome') ]

  # fill all NaN values with NONE
  for f in features:
    if f not in num_cols:
      df.loc[:, f] = df[f].astype(str).fillna('NONE')
  
  # get training data using folds
  df_train = df[df.kfold != fold].reset_index(drop=True)

  # get validation data using folds
  df_valid = df[df.kfold == fold].reset_index(drop=True)

  # initialize OneHotEncoder from scikit-learn
  ohe = preprocessing.OneHotEncoder()

  # fit ohe on training + validation features
  for f in features:
    if f not in num_cols:
      full_data = pd.concat([df_train[features], df_valid[features]])
      ohe.fit(full_data[features])
      # transform training data
      x_train = ohe.transform(df_train[features])
      # transform validation data
      x_valid = ohe.transform(df_valid[features])
  
  # initialize Logistic Regression model
  model = linear_model.LogisticRegression()

  # fit model on training data (ohe)
  model.fit(x_train, df_train.outcome.values)

  # predict on validation data
  valid_preds = model.predict_proba(x_valid)[:, 1]

  # get roc auc score
  auc = metrics.roc_auc_score(df_valid.outcome.values, valid_preds)

  # print auc
  print(f"Fold = {fold}, AUC = {auc}")


In [20]:
for fold_ in range(5):
  run(fold_)

Fold = 0, AUC = 0.9217743507397576
Fold = 1, AUC = 0.9206054236014793
Fold = 2, AUC = 0.9270881706748234
Fold = 3, AUC = 0.9266587437462849
Fold = 4, AUC = 0.9271822085933888
