In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [3]:
# Load the adult census CSV; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='adult_cencus.csv')

In [4]:
# Preview five random rows. The fixed seed keeps the preview identical across
# kernel restarts, so the rendered output stays reproducible.
df.sample(n=5, random_state=42)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
26466,36,Private,331395,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K
43071,32,Self-emp-not-inc,135304,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,3942,0,32,United-States,<=50K
47992,20,Private,398166,11th,7,Never-married,Other-service,Own-child,Black,Male,0,0,40,United-States,<=50K
30897,18,,267399,12th,8,Never-married,,Own-child,White,Female,0,0,12,United-States,<=50K
11264,57,Private,191983,Some-college,10,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,50,United-States,<=50K


In [5]:
# Class balance of the target column; missing labels (if any) get their own row.
df.loc[:, 'class'].value_counts(dropna=False)

 <=50K    37155
 >50K     11687
Name: class, dtype: int64

#### Separate the feature matrix from the target vector

In [6]:
# Target vector: the `class` column. Feature matrix: every other column.
target = df['class']
data = df.drop(columns=['class'])

#### Separate numerical and categorical features

In [14]:
from sklearn.compose import make_column_selector as selector

# Build one selector per dtype family, then apply each to the feature matrix
# to obtain the list of matching column names.
numeric_selector = selector(dtype_include=np.number)
object_selector = selector(dtype_include=object)

numerical = numeric_selector(data)
categorical = object_selector(data)

In [15]:
# Column names matched by the numeric-dtype selector.
numerical

['age',
 'fnlwgt',
 'education_num',
 'capital_gain',
 'capital_loss',
 'hours_per_week']

In [16]:
# Column names matched by the object-dtype selector.
categorical

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

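#### The `education` variable carries the same information as `education_num`, so it could be dropped (the drop below is left commented out because the encoding examples that follow still use `education`).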

In [11]:
# NOTE(review): kept disabled. Later cells index data_categorical[['education']],
# which would raise a KeyError if 'education' were removed from `categorical` here.
# del categorical[categorical.index('education')]

In [12]:
# NOTE(review): the saved output of this cell omits 'education', but the delete
# in the previous cell is commented out, so a fresh run still lists it.
categorical

['workclass',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

## Strategies to encode categories

### Encoding ordinal categories

The most intuitive strategy is to encode each category with a different
number. The `OrdinalEncoder` will transform the data in this manner.
We will start by encoding a single column to understand how the encoding
works.

In [17]:
# Restrict the feature matrix to the object-dtype columns selected earlier.
data_categorical = data[categorical]
# Fixed seed so the three-row preview is the same on every run.
data_categorical.sample(n=3, random_state=42)

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country
35743,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,Male,United-States
24082,Private,HS-grad,Married-civ-spouse,Other-service,Husband,White,Male,United-States
12594,State-gov,Some-college,Divorced,Adm-clerical,Not-in-family,Black,Female,United-States


In [18]:
from sklearn.preprocessing import OrdinalEncoder

# Double brackets keep `education` as a one-column DataFrame, i.e. the 2-D
# input that the encoder expects.
education_frame = data_categorical[['education']]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_frame)

In [19]:
# First five encoded rows: a single column of float category codes.
education_encoded[:5]

array([[ 9.],
       [ 9.],
       [11.],
       [ 1.],
       [ 9.]])

We see that each category in `"education"` has been replaced by a numeric
value. We could check the mapping between the categories and the numerical
values by checking the fitted attribute `categories_`.

In [21]:
# One array per fitted column; a category's position in its array is the
# integer code assigned to it.
encoder.categories_

[array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object)]

Now we can check the encoding applied to all categorical features.

In [22]:
# Re-fit the same encoder on every categorical column at once (this replaces
# the earlier single-column fit), then peek at the first five encoded rows.
data_encoded = encoder.fit_transform(X=data_categorical)
data_encoded[0:5]

array([[ 6.,  9.,  4.,  0.,  1.,  4.,  1., 38.],
       [ 5.,  9.,  2.,  3.,  0.,  4.,  1., 38.],
       [ 3., 11.,  0.,  5.,  1.,  4.,  1., 38.],
       [ 3.,  1.,  2.,  5.,  0.,  2.,  1., 38.],
       [ 3.,  9.,  2.,  9.,  5.,  2.,  0.,  4.]])

In [23]:
# (rows, columns) of the encoded array.
data_encoded.shape

(48842, 8)

In [24]:
# Wrap the encoded array in a DataFrame, restoring the column names and reusing
# the source frame's index so each encoded row stays aligned with its row in
# `data_categorical` (identical to the default when the index is 0..n-1).
data_encoded = pd.DataFrame(
    data_encoded,
    index=data_categorical.index,
    columns=data_categorical.columns,
)
data_encoded.head()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country
0,6.0,9.0,4.0,0.0,1.0,4.0,1.0,38.0
1,5.0,9.0,2.0,3.0,0.0,4.0,1.0,38.0
2,3.0,11.0,0.0,5.0,1.0,4.0,1.0,38.0
3,3.0,1.0,2.0,5.0,0.0,2.0,1.0,38.0
4,3.0,9.0,2.0,9.0,5.0,2.0,0.0,4.0
