# Categorical Variables

In [1]:
import pandas as pd

# Load the adult census table, then split it into the label column
# (`target`) and the remaining feature columns (`data`).
target_name = "class"
adult_census = pd.read_csv("datasets/adult-census.csv")

data = adult_census.drop(columns=[target_name])
target = adult_census[target_name]

In [2]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States


In [3]:
# Count how many rows fall into each "native-country" category.
data["native-country"].value_counts()

native-country
United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Nicaragua                        49
Greece                           49
Peru         

In [4]:
# List the dtype of every feature column.
data.dtypes

age                int64
workclass         object
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
dtype: object

In [6]:
from sklearn.compose import make_column_selector as selector

In [7]:
# Build a selector for object-dtype columns and apply it to `data` to obtain
# the list of categorical column names.
categorical_column_selector = selector(dtype_include=object)

categorical_columns = categorical_column_selector(data)

categorical_columns

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [8]:
# Restrict the feature table to the categorical columns and preview it.
data_categorical = data[categorical_columns]

data_categorical.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States
1,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States
2,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States
3,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States
4,?,Some-college,Never-married,?,Own-child,White,Female,United-States


In [10]:
from sklearn.preprocessing import OrdinalEncoder

In [14]:
# Index with a list of one name so the result stays a one-column DataFrame
# (the encoders below are fitted on this frame).
education_column = data_categorical[["education"]]

In [19]:
# Ordinal-encode the education column; `set_output(transform="pandas")`
# requests a pandas object from the transform instead of a bare array.
encoder = OrdinalEncoder().set_output(transform="pandas")

education_encoded = encoder.fit_transform(education_column)

In [22]:
# Display the ordinal-encoded column.
education_encoded

Unnamed: 0,education
0,1.0
1,11.0
2,7.0
3,15.0
4,15.0
...,...
48837,7.0
48838,11.0
48839,11.0
48840,11.0


In [23]:
# Inspect the category list stored on the fitted encoder.
encoder.categories_

[array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object)]

In [29]:
# Pass an explicit category list instead of letting the encoder infer one.
# NOTE(review): compared with the inferred list shown by `encoder.categories_`,
# this only moves ' Masters' from after ' HS-grad' to before ' Doctorate'; it
# is not a ranking by education level — confirm whether a meaningful order
# was intended.
encoder_manual = OrdinalEncoder(
    categories=[
        [
            ' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
            ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors',
            ' Masters', ' Doctorate',
            ' HS-grad', ' Preschool', ' Prof-school', ' Some-college',
        ]
    ]
).set_output(transform="pandas")

In [33]:
# Map each ordinal code (position index) to its category label, using the
# first `categories_` entry of `encoder` (the encoder fitted on the single
# `education` column).
mapping = dict(enumerate(encoder.categories_[0]))
mapping

{0: ' 10th',
 1: ' 11th',
 2: ' 12th',
 3: ' 1st-4th',
 4: ' 5th-6th',
 5: ' 7th-8th',
 6: ' 9th',
 7: ' Assoc-acdm',
 8: ' Assoc-voc',
 9: ' Bachelors',
 10: ' Doctorate',
 11: ' HS-grad',
 12: ' Masters',
 13: ' Preschool',
 14: ' Prof-school',
 15: ' Some-college'}

In [30]:
# Fit the manually ordered encoder on the education column and show the codes.
encoder_manual.fit_transform(education_column)

Unnamed: 0,education
0,1.0
1,12.0
2,7.0
3,15.0
4,15.0
...,...
48837,7.0
48838,12.0
48839,12.0
48840,12.0


In [31]:
# Inspect the category list stored on the manually configured encoder.
encoder_manual.categories_

[array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Masters',
        ' Doctorate', ' HS-grad', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object)]

In [34]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder with sparse output disabled and pandas output requested.
# Rebinds `encoder`, which previously referred to the OrdinalEncoder.
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")

In [35]:
# Fit the one-hot encoder on the education column; rebinds `education_encoded`.
education_encoded = encoder.fit_transform(education_column)

In [37]:
# Shape of the one-hot encoded frame as (rows, columns).
education_encoded.shape

(48842, 16)

In [38]:
# Re-display the "native-country" counts; several categories have very few rows.
data["native-country"].value_counts()

native-country
United-States                 43832
Mexico                          951
?                               857
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Nicaragua                        49
Greece                           49
Peru         

In [39]:
# One-hot encoder configured with handle_unknown="ignore", i.e. meant to
# tolerate categories not seen during fit rather than raise (see the
# scikit-learn OneHotEncoder docs for the exact encoding).
# NOTE(review): this `encoder` binding is overwritten by the next cell and not
# used in between.
encoder = OneHotEncoder(handle_unknown="ignore")

In [42]:
# Ordinal encoder configured to encode unknown categories with the value
# given by `unknown_value` (-1 here).
# NOTE(review): this `encoder` binding is not referenced by any later cell
# visible in this notebook.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

In [44]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Pipeline: one-hot encode the categorical features, then fit a logistic
# regression on the encoded result.
model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=500),
)

In [45]:
from sklearn.model_selection import cross_validate

# Cross-validate the one-hot pipeline using only the categorical columns.
cv_results = cross_validate(model, data_categorical, target)
cv_results

{'fit_time': array([0.30598044, 0.31365252, 0.30656219, 0.29843903, 0.24687195]),
 'score_time': array([0.03288913, 0.03646851, 0.040169  , 0.03167367, 0.03334761]),
 'test_score': array([0.83232675, 0.83570478, 0.82831695, 0.83292383, 0.83497133])}

In [56]:
# Same classifier as the one-hot pipeline, but with ordinal-encoded inputs;
# unknown categories are encoded as -1 via `unknown_value`.
model_ordinal = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    LogisticRegression(max_iter=500)
)

In [57]:
# Cross-validate the ordinal-encoding pipeline on the same categorical columns.
cv_results_ordinal = cross_validate(model_ordinal, data_categorical, target)
cv_results_ordinal

{'fit_time': array([0.41120839, 0.21462631, 0.29758072, 0.28427601, 0.25031972]),
 'score_time': array([0.03851843, 0.02946305, 0.03268313, 0.03133583, 0.02902174]),
 'test_score': array([0.75514382, 0.75555328, 0.75573301, 0.75358313, 0.75788288])}

## Using numerical and categorical variables together

In [47]:
# Split the column names of `data` by dtype: columns that are not object
# dtype are treated as numerical, object-dtype columns as categorical.
numerical_column_selector = selector(dtype_exclude=object)
numerical_columns = numerical_column_selector(data)

categorical_column_selector = selector(dtype_include=object)
categorical_columns = categorical_column_selector(data)

In [48]:
from sklearn.preprocessing import StandardScaler

In [49]:
# One preprocessor per column kind: one-hot encoding for categorical columns,
# standard scaling for numerical columns.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [50]:
from sklearn.compose import make_column_transformer

# Route each group of columns to its own preprocessor.
preprocessor = make_column_transformer(
    (categorical_preprocessor, categorical_columns),
    (numerical_preprocessor, numerical_columns)
)

In [52]:
# Chain the column-wise preprocessor with the classifier. Rebinds `model`,
# which previously referred to the categorical-only pipeline.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

In [54]:
# Cross-validate on all feature columns (numerical and categorical); rebinds
# `cv_results`.
cv_results = cross_validate(model, data, target)
cv_results

{'fit_time': array([0.43418956, 0.31362128, 0.34391832, 0.29754353, 0.28763533]),
 'score_time': array([0.04044676, 0.0388155 , 0.03717804, 0.03780985, 0.03593969]),
 'test_score': array([0.85136657, 0.8498311 , 0.84766585, 0.85288698, 0.85554873])}

In [55]:
# Average the per-fold test scores.
cv_results["test_score"].mean()

np.float64(0.8514598444376313)