# Encoding Categorical Variables dataset generation

In [1]:
import os
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from feature_engine.encoding import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

np.random.seed(42)

In [2]:
# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

In [3]:
# metadata
adult.metadata

{'uci_id': 2,
 'name': 'Adult',
 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult',
 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv',
 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ',
 'area': 'Social Science',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 48842,
 'num_features': 14,
 'feature_types': ['Categorical', 'Integer'],
 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'],
 'target_col': ['income'],
 'index_col': None,
 'has_missing_values': 'yes',
 'missing_values_symbol': 'NaN',
 'year_of_dataset_creation': 1996,
 'last_updated': 'Tue Sep 24 2024',
 'dataset_doi': '10.24432/C5XW20',
 'creators': ['Barry Becker', 'Ronny Kohavi'],
 'intro_paper': None,
 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was ex

In [4]:
# variable information
adult.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,,no
1,workclass,Feature,Categorical,Income,"Private, Self-emp-not-inc, Self-emp-inc, Feder...",,yes
2,fnlwgt,Feature,Integer,,,,no
3,education,Feature,Categorical,Education Level,"Bachelors, Some-college, 11th, HS-grad, Prof-...",,no
4,education-num,Feature,Integer,Education Level,,,no
5,marital-status,Feature,Categorical,Other,"Married-civ-spouse, Divorced, Never-married, S...",,no
6,occupation,Feature,Categorical,Other,"Tech-support, Craft-repair, Other-service, Sal...",,yes
7,relationship,Feature,Categorical,Other,"Wife, Own-child, Husband, Not-in-family, Other...",,no
8,race,Feature,Categorical,Race,"White, Asian-Pac-Islander, Amer-Indian-Eskimo,...",,no
9,sex,Feature,Binary,Sex,"Female, Male.",,no


In [5]:
print(adult.metadata.additional_info.variable_info)

Listing of attributes:

>50K, <=50K.

age: continuous.
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
fnlwgt: continuous.
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
education-num: continuous.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
sex: Female, Male.
capital-gain: continuous.
capital-loss: continuous.
hours-per-week: continuous.
native-country: Unite

In [6]:
X.shape

(48842, 14)

In [7]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [8]:
X = X.drop(columns=['education-num'])

In [9]:
X.isna().sum()


age                 0
workclass         963
fnlwgt              0
education           0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64

In [10]:
X.workclass.value_counts()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
?                    1836
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [11]:
X.occupation.value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
?                    1843
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [12]:
X['native-country'].value_counts()

native-country
United-States                 43832
Mexico                          951
?                               583
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru         

In [13]:
# Fill missing values with "missing" string
X['workclass'] = X['workclass'].fillna('missing')
X['occupation'] = X['occupation'].fillna('missing')
X['native-country'] = X['native-country'].fillna('missing')

# Verify no more missing values
print("Missing values after filling:")
print(X.isna().sum())

Missing values after filling:
age               0
workclass         0
fnlwgt            0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64


In [14]:
y.value_counts() / len(y)

income
<=50K     0.506122
<=50K.    0.254596
>50K      0.160538
>50K.     0.078744
Name: count, dtype: float64

In [15]:
# Convert income to binary (1 if >50K, 0 otherwise)
# The data seems to have two formats: with and without periods
y = y.map(lambda x: 1 if x in ['>50K', '>50K.'] else 0)

# Rename the series to be more descriptive
y.name = 'income>50k'

# Verify the conversion
print("Income distribution after conversion:")
print(y.value_counts() / len(y))

Income distribution after conversion:
income
0         0.760718
1         0.239282
Name: count, dtype: float64


In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train.to_csv('X_train_adult.csv', index=False)
X_test.to_csv('X_test_adult.csv', index=False)
y_train.to_csv('y_train_adult.csv', index=False)
y_test.to_csv('y_test_adult.csv', index=False)

# Baseline model with OHE to compare

In [18]:
ohe_enc = OneHotEncoder(drop_last=False)
ohe_enc.fit(X_train)
ohe_enc.variables_

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [19]:
ohe_enc.encoder_dict_

{'workclass': ['Private',
  'State-gov',
  'Local-gov',
  'Self-emp-not-inc',
  '?',
  'Self-emp-inc',
  'Federal-gov',
  'missing',
  'Without-pay',
  'Never-worked'],
 'education': ['HS-grad',
  'Masters',
  '12th',
  '11th',
  'Bachelors',
  'Some-college',
  '10th',
  'Doctorate',
  '7th-8th',
  'Prof-school',
  '9th',
  'Preschool',
  'Assoc-acdm',
  '5th-6th',
  'Assoc-voc',
  '1st-4th'],
 'marital-status': ['Married-civ-spouse',
  'Divorced',
  'Widowed',
  'Never-married',
  'Separated',
  'Married-spouse-absent',
  'Married-AF-spouse'],
 'occupation': ['Transport-moving',
  'Prof-specialty',
  'Farming-fishing',
  'Craft-repair',
  'Sales',
  'Other-service',
  'Protective-serv',
  'Exec-managerial',
  '?',
  'Handlers-cleaners',
  'Machine-op-inspct',
  'Adm-clerical',
  'Tech-support',
  'Priv-house-serv',
  'missing',
  'Armed-Forces'],
 'relationship': ['Husband',
  'Unmarried',
  'Not-in-family',
  'Other-relative',
  'Own-child',
  'Wife'],
 'race': ['White',
  'Black',


In [20]:
X_train_enc = ohe_enc.transform(X_train)
X_test_enc = ohe_enc.transform(X_test)

In [21]:
# Initialize the model
rf_model = RandomForestClassifier(max_depth=10, random_state=42)

# Train the model
rf_model.fit(X_train_enc, y_train.values.ravel())

In [22]:
# Make predictions for train set and analyze results
y_pred = rf_model.predict(X_train_enc)

# Evaluate the model
accuracy = accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_train, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_train, y_pred))

Accuracy: 0.8649

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     29741
           1       0.83      0.54      0.66      9332

    accuracy                           0.86     39073
   macro avg       0.85      0.75      0.79     39073
weighted avg       0.86      0.86      0.85     39073


Confusion Matrix:
[[28712  1029]
 [ 4251  5081]]


In [23]:
# Make predictions for test set and analyze results
y_pred = rf_model.predict(X_test_enc)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8551

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      7414
           1       0.82      0.51      0.63      2355

    accuracy                           0.86      9769
   macro avg       0.84      0.74      0.77      9769
weighted avg       0.85      0.86      0.84      9769


Confusion Matrix:
[[7152  262]
 [1154 1201]]
