In [30]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OrdinalEncoder

# OrdinalEncoder is used to convert categorical (non-numeric) features into numbers.
# Example: ['cat', 'dog', 'fish'] → [0, 1, 2]
# - It assigns each unique category in a feature column an integer code
#   (by default the codes are returned as floats, e.g. 0.0, 1.0, 2.0).
# - This is useful because machine learning models generally require numeric input.
# - Unlike OneHotEncoder, it does NOT create extra columns for each category;
#   instead, it just replaces them with integers.
# - It works best when the categorical values have an ORDER (e.g., 'low', 'medium', 'high').
#   If there’s no order (e.g., 'red', 'blue', 'green'), the numeric encoding may
#   accidentally imply an order that doesn’t exist.

In [43]:
# Load the 'car' dataset from OpenML as a pandas DataFrame.
# Several active versions of 'car' exist on OpenML; without an explicit
# `version` scikit-learn prints a warning and chooses one for us.
# Pin version 2 (OpenML id 991, the variant with the 'binaryClass' column)
# so the notebook always loads the same data.
data = fetch_openml('car', version=2, as_frame=True).frame

- version 2, status: active
  url: https://www.openml.org/search?type=data&id=991
- version 3, status: active
  url: https://www.openml.org/search?type=data&id=40975



In [44]:
# Show the loaded DataFrame; the notebook renders a truncated preview
# (first and last rows) rather than every row.
data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,binaryClass
0,vhigh,vhigh,2,2,small,low,P
1,vhigh,vhigh,2,2,small,med,P
2,vhigh,vhigh,2,2,small,high,P
3,vhigh,vhigh,2,2,med,low,P
4,vhigh,vhigh,2,2,med,med,P
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,N
1724,low,low,5more,more,med,high,N
1725,low,low,5more,more,big,low,P
1726,low,low,5more,more,big,med,N


In [45]:
# Categorical columns with a natural ranking that we want to encode
columns_to_encode = ['lug_boot', 'safety']

# Build an OrdinalEncoder with an explicit category order per column
# (the inner lists follow the same order as columns_to_encode):
# - 'lug_boot': small < med < big
# - 'safety'  : low < med < high
# Each category is replaced by its position in the list given here,
# so the order of the lists decides which number every label receives.
encoder = OrdinalEncoder(
    categories=[
        ['small', 'med', 'big'],   # lug_boot: 'small' -> 0, 'med' -> 1, 'big' -> 2
        ['low', 'med', 'high'],    # safety:   'low' -> 0, 'med' -> 1, 'high' -> 2
    ],
)

# Convert the selected columns to plain strings so the encoder can match
# them against the category lists above.
data[columns_to_encode] = data[columns_to_encode].astype(str)

# Encode in two explicit steps (equivalent to a single fit_transform call):
#   1. fit() learns the mapping from text categories -> numbers
#   2. transform() replaces the text in the dataset with those numbers
encoder.fit(data[columns_to_encode])
data[columns_to_encode] = encoder.transform(data[columns_to_encode])

data


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,binaryClass
0,vhigh,vhigh,2,2,0.0,0.0,P
1,vhigh,vhigh,2,2,0.0,1.0,P
2,vhigh,vhigh,2,2,0.0,2.0,P
3,vhigh,vhigh,2,2,1.0,0.0,P
4,vhigh,vhigh,2,2,1.0,1.0,P
...,...,...,...,...,...,...,...
1723,low,low,5more,more,1.0,1.0,N
1724,low,low,5more,more,1.0,2.0,N
1725,low,low,5more,more,2.0,0.0,P
1726,low,low,5more,more,2.0,1.0,N


In [46]:
# Map the numeric codes back to their original text labels.
# The result is returned as a new array; `data` itself is left unchanged.
encoder.inverse_transform(data.loc[:, columns_to_encode])

array([['small', 'low'],
       ['small', 'med'],
       ['small', 'high'],
       ...,
       ['big', 'low'],
       ['big', 'med'],
       ['big', 'high']], shape=(1728, 2), dtype=object)

In [49]:
# Load the 'adult' dataset from OpenML as a pandas DataFrame.
# Two active versions of 'adult' exist on OpenML; without an explicit
# `version` scikit-learn prints a warning and chooses one for us.
# Pin version 1 (OpenML id 179, the variant whose numeric columns such as
# 'age' and 'capitalgain' are already binned) so the notebook is reproducible.
data = fetch_openml('adult', version=1, as_frame=True).frame
data

- version 1, status: active
  url: https://www.openml.org/search?type=data&id=179
- version 2, status: active
  url: https://www.openml.org/search?type=data&id=1590



Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States,<=50K
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States,<=50K
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States,<=50K
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States,<=50K
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,2,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,2,United-States,<=50K
48838,4,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,2,United-States,<=50K
48839,2,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,3,United-States,<=50K
48840,2,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,2,0,2,United-States,<=50K


In [50]:
# OneHotEncoder is a preprocessing tool from scikit-learn.
# It is used to convert categorical (non-numeric) data into a format that 
# machine learning models can understand.
#
# How it works:
# - Each unique category in a column is turned into a new column (a binary feature).
# - The value is marked as 1 if the row belongs to that category, and 0 otherwise.
#
# Example:
#   Original "Color" column:  ['Red', 'Blue', 'Green']
#   After OneHotEncoding: 
#       Color_Red   Color_Blue   Color_Green
#          1            0             0
#          0            1             0
#          0            0             1
#
# Key points:
# - Unlike OrdinalEncoder, OneHotEncoder does NOT imply any order between categories.
# - It’s the standard choice when categories are just labels (like 'dog', 'cat', 'fish').
# - It prevents models from mistakenly thinking 'Red'=0, 'Blue'=1, 'Green'=2 means Red<Blue<Green.
from sklearn.preprocessing import OneHotEncoder

In [52]:
# data['occupation'] → picks the "occupation" column out of the DataFrame 'data'
# .value_counts()    → tallies how often each distinct value occurs in that column
data['occupation'].value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [53]:
import pandas as pd

# Build the OneHotEncoder
# - sparse_output=False → return a regular dense 2D NumPy array, which is easy
#   to inspect; the default sparse matrix uses less memory but is harder to read.
# - handle_unknown='ignore' → a category that was not seen during fit (e.g. one
#   that only shows up in test data) is skipped instead of raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories of 'occupation' and 'race' and encode them in one call
# - fit part: collects every unique category of the two columns
# - transform part: swaps each category for 0/1 indicator columns
# Note: missing values count as a category of their own here
# (see 'occupation_nan' in the column names below).
encoded_values = encoder.fit_transform(data.loc[:, ['occupation', 'race']])

# Ask the encoder for the names of the columns it produced
# Example: with 'occupation' categories ['teacher', 'doctor']
#          and 'race' categories ['white', 'black']
# new_cols would be:
#   ['occupation_doctor', 'occupation_teacher', 'race_black', 'race_white']
new_cols = encoder.get_feature_names_out(input_features=['occupation', 'race'])
new_cols

array(['occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machine-op-inspct', 'occupation_Other-service',
       'occupation_Priv-house-serv', 'occupation_Prof-specialty',
       'occupation_Protective-serv', 'occupation_Sales',
       'occupation_Tech-support', 'occupation_Transport-moving',
       'occupation_nan', 'race_Amer-Indian-Eskimo',
       'race_Asian-Pac-Islander', 'race_Black', 'race_Other',
       'race_White'], dtype=object)

In [57]:
# Wrap the encoded array in a DataFrame that reuses the generated column
# names and the row index of 'data', so the rows line up when combined.
df_encoded = pd.DataFrame(data=encoded_values, index=data.index, columns=new_cols)

# Remove the original 'occupation' and 'race' columns, then place the
# remaining columns and the new indicator columns side by side.
data_final = pd.concat(
    objs=[data.drop(['occupation', 'race'], axis='columns'), df_encoded],
    axis='columns',
)

data_final

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,relationship,sex,capitalgain,capitalloss,...,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,2,State-gov,77516,Bachelors,13,Never-married,Not-in-family,Male,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Husband,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,Private,215646,HS-grad,9,Divorced,Not-in-family,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,Private,234721,11th,7,Married-civ-spouse,Husband,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Wife,Female,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,2,Private,215419,Bachelors,13,Divorced,Not-in-family,Female,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
48838,4,,321403,HS-grad,9,Widowed,Other-relative,Male,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
48839,2,Private,374983,Bachelors,13,Married-civ-spouse,Husband,Male,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
48840,2,Private,83891,Bachelors,13,Divorced,Own-child,Male,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
