<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/OrdinalNominalEncodings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

In [2]:
# Toy dataset: five rows (labelled A-E) mixing string-valued columns with one
# integer column, used to demonstrate ordinal and nominal encodings.
color_col = ['Red', 'Blue', 'Yellow', 'Red', 'Red']
breakfast_col = ['Every day', 'Never', 'Rarely', 'Most days', 'Never']
age_col = [6, 8, 10, 7, 43]
brand_col = ['Toyota', 'Mercedes', 'Mercedes', 'Tesla', 'Tesla']
weather_col = ['Hot', 'Cold', 'Cold', 'Cold', 'Cold']

# Keyword order fixes the column order of the resulting frame.
df = pd.DataFrame(
    dict(
        Color=color_col,
        Breakfast=breakfast_col,
        Age=age_col,
        Brand=brand_col,
        Weather=weather_col,
    ),
    index=['A', 'B', 'C', 'D', 'E'],
)
df

Unnamed: 0,Color,Breakfast,Age,Brand,Weather
A,Red,Every day,6,Toyota,Hot
B,Blue,Never,8,Mercedes,Cold
C,Yellow,Rarely,10,Mercedes,Cold
D,Red,Most days,7,Tesla,Cold
E,Red,Never,43,Tesla,Cold


In [3]:
# Split the frame by dtype: object-dtype columns are treated as categorical,
# every other column as numerical.
categorical_df = df.select_dtypes(include='object')
numerical_df = df.select_dtypes(exclude='object')

categorical_df

Unnamed: 0,Color,Breakfast,Brand,Weather
A,Red,Every day,Toyota,Hot
B,Blue,Never,Mercedes,Cold
C,Yellow,Rarely,Mercedes,Cold
D,Red,Most days,Tesla,Cold
E,Red,Never,Tesla,Cold


In [4]:
# Display the non-object columns selected in the previous cell.
numerical_df

Unnamed: 0,Age
A,6
B,8
C,10
D,7
E,43


In [5]:
# Report how many distinct categories each categorical column holds.
for col, column_values in categorical_df.items():
    print(f'{col:10} has {column_values.nunique():3} unique categories.')

# Separators interpolated into the f-strings below.
new_line = '\n'
dash = '-'
print(f'{new_line}distribution of categories by columns{new_line}{dash*45}')

# Frequency table of the categories, one column at a time.
for col, column_values in categorical_df.items():
    print(f'{column_values.value_counts()}{new_line}')

Color      has   3 unique categories.
Breakfast  has   4 unique categories.
Brand      has   3 unique categories.
Weather    has   2 unique categories.

distribution of categories by columns
---------------------------------------------
Red       3
Yellow    1
Blue      1
Name: Color, dtype: int64

Never        2
Most days    1
Every day    1
Rarely       1
Name: Breakfast, dtype: int64

Tesla       2
Mercedes    2
Toyota      1
Name: Brand, dtype: int64

Cold    4
Hot     1
Name: Weather, dtype: int64



In [0]:
# Columns whose categories have no inherent order.
nominal_columns = ['Color', 'Brand']

# Ordinal categories are listed from lowest to highest; the rank of a category
# is its 1-based position in that list.
breakfast_rank_dict = {
    category: rank
    for rank, category in enumerate(
        ['Never', 'Rarely', 'Most days', 'Every day'], start=1
    )
}

weather_rank_dict = {
    category: rank
    for rank, category in enumerate(['Cold', 'Hot'], start=1)
}

# Column name -> {category: rank} lookup for that column.
ordinal_rank_dict = dict(
    Breakfast=breakfast_rank_dict,
    Weather=weather_rank_dict,
)

# Columns whose categories carry an order (one entry per rank dictionary).
ordinal_columns = list(ordinal_rank_dict)

In [7]:
# Inspect the category -> rank lookup registered for the 'Breakfast' column.
ordinal_rank_dict['Breakfast']

{'Every day': 4, 'Most days': 3, 'Never': 1, 'Rarely': 2}

In [8]:
# Inspect the category -> rank lookup registered for the 'Weather' column.
ordinal_rank_dict['Weather']

{'Cold': 1, 'Hot': 2}

In [9]:
# Take independent copies of the two column groups so that later additions to
# one of them do not write back into categorical_df.
nominal_df = categorical_df.loc[:, nominal_columns].copy()
ordinal_df = categorical_df.loc[:, ordinal_columns].copy()

nominal_df

Unnamed: 0,Color,Brand
A,Red,Toyota
B,Blue,Mercedes
C,Yellow,Mercedes
D,Red,Tesla
E,Red,Tesla


In [10]:
# Display the ordinal columns before any encoding is applied.
ordinal_df

Unnamed: 0,Breakfast,Weather
A,Every day,Hot
B,Never,Cold
C,Rarely,Cold
D,Most days,Cold
E,Never,Cold


In [11]:
# Add a '<column>_encoded' column for every ordinal column, holding the integer
# rank of each category as defined in ordinal_rank_dict.
for col in ordinal_columns:
    rank_by_category = ordinal_rank_dict[col]

    # Mapping with the dict itself replaces the per-element lambda lookup.
    # Values missing from the dict come back as NaN.
    encoded = ordinal_df[col].map(rank_by_category)

    # Fail loudly, still with a KeyError as the direct dict lookup did, but
    # name the column and every offending category, and raise before the
    # frame is modified for this column.
    unmapped = ordinal_df.loc[encoded.isna(), col].unique().tolist()
    if unmapped:
        raise KeyError(f'No rank defined for {col!r} categories: {unmapped}')

    ordinal_df[col + '_encoded'] = encoded

ordinal_df

Unnamed: 0,Breakfast,Weather,Breakfast_encoded,Weather_encoded
A,Every day,Hot,4,2
B,Never,Cold,1,1
C,Rarely,Cold,2,1
D,Most days,Cold,3,1
E,Never,Cold,1,1


In [12]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense ndarray rather than a SciPy sparse matrix. The keyword was
# renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and the old
# spelling was removed in 1.4, so try the current name first and fall back to
# the legacy one for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# One indicator column per distinct category of every nominal column.
one_hot_encoded_array = one_hot_encoder.fit_transform(nominal_df)
one_hot_encoded_array

array([[0., 1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0.]])

In [13]:
# Flatten the encoder's per-column category arrays into one list, in the same
# order as the columns of the encoded array.
# NOTE: labels are the bare category values, so a value that appears in two
# nominal columns would produce duplicate labels.
all_categories = [
    category
    for column_categories in one_hot_encoder.categories_
    for category in column_categories
]
all_categories

['Blue', 'Red', 'Yellow', 'Mercedes', 'Tesla', 'Toyota']

In [14]:
# Wrap the encoded array in a DataFrame that reuses the row labels of
# nominal_df and names each indicator column after its category.
one_hot_df = pd.DataFrame(
    one_hot_encoded_array,
    index=nominal_df.index,
    columns=all_categories,
)
one_hot_df

Unnamed: 0,Blue,Red,Yellow,Mercedes,Tesla,Toyota
A,0.0,1.0,0.0,0.0,0.0,1.0
B,1.0,0.0,0.0,1.0,0.0,0.0
C,0.0,0.0,1.0,1.0,0.0,0.0
D,0.0,1.0,0.0,0.0,1.0,0.0
E,0.0,1.0,0.0,0.0,1.0,0.0


In [15]:
# Place the ordinal columns (original + encoded), the original nominal columns
# and their one-hot indicators side by side.
categorical_parts = [ordinal_df, nominal_df, one_hot_df]
encoded_categorical_df = pd.concat(categorical_parts, axis=1)

# Final view: numerical columns first, then every categorical representation.
dataframes = [numerical_df, encoded_categorical_df]

pd.concat(dataframes, axis=1)

Unnamed: 0,Age,Breakfast,Weather,Breakfast_encoded,Weather_encoded,Color,Brand,Blue,Red,Yellow,Mercedes,Tesla,Toyota
A,6,Every day,Hot,4,2,Red,Toyota,0.0,1.0,0.0,0.0,0.0,1.0
B,8,Never,Cold,1,1,Blue,Mercedes,1.0,0.0,0.0,1.0,0.0,0.0
C,10,Rarely,Cold,2,1,Yellow,Mercedes,0.0,0.0,1.0,1.0,0.0,0.0
D,7,Most days,Cold,3,1,Red,Tesla,0.0,1.0,0.0,0.0,1.0,0.0
E,43,Never,Cold,1,1,Red,Tesla,0.0,1.0,0.0,0.0,1.0,0.0
