<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/OrdinalNominalProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Raw column values for a five-row toy table; position i of every list
# belongs to the same row.
color_col = ['Red', 'Blue', 'Yellow', 'Red', 'Red']
breakfast_col = ['Every day', 'Never', 'Rarely', 'Most days', 'Never']
age_col = [6, 8, 10, 7, 43]
brand_col = ['Toyota', 'Mercedes', 'Mercedes', 'Tesla', 'Tesla']
weather_col = ['Hot', 'Cold', 'Cold', 'Cold', 'Cold']

# Assemble the frame; the keyword order fixes the column order and the rows
# are labelled A..E.
df = pd.DataFrame(
    dict(
        Color=color_col,
        Breakfast=breakfast_col,
        Age=age_col,
        Brand=brand_col,
        Weather=weather_col,
    ),
    index=['A', 'B', 'C', 'D', 'E'],
)
df

Unnamed: 0,Color,Breakfast,Age,Brand,Weather
A,Red,Every day,6,Toyota,Hot
B,Blue,Never,8,Mercedes,Cold
C,Yellow,Rarely,10,Mercedes,Cold
D,Red,Most days,7,Tesla,Cold
E,Red,Never,43,Tesla,Cold


In [0]:
# Columns whose categories have no inherent order (one-hot encoded later).
nominal_columns = ['Color', 'Brand']

# Columns whose categories can be ranked (mapped to integers later).
ordinal_columns = ['Breakfast', 'Weather']

# Rank of every category, lowest first; ranks start at 1.
breakfast_rank_dict = {
    label: rank
    for rank, label in enumerate(
        ['Never', 'Rarely', 'Most days', 'Every day'], start=1
    )
}

weather_rank_dict = {
    label: rank
    for rank, label in enumerate(['Cold', 'Hot'], start=1)
}

# Lookup from ordinal column name to that column's category -> rank mapping.
ordinal_rank_dict = dict(
    Breakfast=breakfast_rank_dict,
    Weather=weather_rank_dict,
)

def categorical_column_stats(categorical_df):
    """Print per-column category statistics for a DataFrame.

    Output comes in two parts: one summary line per column with its number
    of unique values, then a header followed by the ``value_counts()`` of
    every column. Nothing is returned.
    """
    column_names = categorical_df.columns

    for name in column_names:
        n_unique = categorical_df[name].nunique()
        print(f'{name:10} has {n_unique:3} unique categories.')

    separator = '-' * 45
    print(f'\ndistribution of categories by columns\n{separator}')

    for name in column_names:
        counts = categorical_df[name].value_counts()
        # The trailing newline leaves a blank line between columns.
        print(f'{counts}\n')

def split_nominal_categorical_columns(categorical_df,
                                      ordinal_columns,
                                      nominal_columns):
    """Split a categorical frame into its ordinal and nominal parts.

    Returns an ``(ordinal_df, nominal_df)`` tuple. Both frames are copies,
    so modifying them does not touch ``categorical_df``.
    """
    return tuple(
        categorical_df[selected].copy()
        for selected in (ordinal_columns, nominal_columns)
    )


def custom_mapping_ordinal_columns(ordinal_columns,
                                   ordinal_df,
                                   ordinal_rank_dict):
    """Add a ``<column>_encoded`` rank column for every ordinal column.

    ``ordinal_df`` is modified in place and nothing is returned. Each value
    is looked up in ``ordinal_rank_dict[column]``; a value without a rank
    raises ``KeyError``.
    """
    for column in ordinal_columns:
        def rank_of(category, _column=column):
            # Bind the current column so the lookup is per-column.
            return ordinal_rank_dict[_column][category]

        encoded_name = column + '_encoded'
        ordinal_df[encoded_name] = ordinal_df[column].map(rank_of)

def one_hot_encoding(nominal_df):
    """Return the One-Hot encoded DataFrame.

    Every column of ``nominal_df`` is encoded with scikit-learn's
    ``OneHotEncoder``. The result has one indicator column per category,
    named after the category value, and shares the index of ``nominal_df``.

    Note: output columns are named by the bare category value, so a value
    that appears in two input columns yields two identically named output
    columns.
    """
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword. Try the current spelling first and fall back for
    # older releases, so a dense array is returned either way.
    try:
        one_hot_encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        one_hot_encoder = OneHotEncoder(sparse=False)

    one_hot_encoded_array = one_hot_encoder.fit_transform(nominal_df)

    # ``categories_`` holds one array of categories per input column; flatten
    # them in order so the names line up with the encoded array's columns.
    all_one_hot_categories = [
        category
        for categories in one_hot_encoder.categories_
        for category in categories
    ]

    one_hot_df = pd.DataFrame(data=one_hot_encoded_array,
                              columns=all_one_hot_categories,
                              index=nominal_df.index)
    return one_hot_df

def categorical_encoding(nominal_columns, ordinal_columns,
                         ordinal_rank_dict, source_df=None):
    """Return the encoded categorical columns as a DataFrame.

    Parameters
    ----------
    nominal_columns : list
        Names of the columns to one-hot encode.
    ordinal_columns : list
        Names of the columns to map to integer ranks.
    ordinal_rank_dict : dict
        Maps each ordinal column name to its category -> rank dict.
    source_df : pandas.DataFrame, optional
        Frame to encode. When omitted, the notebook-level ``df`` is used,
        which keeps existing three-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The ordinal columns with their ``*_encoded`` ranks, the original
        nominal columns, and the one-hot indicator columns, concatenated
        column-wise.

    Side effects: prints the excluded numerical columns and the category
    statistics of the categorical columns.
    """
    if source_df is None:
        # Backward-compatible default: fall back to the global frame only
        # when the caller did not pass one explicitly.
        source_df = df

    numerical_df = source_df.select_dtypes(exclude=['object'])

    categorical_df = source_df.select_dtypes(include=['object'])
    print(f'numerical columns excluded: {numerical_df.columns}')

    categorical_column_stats(categorical_df)

    ordinal_df, nominal_df = split_nominal_categorical_columns(
                                    categorical_df,
                                    ordinal_columns,
                                    nominal_columns)

    # Adds the ``*_encoded`` columns to ordinal_df in place.
    custom_mapping_ordinal_columns(ordinal_columns,
                                   ordinal_df,
                                   ordinal_rank_dict)

    one_hot_df = one_hot_encoding(nominal_df)

    encoded_categorical_df = pd.concat(
        [ordinal_df,
         nominal_df,
         one_hot_df,
         ],
        axis='columns'
    )
    return encoded_categorical_df


In [4]:
# Encode the sample frame; the returned DataFrame is the cell's output.
categorical_encoding(
    nominal_columns=nominal_columns,
    ordinal_columns=ordinal_columns,
    ordinal_rank_dict=ordinal_rank_dict,
)

numerical columns excluded: Index(['Age'], dtype='object')
Color      has   3 unique categories.
Breakfast  has   4 unique categories.
Brand      has   3 unique categories.
Weather    has   2 unique categories.

distribution of categories by columns
---------------------------------------------
Red       3
Blue      1
Yellow    1
Name: Color, dtype: int64

Never        2
Most days    1
Every day    1
Rarely       1
Name: Breakfast, dtype: int64

Mercedes    2
Tesla       2
Toyota      1
Name: Brand, dtype: int64

Cold    4
Hot     1
Name: Weather, dtype: int64



Unnamed: 0,Breakfast,Weather,Breakfast_encoded,Weather_encoded,Color,Brand,Blue,Red,Yellow,Mercedes,Tesla,Toyota
A,Every day,Hot,4,2,Red,Toyota,0.0,1.0,0.0,0.0,0.0,1.0
B,Never,Cold,1,1,Blue,Mercedes,1.0,0.0,0.0,1.0,0.0,0.0
C,Rarely,Cold,2,1,Yellow,Mercedes,0.0,0.0,1.0,1.0,0.0,0.0
D,Most days,Cold,3,1,Red,Tesla,0.0,1.0,0.0,0.0,1.0,0.0
E,Never,Cold,1,1,Red,Tesla,0.0,1.0,0.0,0.0,1.0,0.0
