<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/CategoricalPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

In [0]:
# Toy data set: one list per column, five rows each.
color_col = ['Red', 'Blue', 'Yellow', 'Red', 'Red']

breakfast_col = ['Every day', 'Never', 'Rarely', 'Most days', 'Never']

age_col = [6, 8, 10, 7, 43]

brand_col = ['Toyota', 'Mercedes', 'Mercedes', 'Tesla', 'Tesla']

weather_col = ['Hot', 'Cold', 'Cold', 'Cold', 'Cold']

# Keyword order fixes the column order; rows are labelled A through E.
df = pd.DataFrame(
    dict(
        Color=color_col,
        Breakfast=breakfast_col,
        Age=age_col,
        Brand=brand_col,
        Weather=weather_col,
    ),
    index=['A', 'B', 'C', 'D', 'E'],
)
df

Unnamed: 0,Color,Breakfast,Age,Brand,Weather
A,Red,Every day,6,Toyota,Hot
B,Blue,Never,8,Mercedes,Cold
C,Yellow,Rarely,10,Mercedes,Cold
D,Red,Most days,7,Tesla,Cold
E,Red,Never,43,Tesla,Cold


In [0]:
# Columns whose categories have no inherent order (to be one-hot encoded).
nominal_columns = ['Color', 'Brand']

# Columns whose categories are ordered (to be mapped to integer ranks).
ordinal_columns = ['Breakfast', 'Weather']

# Ranks start at 1 and follow the order in which the labels are listed.
breakfast_rank_dict = {
    label: rank
    for rank, label in enumerate(
        ['Never', 'Rarely', 'Most days', 'Every day'], start=1
    )
}

weather_rank_dict = {
    label: rank for rank, label in enumerate(['Cold', 'Hot'], start=1)
}

# Per-column lookup: ordinal column name -> {category: rank}.
ordinal_rank_dict = dict(
    Breakfast=breakfast_rank_dict,
    Weather=weather_rank_dict,
)



In [0]:
class CategoricalPreprocessing:
    """Encode the categorical columns of a DataFrame.

    Ordinal columns are mapped to ranks through user-supplied dictionaries;
    nominal columns are one-hot encoded with scikit-learn's ``OneHotEncoder``.

    Typical call order::

        prep = CategoricalPreprocessing(df, ordinal_columns, nominal_columns,
                                        ordinal_rank_dict)
        prep.custom_mapping_ordinal_columns()
        prep.one_hot_encoding()
        prep.encode()
        encoded = prep.get_encoded_df()

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. A copy is stored, so the caller's frame is not modified.
    ordinal_columns : list of str
        Names of the ordered categorical columns.
    nominal_columns : list of str
        Names of the unordered categorical columns.
    ordinal_rank_dict : dict
        ``{ordinal column name: {category: rank}}``.

    Raises
    ------
    KeyError
        If a listed ordinal or nominal column is not among the object-dtype
        columns of ``df``.
    """

    def __init__(self, df, ordinal_columns, nominal_columns,
                 ordinal_rank_dict):
        self.df = df.copy()
        self.ordinal_columns = ordinal_columns
        self.nominal_columns = nominal_columns
        self.ordinal_rank_dict = ordinal_rank_dict

        # Categorical columns are detected purely by ``object`` dtype.
        # NOTE(review): string columns must therefore be stored as object
        # dtype -- confirm for the pandas version in use.
        self.numerical_df = self.df.select_dtypes(exclude=['object'])
        self.categorical_df = self.df.select_dtypes(include=['object'])
        self.ordinal_df = pd.DataFrame(data={})
        self.nominal_df = pd.DataFrame(data={})
        self._split_nominal_categorical_columns()

        # Filled in by one_hot_encoding() and encode() respectively.
        self.one_hot_df = pd.DataFrame(data={})
        self.encoded_categorical_df = pd.DataFrame(data={})

    def categorical_column_stats(self):
        """Print the number of unique categories and the value counts of
        every categorical (object-dtype) column. Returns nothing.
        """
        print('Categorical columns:')
        for col in self.categorical_df.columns:
            print(f'{col:10} has {self.categorical_df[col].nunique():3} unique categories.')

        new_line = '\n'
        dash = '-'
        print(f'{new_line}distribution of categories by columns{new_line}{dash*45}')

        for col in self.categorical_df.columns:
            print(f'{self.categorical_df[col].value_counts()}{new_line}')

    def _split_nominal_categorical_columns(self):
        """Populate ``self.ordinal_df`` and ``self.nominal_df`` with copies of
        the corresponding columns of ``self.categorical_df``.
        """
        self.ordinal_df = self.categorical_df[self.ordinal_columns].copy()
        self.nominal_df = self.categorical_df[self.nominal_columns].copy()

    def custom_mapping_ordinal_columns(self):
        """Add a ``<column>_encoded`` rank column to ``self.ordinal_df`` for
        every ordinal column.

        Missing values stay missing (NaN) in the encoded column.

        Raises
        ------
        KeyError
            If an ordinal column has no entry in ``ordinal_rank_dict`` or
            contains a non-missing category that has no rank.
        """
        for col in self.ordinal_columns:
            rank_map = self.ordinal_rank_dict[col]
            source = self.ordinal_df[col]

            # Fail loudly on categories without a rank instead of silently
            # producing NaN; genuine missing values are allowed through.
            unknown = source[source.notna() & ~source.isin(list(rank_map))]
            if not unknown.empty:
                raise KeyError(
                    f'Column {col!r} contains categories without a rank: '
                    f'{sorted(map(str, unknown.unique()))}'
                )

            self.ordinal_df[col + '_encoded'] = source.map(rank_map)

    @staticmethod
    def _one_hot_column_names(columns, categories_per_column):
        """Return one output column label per (column, category) pair.

        A category that occurs only once keeps its bare name. A category that
        occurs under several source columns is labelled
        ``<column>_<category>`` so the resulting frame has no duplicate
        column labels.
        """
        pairs = [
            (column, category)
            for column, categories in zip(columns, categories_per_column)
            for category in categories
        ]

        occurrences = {}
        for _, category in pairs:
            occurrences[category] = occurrences.get(category, 0) + 1

        return [
            f'{column}_{category}' if occurrences[category] > 1 else category
            for column, category in pairs
        ]

    def one_hot_encoding(self):
        """One-hot encode ``self.nominal_df`` and store the result in
        ``self.one_hot_df`` (same index as ``self.nominal_df``).

        With no nominal columns the result is an empty frame that still
        carries the index, so ``encode()`` keeps working.
        """
        if self.nominal_df.shape[1] == 0:
            self.one_hot_df = pd.DataFrame(index=self.nominal_df.index)
            return

        # ``sparse`` was renamed to ``sparse_output`` in newer scikit-learn
        # releases; try the new keyword first and fall back to the old one.
        try:
            one_hot_encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            one_hot_encoder = OneHotEncoder(sparse=False)

        one_hot_encoded_array = one_hot_encoder.fit_transform(self.nominal_df)

        self.one_hot_df = pd.DataFrame(
            data=one_hot_encoded_array,
            columns=self._one_hot_column_names(
                self.nominal_df.columns, one_hot_encoder.categories_
            ),
            index=self.nominal_df.index,
        )

    def encode(self):
        """Concatenate ``ordinal_df``, ``nominal_df`` and ``one_hot_df``
        column-wise into ``self.encoded_categorical_df``.
        """
        self.encoded_categorical_df = pd.concat(
            [
                self.ordinal_df,
                self.nominal_df,
                self.one_hot_df,
            ],
            axis='columns'
        )

    def get_encoded_df(self):
        """Return a copy of the frame assembled by ``encode()``."""
        return self.encoded_categorical_df.copy()

# Run the full pipeline on the demo frame: print category statistics,
# rank-map the ordinal columns, one-hot encode the nominal ones, assemble
# the pieces and display the encoded frame.
cat = CategoricalPreprocessing(
    df,
    ordinal_columns=ordinal_columns,
    nominal_columns=nominal_columns,
    ordinal_rank_dict=ordinal_rank_dict,
)
cat.categorical_column_stats()
cat.custom_mapping_ordinal_columns()
cat.one_hot_encoding()
cat.encode()
cat.get_encoded_df()



Categorical columns:
Color      has   3 unique categories.
Breakfast  has   4 unique categories.
Brand      has   3 unique categories.
Weather    has   2 unique categories.

distribution of categories by columns
---------------------------------------------
Red       3
Blue      1
Yellow    1
Name: Color, dtype: int64

Never        2
Rarely       1
Every day    1
Most days    1
Name: Breakfast, dtype: int64

Mercedes    2
Tesla       2
Toyota      1
Name: Brand, dtype: int64

Cold    4
Hot     1
Name: Weather, dtype: int64



Unnamed: 0,Breakfast,Weather,Breakfast_encoded,Weather_encoded,Color,Brand,Blue,Red,Yellow,Mercedes,Tesla,Toyota
A,Every day,Hot,4,2,Red,Toyota,0.0,1.0,0.0,0.0,0.0,1.0
B,Never,Cold,1,1,Blue,Mercedes,1.0,0.0,0.0,1.0,0.0,0.0
C,Rarely,Cold,2,1,Yellow,Mercedes,0.0,0.0,1.0,1.0,0.0,0.0
D,Most days,Cold,3,1,Red,Tesla,0.0,1.0,0.0,0.0,1.0,0.0
E,Never,Cold,1,1,Red,Tesla,0.0,1.0,0.0,0.0,1.0,0.0
