In [None]:
"""
- label encoding
- one hot encoding
- binarization
"""
from sklearn import preprocessing

class CategoricalFeatures:
    """Encode the categorical columns of a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied; the caller's frame is never modified.
    categorical_features : list of str
        Names of the columns to encode, e.g. ``["ord_1", "nom_0", ...]``.
    encoding_type : str
        ``"label"`` replaces each column by integer codes; ``"binary"``
        replaces each column by the indicator columns produced by
        ``LabelBinarizer``, named ``<column>_bin_<j>``. Any other value makes
        ``fit_transform`` / ``transform`` raise ``ValueError``.
    handle_na : bool, default True
        When true, missing values are replaced by ``NA_PLACEHOLDER`` before
        encoding. When false, columns are only cast to ``str`` and missing
        values keep whatever text that cast produces.

    Attributes
    ----------
    df : the stringified working copy of the input frame.
    output_df : result of the most recent encoding of ``df`` (a plain copy of
        the input until ``fit_transform`` is called).
    label_encoders, binary_encoders : dict mapping column name to the fitted
        encoder, filled by ``fit_transform``.
    """

    # Text substituted for missing values when ``handle_na`` is true.
    NA_PLACEHOLDER = "-99999999"

    def __init__(self, df, categorical_features, encoding_type, handle_na=True):
        self.cat_feats = categorical_features
        self.enc_type = encoding_type
        self.handle_na = handle_na
        self.label_encoders = dict()
        self.binary_encoders = dict()
        # Work on copies so the caller's frame is left untouched.
        self.df = self._prepare(df)
        self.output_df = df.copy(deep=True)

    def _prepare(self, frame):
        """Return a deep copy of *frame* with the categorical columns as text."""
        prepared = frame.copy(deep=True)
        for c in self.cat_feats:
            column = prepared[c]
            as_text = column.astype(str)
            if self.handle_na:
                # The mask is taken from the original column: once cast to
                # str a NaN is ordinary text, so fillna() after the cast
                # would never match anything.
                as_text = as_text.where(column.notna(), self.NA_PLACEHOLDER)
            # Plain column assignment replaces the column (and its dtype)
            # instead of writing strings into the existing array.
            prepared[c] = as_text
        return prepared

    def _label_encoding(self, frame=None, fit=True):
        """Integer-encode the categorical columns.

        With ``frame=None`` the stored frame is encoded and the result is also
        kept in ``self.output_df``. With ``fit=False`` the encoders stored in
        ``self.label_encoders`` are reused instead of being refitted.
        """
        source = self.df if frame is None else frame
        encoded = source.copy(deep=True)
        for c in self.cat_feats:
            values = source[c].values
            if fit:
                lbl = preprocessing.LabelEncoder()
                lbl.fit(values)
                self.label_encoders[c] = lbl
            else:
                lbl = self.label_encoders[c]
            encoded[c] = lbl.transform(values)
        if frame is None:
            self.output_df = encoded
        return encoded

    def _label_binarization(self, frame=None, fit=True):
        """Replace each categorical column by its binarized indicator columns.

        With ``frame=None`` the stored frame is encoded and the result is also
        kept in ``self.output_df``. With ``fit=False`` the encoders stored in
        ``self.binary_encoders`` are reused instead of being refitted.
        """
        import pandas as pd  # local import: the module itself only imports sklearn

        source = self.df if frame is None else frame
        # Untouched columns first, then one block of indicator columns per
        # feature, in the order the features were listed.
        pieces = [source.drop(columns=list(self.cat_feats))]
        for c in self.cat_feats:
            values = source[c].values
            if fit:
                lbl = preprocessing.LabelBinarizer()
                lbl.fit(values)
                self.binary_encoders[c] = lbl
            else:
                lbl = self.binary_encoders[c]
            val = lbl.transform(values)
            names = [f"{c}_bin_{j}" for j in range(val.shape[1])]
            pieces.append(pd.DataFrame(val, index=source.index, columns=names))
        # A single concat avoids inserting the new columns one at a time.
        encoded = pd.concat(pieces, axis=1)
        if frame is None:
            self.output_df = encoded
        return encoded

    def fit_transform(self):
        """Fit the encoders on the stored frame and return the encoded frame.

        Safe to call repeatedly: every call starts again from the stored
        frame. Raises ``ValueError`` for an unsupported ``encoding_type``.
        """
        if self.enc_type == "label":
            return self._label_encoding()
        if self.enc_type == "binary":
            return self._label_binarization()
        raise ValueError("Encoding type not understood")

    def transform(self, dataframe=None):
        """Encode *dataframe* with the encoders fitted by ``fit_transform``.

        Parameters
        ----------
        dataframe : pandas.DataFrame, optional
            New data containing the categorical columns. It is not modified.
            When omitted, the frame given at construction is (re)encoded,
            which is the legacy zero-argument call form.

        Raises
        ------
        ValueError
            If ``encoding_type`` is not supported.
        RuntimeError
            If *dataframe* is given before ``fit_transform`` has been called.
        """
        if self.enc_type == "label":
            encoders, encode = self.label_encoders, self._label_encoding
        elif self.enc_type == "binary":
            encoders, encode = self.binary_encoders, self._label_binarization
        else:
            raise ValueError("Encoding type not understood")

        if dataframe is None:
            return self.fit_transform()

        if any(c not in encoders for c in self.cat_feats):
            raise RuntimeError("fit_transform() must be called before transform(dataframe)")
        return encode(self._prepare(dataframe), fit=False)

if __name__ == "__main__":
    import pandas as pd

    # Demo: encode every feature column of the training CSV and show a sample.
    df = pd.read_csv("../input/train_cat.csv")
    # Single place that selects the encoding; it is passed to the encoder
    # below instead of a second, hard-coded literal.
    enc_type = "binary"
    # Every column except "id" and "target" is treated as categorical.
    cols = [c for c in df.columns if c not in ["id", "target"]]
    print(cols)
    cat_feats = CategoricalFeatures(
        df=df,
        categorical_features=cols,
        encoding_type=enc_type,
        handle_na=True,
    )
    # fit_transform() fits the encoders and returns the encoded frame.
    output_df = cat_feats.fit_transform()
    print(output_df.head())

['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month']
