In [1]:
# Most machine learning algorithms work best when all input features share a similar range, generally close to zero.
# This notebook implements a helper function that makes encoding and scaling the columns of a pandas dataframe easy.

In [2]:
# Importing helper libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn import compose, preprocessing

In [3]:
# Dummy dataframe for the tutorial: two categorical columns plus six
# continuous columns that all hold the same values (one per scaling method).
df = pd.DataFrame({
    'categorical_1': [-1, 0, 1, 2, 2, 2, 2, 2],
    'categorical_2': [1, 2, 3, 4, 5, 6, 7, 8],
    **{
        name: [0.5, 20, 1, 115, -0.5, 1.5, 0, 0.5]
        for name in ('continous_1', 'continous_2', 'continous_3',
                     'continous_4', 'continous_5', 'continous_6')
    },
})
df.head(10)

Unnamed: 0,categorical_1,categorical_2,continous_1,continous_2,continous_3,continous_4,continous_5,continous_6
0,-1,1,0.5,0.5,0.5,0.5,0.5,0.5
1,0,2,20.0,20.0,20.0,20.0,20.0,20.0
2,1,3,1.0,1.0,1.0,1.0,1.0,1.0
3,2,4,115.0,115.0,115.0,115.0,115.0,115.0
4,2,5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5
5,2,6,1.5,1.5,1.5,1.5,1.5,1.5
6,2,7,0.0,0.0,0.0,0.0,0.0,0.0
7,2,8,0.5,0.5,0.5,0.5,0.5,0.5


In [4]:
# Helper function
def data_encoder(df, categoricals, continous, manual_embedding=None,
                 fitted_transformers=None, drop_original=True):
    """Encode selected columns of a dataframe for machine learning.

    Categorical encodings: 'one-hot', 'manual_embedding'.
    Continuous encodings: 'scale: 0..1', 'scale: -1..1',
    'simple standard scaling', 'robust standard scaling',
    'quantile transformer' (normal output), 'power transformer' (Yeo-Johnson).

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe. It is not modified; a new dataframe is returned.
    categoricals : dict
        Encoding for categorical columns, key = column name, value = encoding type.
    continous : dict
        Encoding for continuous columns, key = column name, value = encoding type.
    manual_embedding : dict, optional
        Rules for 'manual_embedding' columns, key = column name,
        value = dict mapping each raw value to a sequence (the embedding vector).
    fitted_transformers : list, optional
        Transformers returned by a previous call on the train data. When given,
        they are applied with ``transform`` (never refitted) and consumed in the
        order the continuous columns appear in ``df.columns``.
    drop_original : bool, default True
        Drop every column listed in ``categoricals`` or ``continous`` from the result.

    Returns
    -------
    (pandas.DataFrame, list)
        The dataframe with the encoded columns appended (one-hot columns are
        named ``<column>_<value>``, embedding columns ``<column>_<dim>``, scaled
        columns ``<column>_enc``) and the list of transformers used for the
        continuous columns, in column order.

    Notes
    -----
    One-hot columns are derived from the values present in ``df`` only, so a
    train and a test frame with different category values yield different columns.
    """
    import warnings

    # Scaler factories keyed by the encoding names accepted in `continous`.
    # Lambdas defer construction so a scaler is only built when it is needed.
    scaler_factories = {
        'scale: 0..1':
            lambda: preprocessing.MinMaxScaler(),
        'scale: -1..1':
            lambda: preprocessing.MinMaxScaler(feature_range=(-1, 1)),
        'simple standard scaling':
            lambda: preprocessing.StandardScaler(),
        'robust standard scaling':
            lambda: preprocessing.RobustScaler(),
        'quantile transformer':
            lambda: preprocessing.QuantileTransformer(output_distribution='normal'),
        'power transformer':
            lambda: preprocessing.PowerTransformer(method='yeo-johnson', standardize=True),
    }

    encoded_parts = []      # new column groups, appended to df once at the end
    to_be_dropped = []
    transformers = []
    transformer_index = 0   # position of the next transformer to reuse

    for i, column in enumerate(df.columns):
        # Check if any categorical transformation is needed for the column
        if column in categoricals:
            encoding = categoricals[column]
            if encoding == 'one-hot':
                encoded_parts.append(pd.get_dummies(df[column], prefix=column))

            elif encoding == 'manual_embedding':
                rules = manual_embedding[column]
                # All rule values are expected to have the same length;
                # the first one defines the embedding dimension.
                dim = len(list(rules.values())[0])
                embedded = pd.DataFrame(index=df.index)
                for j in range(dim):
                    # One output column per embedding dimension
                    embedded[column + '_' + str(j)] = df[column].apply(
                        lambda value, axis=j: rules[value][axis])
                encoded_parts.append(embedded)

            else:
                # Unknown encoding type: nothing is produced for this column
                warnings.warn(
                    "Unknown categorical encoding %r for column %r; "
                    "no encoded columns were created" % (encoding, column),
                    stacklevel=2)

        # Check if any continuous transformation is needed for the column
        if column in continous:
            encoding = continous[column]
            if encoding in scaler_factories:
                # The transformer always sees the original input frame, so the
                # positional selector [i] and the number of features stay the
                # same between the fitting call and later reuse.
                if fitted_transformers is None:
                    ct = compose.ColumnTransformer(
                        [(column, scaler_factories[encoding](), [i])])
                    values = ct.fit_transform(df)
                else:
                    # Reuse the train-time fit: transform only, never refit,
                    # otherwise the test data would define its own scaling.
                    ct = fitted_transformers[transformer_index]
                    transformer_index += 1
                    values = ct.transform(df)
                transformers.append(ct)
                # Keep df's index so the concat below aligns row by row even
                # when df does not have a default RangeIndex.
                encoded_parts.append(
                    pd.DataFrame({column + '_enc': values.reshape(-1)}, index=df.index))

            else:
                # Unknown encoding type: nothing is produced for this column
                warnings.warn(
                    "Unknown continuous encoding %r for column %r; "
                    "no encoded column was created" % (encoding, column),
                    stacklevel=2)

        # Remember which input columns to drop
        if drop_original and ((column in categoricals) or (column in continous)):
            to_be_dropped.append(column)

    # Merge all encoded columns with the input dataframe in one pass
    result = pd.concat([df] + encoded_parts, axis=1)
    result = result.drop(columns=to_be_dropped)

    return result, transformers

In [5]:
# Encoding plan: column name -> encoding type understood by data_encoder.
categoricals = dict(
    categorical_1='one-hot',
    categorical_2='manual_embedding',
)

continous = dict([
    ('continous_1', 'scale: 0..1'),
    ('continous_2', 'scale: -1..1'),
    ('continous_3', 'simple standard scaling'),
    ('continous_4', 'robust standard scaling'),
    ('continous_5', 'quantile transformer'),
    ('continous_6', 'power transformer'),
])

# Manual embedding rules: raw value 1..8 -> 2-D vector.
# NOTE(review): the diagonal vectors use sqrt(2), giving them norm 2 while the
# axis-aligned vectors have norm 1 - confirm whether sqrt(2)/2 was intended.
side = dict(enumerate([
    (0, 1),
    (np.sqrt(2), np.sqrt(2)),
    (1, 0),
    (np.sqrt(2), -np.sqrt(2)),
    (0, -1),
    (-np.sqrt(2), -np.sqrt(2)),
    (-1, 0),
    (-np.sqrt(2), np.sqrt(2)),
], start=1))
manual_embedding = dict(categorical_2=side)

In [6]:
# Train set encoding
df_encoded, fitted_transformers = data_encoder(df,
                                               categoricals,
                                               continous,
                                               manual_embedding=manual_embedding,
                                               fitted_transformers=None,
                                               drop_original=False)
df_encoded.head(10)

  % (self.n_quantiles, n_samples))


Unnamed: 0,categorical_1,categorical_2,continous_1,continous_2,continous_3,continous_4,continous_5,continous_6,categorical_1_-1,categorical_1_0,categorical_1_1,categorical_1_2,categorical_2_0,categorical_2_1,continous_1_enc,continous_2_enc,continous_3_enc,continous_4_enc,continous_5_enc,continous_6_enc
0,-1,1,0.5,0.5,0.5,0.5,0.5,0.5,1,0,0,0,0.0,1.0,0.008658,-0.982684,-0.446696,-0.043478,-0.366106,-0.334672
1,0,2,20.0,20.0,20.0,20.0,20.0,20.0,0,1,0,0,1.414214,1.414214,0.177489,-0.645022,0.073338,3.347826,1.067571,1.288261
2,1,3,1.0,1.0,1.0,1.0,1.0,1.0,0,0,1,0,1.0,0.0,0.012987,-0.974026,-0.433362,0.043478,0.180012,-0.059789
3,2,4,115.0,115.0,115.0,115.0,115.0,115.0,0,0,0,1,1.414214,-1.414214,1.0,1.0,2.60684,19.869565,5.199338,1.700907
4,2,5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,0,0,0,1,0.0,-1.0,0.0,-1.0,-0.473365,-0.217391,-5.199338,-1.609322
5,2,6,1.5,1.5,1.5,1.5,1.5,1.5,0,0,0,1,-1.414214,-1.414214,0.017316,-0.965368,-0.420028,0.130435,0.565949,0.131852
6,2,7,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,-1.0,0.0,0.004329,-0.991342,-0.460031,-0.130435,-1.067571,-0.782566
7,2,8,0.5,0.5,0.5,0.5,0.5,0.5,0,0,0,1,-1.414214,1.414214,0.008658,-0.982684,-0.446696,-0.043478,-0.366106,-0.334672


In [7]:
# Test set encoding
# Use fitted_transformers from train set encoding
# to apply the same transformation to test set.
df_encoded, fitted_transformers = data_encoder(df,
                                               categoricals,
                                               continous,
                                               manual_embedding=manual_embedding,
                                               fitted_transformers=fitted_transformers,
                                               drop_original=False)
df_encoded.head(10)

  % (self.n_quantiles, n_samples))


Unnamed: 0,categorical_1,categorical_2,continous_1,continous_2,continous_3,continous_4,continous_5,continous_6,categorical_1_-1,categorical_1_0,categorical_1_1,categorical_1_2,categorical_2_0,categorical_2_1,continous_1_enc,continous_2_enc,continous_3_enc,continous_4_enc,continous_5_enc,continous_6_enc
0,-1,1,0.5,0.5,0.5,0.5,0.5,0.5,1,0,0,0,0.0,1.0,0.008658,-0.982684,-0.446696,-0.043478,-0.366106,-0.334672
1,0,2,20.0,20.0,20.0,20.0,20.0,20.0,0,1,0,0,1.414214,1.414214,0.177489,-0.645022,0.073338,3.347826,1.067571,1.288261
2,1,3,1.0,1.0,1.0,1.0,1.0,1.0,0,0,1,0,1.0,0.0,0.012987,-0.974026,-0.433362,0.043478,0.180012,-0.059789
3,2,4,115.0,115.0,115.0,115.0,115.0,115.0,0,0,0,1,1.414214,-1.414214,1.0,1.0,2.60684,19.869565,5.199338,1.700907
4,2,5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,0,0,0,1,0.0,-1.0,0.0,-1.0,-0.473365,-0.217391,-5.199338,-1.609322
5,2,6,1.5,1.5,1.5,1.5,1.5,1.5,0,0,0,1,-1.414214,-1.414214,0.017316,-0.965368,-0.420028,0.130435,0.565949,0.131852
6,2,7,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,-1.0,0.0,0.004329,-0.991342,-0.460031,-0.130435,-1.067571,-0.782566
7,2,8,0.5,0.5,0.5,0.5,0.5,0.5,0,0,0,1,-1.414214,1.414214,0.008658,-0.982684,-0.446696,-0.043478,-0.366106,-0.334672
