# Function

In [1]:
import numpy as np
import pandas as pd

def one_hot_encoder(data, col_name):
    """
    Encodes col_name according to the one-hot encoding schema.

    The column col_name is removed from the result and replaced by one
    integer (0/1) column per distinct value. The new columns are named
    col_name_<cat_value>, are appended after the remaining original columns
    in order of first appearance of each value, and share the index of the
    input. The input data frame itself is not modified.

    Special cases:
    - if there is no data for the transformation (zero rows), the original
      dataframe is returned as is (the very same object, not a copy).
    - if there is missing data in the column (None, NaN, ...), an exception
      is raised.

    Parameters
    ----------
    data: pandas.DataFrame
        Pandas data frame
    col_name: str
        the column name that needs to be encoded

    Returns
    -------
    pandas.DataFrame
        the data without col_name, followed by the one-hot columns

    Raises
    ------
    NotImplementedError
        if col_name contains missing values; they are not handled yet
    KeyError
        if data has rows but no column called col_name
    """

    #
    # checking for extreme cases
    #

    # if it is an empty dataframe, there is nothing to do
    if len(data) == 0:
        return data

    column = data[col_name]

    # Missing data is not handled yet. isna() recognises every missing marker
    # (None as well as NaN), whereas searching the unique values for None
    # overlooks NaN, which then either crashes the lookup further down or is
    # silently encoded as a category of its own.
    if column.isna().any():
        raise NotImplementedError('There is missing data in the column.')

    #
    # there is some data, processing it
    #

    # codes[i] is the position of row i's value among the distinct values;
    # the distinct values are listed in order of first appearance
    codes, uniques = pd.factorize(column)
    cat_values = uniques.tolist()

    # array of the one-hot vectors: all zeros, then a single '1' per row.
    # Rows are addressed by position, so the labels (and the name) of the
    # data frame index play no role here.
    one_hot_vectors = np.zeros((len(data), len(cat_values)), dtype=int)
    one_hot_vectors[np.arange(len(data)), codes] = 1

    # getting the names of new columns
    col_names = [col_name + '_' + str(value) for value in cat_values]

    # the new columns reuse the original index so that they line up row by row
    df_one_hot = pd.DataFrame(one_hot_vectors, columns=col_names, index=data.index)

    # combining the new columns with the rest of the original dataframe;
    # drop() already returns a new object, so the input stays untouched
    df = data.drop(col_name, axis=1)

    return pd.concat([df, df_one_hot], axis=1)

# Testing

## Case 1: Empty Data Set

In [2]:
# A frame with the expected columns but no rows at all.
dfe = pd.DataFrame(
    data=[],
    columns=['column1', 'column2'],
)
dfe

Unnamed: 0,column1,column2


In [3]:
# With zero rows there is nothing to encode, so the frame comes back unchanged.
one_hot_encoder(
    data=dfe,
    col_name='column1',
)

Unnamed: 0,column1,column2


## Case 2: No Values in Column

In [4]:
# Two rows whose 'column1' holds nothing but missing values.
dfce = pd.DataFrame(
    data=[
        [None, 1],
        [None, 2],
    ],
    columns=['column1', 'column2'],
)
dfce

Unnamed: 0,column1,column2
0,,1
1,,2


In [5]:
# The column holds only missing values, so the encoder is expected to refuse it.
# The exception is caught and printed instead of being left unhandled, so that
# "Restart & Run All" can continue past this cell to the remaining test cases.
try:
    one_hot_encoder(data=dfce, col_name='column1')
except NotImplementedError as error:
    print(type(error).__name__ + ': ' + str(error))
else:
    # reaching this point means the missing data went unnoticed
    raise AssertionError('Expected NotImplementedError for a column with missing data.')

NotImplementedError: There is missing data in the column.

## Case 3: Values Exist

In [None]:
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv'
# alternative data set:
# https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv

# A fixed random_state makes the 10-row sample, and therefore the encoded
# columns shown below, identical on every run of the notebook.
dft = pd.read_csv(url).sample(10, random_state=42).copy()
dft

In [None]:
# Every distinct 'cylinders' value in the sample becomes its own 0/1 column.
one_hot_encoder(
    data=dft,
    col_name='cylinders',
)