# Function

In [241]:
import numpy as np
import pandas as pd

def one_hot_encoder(data, col_name):
    """
    Encodes col_name according to the one-hot encoding schema.

    The encoded column is removed and replaced by one 0/1 integer column per
    distinct value, named col_name_<cat_value>. The new columns are appended
    after the remaining original columns, ordered by the first appearance of
    each value in the column. The input dataframe is not modified.

    Special cases:
    - if there is no data for the transformation (zero rows), the original
      dataframe is returned as-is.
    - if there is missing data in the column (None, NaN, NaT or pd.NA),
      an exception is raised.

    Parameters
    ----------
    data: pandas.DataFrame
        Pandas data frame
    col_name: str
        the column name that needs to be encoded

    Returns
    -------
    pandas.DataFrame
        A new data frame that keeps the index of ``data``, or ``data`` itself
        when it has no rows.

    Raises
    ------
    NotImplementedError
        If the column contains missing values.
    """

    # An empty dataframe has nothing to encode; hand it back untouched.
    if len(data) == 0:
        return data

    column = data[col_name]

    # Missing data is not handled yet. isna() recognises every missing-value
    # marker, whereas looking only for a literal None lets NaN slip through.
    if column.isna().any():
        raise NotImplementedError('There is missing data in the column.')

    # codes[i] is the position of row i's value inside cat_values; the unique
    # values come back in order of first appearance (no sorting).
    codes, uniques = pd.factorize(column)
    cat_values = uniques.tolist()

    # One row per record, one column per category; set a single 1 per row.
    one_hot_vectors = np.zeros((len(data), len(cat_values)), dtype=int)
    one_hot_vectors[np.arange(len(data)), codes] = 1

    # Names of the new indicator columns.
    col_names = [col_name + '_' + str(value) for value in cat_values]

    # Reuse the original index so that concat aligns the rows one-to-one.
    df_one_hot = pd.DataFrame(one_hot_vectors, columns=col_names, index=data.index)

    # drop() already returns a new frame, so the caller's data stays intact.
    return pd.concat([data.drop(col_name, axis=1), df_one_hot], axis=1)

# Testing

## Case 1: Empty Data Set

In [242]:
# An empty frame (two named columns, zero rows) to exercise the no-data path.
dfe = pd.DataFrame(
    data=[],
    columns=['column1', 'column2'],
)
dfe

Unnamed: 0,column1,column2


In [243]:
# With zero rows the encoder's `len(data) == 0` guard returns the frame as-is,
# so 'column1' is still present and unencoded in the result.
one_hot_encoder(data=dfe, col_name='column1')

Unnamed: 0,column1,column2


## Case 2: Column Contains Only Missing Values

In [244]:
# Two rows whose 'column1' holds only None, i.e. the column is entirely missing.
dfce = pd.DataFrame(
    data=[
        [None, 1],
        [None, 2],
    ],
    columns=['column1', 'column2'],
)
dfce

Unnamed: 0,column1,column2
0,,1
1,,2


In [245]:
# The error is the point of this case: catch it and show it, so the cell
# demonstrates the failure without aborting a Restart & Run All.
try:
    one_hot_encoder(data=dfce, col_name='column1')
except NotImplementedError as err:
    print('{}: {}'.format(type(err).__name__, err))

NotImplementedError: There is missing data in the column.

## Case 3: Values Exist

In [246]:
# The "mpg" CSV from the seaborn-data repository, fetched over the network.
# Alternative dataset from the same repository:
#   https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv'

# Fixed seed so that every run draws the same ten rows and the encoded
# result further down is reproducible.
dft = pd.read_csv(url).sample(10, random_state=42).copy()
dft

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
158,16.0,8,318.0,150.0,4498,14.5,75,usa,plymouth grand fury
73,13.0,8,307.0,130.0,4098,14.0,72,usa,chevrolet chevelle concours (sw)
317,34.3,4,97.0,78.0,2188,15.8,80,europe,audi 4000
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
204,32.0,4,85.0,70.0,1990,17.0,76,japan,datsun b-210
374,23.0,4,151.0,,3035,20.5,82,usa,amc concord dl
205,28.0,4,97.0,75.0,2155,16.4,76,japan,toyota corolla
235,26.0,4,97.0,75.0,2265,18.2,77,japan,toyota corolla liftback
243,21.5,3,80.0,110.0,2720,13.5,77,japan,mazda rx-4
257,19.4,6,232.0,90.0,3210,17.2,78,usa,amc concord


In [248]:
# Encode 'cylinders': the result drops that column and appends one 0/1
# cylinders_<value> column per distinct value found in the sample.
one_hot_encoder(data=dft, col_name='cylinders')

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,model_year,origin,name,cylinders_8,cylinders_4,cylinders_3,cylinders_6
158,16.0,318.0,150.0,4498,14.5,75,usa,plymouth grand fury,1,0,0,0
73,13.0,307.0,130.0,4098,14.0,72,usa,chevrolet chevelle concours (sw),1,0,0,0
317,34.3,97.0,78.0,2188,15.8,80,europe,audi 4000,0,1,0,0
394,44.0,97.0,52.0,2130,24.6,82,europe,vw pickup,0,1,0,0
204,32.0,85.0,70.0,1990,17.0,76,japan,datsun b-210,0,1,0,0
374,23.0,151.0,,3035,20.5,82,usa,amc concord dl,0,1,0,0
205,28.0,97.0,75.0,2155,16.4,76,japan,toyota corolla,0,1,0,0
235,26.0,97.0,75.0,2265,18.2,77,japan,toyota corolla liftback,0,1,0,0
243,21.5,80.0,110.0,2720,13.5,77,japan,mazda rx-4,0,0,1,0
257,19.4,232.0,90.0,3210,17.2,78,usa,amc concord,0,0,0,1
