This notebook defines the following functions for preprocessing different categories of data in structured datasets. Each function takes as input one or two pandas dataframes and the name of the column to be processed. Once defined, the functions are tested and demonstrated on a small sample dataset.

process_numerical_class(mdf_train, mdf_test, column)

process_binary_class(mdf, column, missing)

process_text_class(mdf_train, mdf_test, column)

These are meant to deal with some potential errors such as missing data, although other conceivable errors such as incompatible numeric vs string input are not addressed.

# Define Functions

In [35]:
#imports
import numpy as np
import pandas as pd
from pandas import Series
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [36]:
#process_numerical_class(mdf_train, mdf_test, column)
#function to normalize data to mean of 0 and standard deviation of 1 from training distribution
#takes as arguments pandas dataframes of training and test data (mdf_train, mdf_test)
#and the name of the column as a string ('column')
#replaces missing or improperly formatted data with mean of remaining values
#replaces original specified column in dataframe
#returns transformed dataframe

#expect this approach works better when the numerical distribution is thin tailed
#if only have training but not test data handy, use same training data for both dataframe inputs



def process_numerical_class(mdf_train, mdf_test, column):
    """Normalize a numerical column using the training set's statistics.

    Values that cannot be parsed as numbers are treated as missing, and
    missing values are replaced with the mean of the numeric training
    values. Both columns are then centred on that mean and divided by the
    standard deviation of the filled, centred training column.

    Parameters:
        mdf_train: pandas dataframe holding the training data.
        mdf_test: pandas dataframe holding the test data. May be the same
            object as mdf_train when no separate test set is available.
        column: name of the column to normalize.

    Returns:
        The tuple (mdf_train, mdf_test). These are the same dataframe
        objects that were passed in, with the column replaced in place.
    """

    #convert all values to either numeric or NaN
    #work on detached series so that passing the same dataframe as both
    #train and test does not centre and scale the column twice
    train_values = pd.to_numeric(mdf_train[column], errors='coerce')
    test_values = pd.to_numeric(mdf_test[column], errors='coerce')

    #get mean of training data
    mean = train_values.mean()
    if pd.isnull(mean):
        #no numeric training value at all: fall back to 0 so that valid
        #test values are not turned into NaN by the fill / subtraction
        mean = 0.0

    #replace missing data with training set mean, then centre on that mean
    train_values = train_values.fillna(mean) - mean
    test_values = test_values.fillna(mean) - mean

    #get standard deviation of the filled, centred training data
    std = train_values.std()
    if pd.isnull(std) or std == 0:
        #constant or single-row training column: dividing would yield
        #NaN / inf, so leave the centred values unscaled
        std = 1.0

    #divide column values by std for both training and test data
    mdf_train[column] = train_values / std
    mdf_test[column] = test_values / std

    return mdf_train, mdf_test


In [37]:
#process_binary_class(mdf, column, missing)
#converts binary classification values to 0 or 1
#takes as arguments a pandas dataframe (mdf),
#the name of the column as a string ('column'),
#and the classification string to assign to missing data ('missing')
#replaces original specified column in dataframe
#returns transformed dataframe

#missing category must be identical to one of the two existing categories
#prints an error message and returns the dataframe without encoding if more than two categories remain


def process_binary_class(mdf, column, missing):
    """Encode a two-category column as 0/1.

    Missing entries are first assigned the category given by `missing`.
    If more than two distinct values remain after that fill, an error
    message is printed and the column is left un-encoded (the fill is
    still applied). Otherwise the column is replaced by the output of
    sklearn's LabelBinarizer, which decides which category maps to 1.

    Parameters:
        mdf: pandas dataframe containing the column; modified in place.
        column: name of the column to encode.
        missing: category value to substitute for missing entries.

    Returns:
        mdf, the same dataframe object that was passed in.
    """

    #fill gaps with the caller-supplied classification
    filled = mdf[column].fillna(missing)
    mdf[column] = filled

    #a 0/1 encoding is only attempted for at most two distinct values
    category_count = len(mdf[column].unique())
    if category_count <= 2:
        binarizer = preprocessing.LabelBinarizer()
        mdf[column] = binarizer.fit_transform(mdf[column])
    else:
        print('ERROR: number of categories in column for process_binary_class() call >2')

    return mdf
   

In [38]:

#process_text_class(mdf_train, mdf_test, column)
#preprocess column with text classifications
#takes as arguments two pandas dataframes containing training and test data respectively
#(mdf_train, mdf_test), and the name of the column as a string ('column')

#note this processes the training and test data together, so that a category present in
#only one of the two sets can be handled in a way that keeps the output columns consistent

#deletes the original column from master dataframe and
#replaces with onehot encodings
#with columns named after column_ + text classifications
#missing data replaced with the category '_missing', giving a column named column + '__missing'
#any categories missing from the training set removed from test set
#any category present in training but missing from test set given a column of zeros for consistent formatting
#ensures order of all new columns consistent between both sets
#returns two transformed dataframe (mdf_train, mdf_test)

#if only have training but not test data handy, use same training data for both dataframe inputs


def _one_hot_frame(values, categories, column):
    """Return a float 0/1 indicator frame, one column per category, indexed like values."""
    encoded = {}
    for category in categories:
        #str() allows non-string categories to be used in the column name
        encoded[column + '_' + str(category)] = (values == category).values.astype(float)
    #plain arrays plus an explicit index keep the rows lined up with the
    #source dataframe whatever its index looks like
    return pd.DataFrame(encoded, index=values.index, columns=list(encoded))


def process_text_class(mdf_train, mdf_test, column):
    """One-hot encode a categorical column consistently for train and test.

    Missing entries are treated as their own category, '_missing'. One
    indicator column is created per category found in the training data,
    named column + '_' + category and ordered by sorted category. The test
    data receives exactly the same indicator columns in the same order:
    a category absent from the test data yields a column of zeros, and a
    test row whose category never occurs in the training data gets zeros
    in every indicator column.

    Parameters:
        mdf_train: pandas dataframe holding the training data.
        mdf_test: pandas dataframe holding the test data. May be the same
            object as mdf_train when no separate test set is available.
        column: name of the column to encode.

    Returns:
        The tuple (mdf_train, mdf_test) of new dataframes in which the
        indicator columns come first, followed by the remaining original
        columns; the original column is removed. Row indexes are preserved
        and the dataframes passed in are not modified.
    """

    #replace NA with a dummy category, without writing back to the inputs
    train_values = mdf_train[column].fillna('_missing')
    test_values = mdf_test[column].fillna('_missing')

    #only categories seen in training define output columns
    #key=str keeps the sort well defined when the categories are not all strings
    categories = sorted(train_values.unique(), key=str)

    #encode both sets against the same category list so that the column
    #names, order and dtype are identical by construction
    train_encoded = _one_hot_frame(train_values, categories, column)
    test_encoded = _one_hot_frame(test_values, categories, column)

    #put the indicator columns in front of the remaining data and drop the original column
    mdf_train = pd.concat([train_encoded, mdf_train.drop(column, axis=1)], axis=1)
    mdf_test = pd.concat([test_encoded, mdf_test.drop(column, axis=1)], axis=1)

    return mdf_train, mdf_test
 

# Test Functions

In [39]:
#build small sample train and test dataframes for demonstration purposes
#each set deliberately contains missing entries (None) and a non-numeric 'number' value ('Q')

#training set: one dictionary per record, converted to a pandas dataframe
train = pd.DataFrame([
    dict(zip(('number', 'Y/N', 'shape', 'label'), record))
    for record in (
        (1, 'Y', 'circle', 'cat'),
        (2, 'N', 'square', 'dog'),
        (None, 'Y', 'circle', 'cat'),
        (3.1, None, 'square', 'cat'),
        (-1, 'N', None, 'dog'),
        ('Q', 'N', 'oval', 'dog'),
    )
])

#test set: same features without the label, and with a shape ('octogon')
#that does not occur in the training set
test = pd.DataFrame([
    dict(zip(('number', 'Y/N', 'shape'), record))
    for record in (
        (2.1, 'N', 'square'),
        (-1, 'N', None),
        (1, 'Y', 'circle'),
        (None, 'Y', 'square'),
        (3, None, 'circle'),
        (0, 'N', 'octogon'),
        ('Q', 'Y', 'square'),
    )
])


In [40]:
#separate the labels from the train data: 'label' is removed from train and kept in labels
labels = train.pop('label')

#display the remaining training features
train

Unnamed: 0,Y/N,number,shape
0,Y,1,circle
1,N,2,square
2,Y,,circle
3,,3.1,square
4,N,-1,
5,N,Q,oval


In [41]:
#test process_numerical_class()
#normalizes the 'number' column of both frames, then displays the training frame
train, test = process_numerical_class(train, test, 'number')
train

Unnamed: 0,Y/N,number,shape
0,Y,-0.20376,circle
1,N,0.537184,square
2,Y,0.0,circle
3,,1.352223,square
4,N,-1.685648,
5,N,0.0,oval


In [42]:
#test process_binary_class()
#missing 'Y/N' entries are assigned to 'Y'; train and test are each encoded by a separate call
train = process_binary_class(train, 'Y/N', 'Y')
test = process_binary_class(test, 'Y/N', 'Y')
train


Unnamed: 0,Y/N,number,shape
0,1,-0.20376,circle
1,0,0.537184,square
2,1,0.0,circle
3,1,1.352223,square
4,0,-1.685648,
5,0,0.0,oval


In [43]:
#test process_text_class():
#replaces the 'shape' column of both frames with onehot columns, then displays the training frame
train, test = process_text_class(train, test, 'shape')
train


Unnamed: 0,shape__missing,shape_circle,shape_oval,shape_square,Y/N,number
0,0.0,1.0,0.0,0.0,1,-0.20376
1,0.0,0.0,0.0,1.0,0,0.537184
2,0.0,1.0,0.0,0.0,1,0.0
3,0.0,0.0,0.0,1.0,1,1.352223
4,1.0,0.0,0.0,0.0,0,-1.685648
5,0.0,0.0,1.0,0.0,0,0.0


In [44]:
#display the transformed test data for comparison with the training frame above
test

Unnamed: 0,shape__missing,shape_circle,shape_oval,shape_square,Y/N,number
0,0.0,0.0,0,1.0,0,0.611279
1,1.0,0.0,0,0.0,0,-1.685648
2,0.0,1.0,0,0.0,1,-0.20376
3,0.0,0.0,0,1.0,1,0.0
4,0.0,1.0,0,0.0,1,1.278128
5,0.0,0.0,0,0.0,0,-0.944704
6,0.0,0.0,0,1.0,1,0.0


Now that we have defined our functions for preprocessing structured data, we'll begin a new notebook that incorporates these functions and applies them to the processing and training of the Kaggle house price regression data set. To be continued in the next notebook.