Define the following functions for processing different types of columns in structured datasets:

process_labels(mdf, column)

process_numerical_class(mdf, column)

process_binary_class(mdf, column, missing)

process_text_class(mdf, column)



In [166]:
#First we address imports. These are all lifted directly from Titanic tutorial
#also removed those titanic imports not used in our code
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn import datasets, svm

In [167]:
# Read the training CSV from the working directory into a pandas DataFrame.
df = pd.read_csv("train.csv")

# Show the first five rows as a quick look at the loaded data.
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [168]:
# Expanded "bare minimum" working set: keep only the columns that the
# preprocessing functions are exercised on, then preview the first five rows.
df_bm = df.loc[
    :,
    [
        'Neighborhood',
        'GrLivArea',
        'YearBuilt',
        'OverallQual',
        'CentralAir',
        'SalePrice',
        'FullBath',
        'HalfBath',
        'BsmtFullBath',
        'BsmtHalfBath',
        'TotalBsmtSF',
        'BsmtUnfSF',
        'Fireplaces',
        'MasVnrType',
        'RoofMatl',
    ],
]
df_bm.head(5)

Unnamed: 0,Neighborhood,GrLivArea,YearBuilt,OverallQual,CentralAir,SalePrice,FullBath,HalfBath,BsmtFullBath,BsmtHalfBath,TotalBsmtSF,BsmtUnfSF,Fireplaces,MasVnrType,RoofMatl
0,CollgCr,1710,2003,7,Y,208500,2,1,1,0,856,150,0,BrkFace,CompShg
1,Veenker,1262,1976,6,Y,181500,2,0,0,1,1262,284,1,,CompShg
2,CollgCr,1786,2001,7,Y,223500,2,1,1,0,920,434,1,BrkFace,CompShg
3,Crawfor,1717,1915,7,Y,140000,1,0,1,0,756,540,1,,CompShg
4,NoRidge,2198,2000,8,Y,250000,2,1,1,0,1145,490,1,BrkFace,CompShg


In [169]:
# function to process training data \
# by deleting rows where labeling data is missing

def process_labels(mdf, column):
    """Drop training rows whose label value is missing.

    Args:
        mdf: pandas DataFrame holding the training data.
        column: name (string) of the label column.

    Returns:
        A DataFrame containing only the rows of ``mdf`` where ``column``
        is not missing. The original index labels of the kept rows are
        preserved (the index is not reset).
    """
    label_columns = [column]

    # Only the label column is inspected; gaps in other columns are kept.
    return mdf.dropna(axis=0, subset=label_columns)


In [170]:
# function to process and normalize numerical columns \
# using the standard scaler approach, with the mean value filling in missing data


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

def process_numerical_class(mdf, column):
    """Impute and standardize a numerical column in place.

    Missing values are replaced with the mean of the observed values, then
    the column is rescaled to mean 0 and standard deviation 1 (population
    standard deviation, i.e. ``ddof=0``, the same convention scikit-learn's
    ``StandardScaler`` uses).

    Args:
        mdf: pandas DataFrame containing ``column``. The column is
            overwritten in ``mdf`` itself.
        column: name (string) of the column to transform. Every value must
            be a number or NaN (no text).

    Returns:
        ``mdf``, with ``column`` replaced by its standardized float values.

    Notes:
        * The computation is done on the column itself, so results stay
          aligned with the frame's index even when it is not the default
          0..n-1 range (for example after rows were dropped).
        * A column with a single distinct value cannot be scaled; it is
          only centered, which yields all zeros.
    """
    values = mdf[column].astype(float)

    # Fill gaps with the mean of the observed values (NaN is skipped by mean()).
    values = values.fillna(values.mean())

    center = values.mean()
    spread = values.std(ddof=0)

    # Constant (or empty / all-NaN) columns have no usable spread; dividing by
    # a zero or round-off-sized deviation would produce inf or garbage, so
    # fall back to a unit scale.
    if values.nunique() <= 1 or not spread > 0:
        spread = 1.0

    mdf[column] = (values - center) / spread

    return mdf

In [171]:
# function to encode binary text classification columns to 0/1

from sklearn import preprocessing

def process_binary_class(mdf, column, missing):
    """Encode a two-category column as 0/1 in place.

    Missing entries are first assigned to the category given by ``missing``.
    The two categories are then ordered by sorting their labels; the first
    label maps to 0 and the second to 1. If only one category is present the
    whole column becomes 0.

    Args:
        mdf: pandas DataFrame containing ``column``. On success the column is
            overwritten in ``mdf`` itself.
        column: name (string) of the column to encode.
        missing: category label to assign to missing entries. It must be one
            of the categories already present in the column.

    Returns:
        ``mdf``. On success ``column`` holds integer 0/1 values. If
        ``missing`` is not an existing category, or the column has more than
        two categories, an error message is printed and ``mdf`` is returned
        completely unchanged.
    """
    # The fill value must be one of the categories already in the data.
    if missing not in mdf[column].unique():
        print('ERROR: assignment for missing value in process_binary_class() call not found in existing data')
        return mdf

    # Work on a local copy so the frame is only modified once every check
    # has passed (an error must not leave the column half-processed).
    filled = mdf[column].fillna(missing)
    categories = sorted(filled.unique())

    if len(categories) > 2:
        print('ERROR: number of categories in column for process_binary_class() call >2')
        return mdf

    # First sorted label -> 0, the other (if any) -> 1.
    mdf[column] = (filled != categories[0]).astype(int)

    return mdf
   

In [172]:
# function to encode textual categories to sparse vectors \
# and append to dataframe

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from random import randint


def process_text_class(mdf, column):
    """Replace a text category column with one-hot encoded columns.

    Missing entries are treated as their own category, labelled
    ``'missing' + column``. An indicator column for that category is always
    produced, filled with zeros when nothing is missing, so that training and
    test data end up with the same set of columns. No real observation is
    overwritten to achieve this.

    Args:
        mdf: pandas DataFrame containing ``column``. It is not modified.
        column: name (string) of the text category column to encode.

    Returns:
        A new DataFrame with the one-hot columns first (one float 0.0/1.0
        column per category, named after the category and ordered by sorted
        label), followed by the remaining columns of ``mdf``. The original
        ``column`` is removed and the row index of ``mdf`` is preserved.

    Notes:
        Assumes every category that will occur in the test data is also
        present in the training data.
    """
    missing_label = 'missing' + column

    # Treat gaps as an explicit category of their own.
    filled = mdf[column].fillna(missing_label)

    # One indicator column per category, carrying the frame's own index so
    # the later concat lines rows up correctly.
    onehot = pd.get_dummies(filled, dtype=float)

    # Guarantee the missing-indicator column exists and keep a stable,
    # sorted column order so labels always match their data.
    ordered_labels = sorted(set(onehot.columns) | {missing_label})
    onehot = onehot.reindex(columns=ordered_labels, fill_value=0.0)

    # Encoded columns go in front of the remaining original columns.
    remainder = mdf.drop(columns=[column])
    return pd.concat([onehot, remainder], axis=1)

In [173]:
# process_labels() tests

#df_bm = process_labels(df_bm, 'SalePrice')
#df_bm.shape

#df = process_labels(df, 'Alley')
#df.shape


In [174]:
# process_numerical_class() tests

#df_bm = process_numerical_class(df_bm, 'GrLivArea')
#df_bm[:5]

In [175]:
# process_binary_class() tests

#error call tests:

#df_bm = process_binary_class(df_bm, 'CentralAir', 'Q')
#df_bm[:5]

#df_bm = process_binary_class(df_bm, 'Neighborhood', 'CollgCr')
#df_bm[:5]

#function test

#df_bm = process_binary_class(df_bm, 'CentralAir', 'Y')
#df_bm[:5]

In [176]:
# process_text_class() tests:

#df_bm = process_text_class(df_bm, 'Neighborhood')
#df_bm[:5]

#df_bm = process_text_class(df_bm, 'MasVnrType')
#df_bm[:5]



Now that we have defined our functions for preprocessing structured data, we'll begin a new notebook that incorporates these functions and applies them to the processing and training of the Kaggle house price regression data set. To be continued in the next notebook.