## Example Workflow of Data Preprocessing

A clean data preprocessing workflow, reimplemented from the [Far0n/kaggle-homesite repository](https://github.com/Far0n/kaggle-homesite).

In [1]:
import numpy as np
import pandas as pd

In [2]:
# read the raw training and testing files
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# keep the output column aside before it is removed from the features
y_train = train['QuoteConversion_Flag']

# the id column is removed from both sets, the output column
# only exists in ( and is only removed from ) the training set
train = train.drop( columns = ['QuoteNumber', 'QuoteConversion_Flag'] )
test = test.drop( columns = 'QuoteNumber' )

# remember how many rows belong to the training set: once both sets are
# stacked for preprocessing, this is the position where they are split apart again
ntrain = len(train)

# stack training on top of testing, renumbering the rows from 0
train_test = pd.concat( [ train, test ], axis = 0, ignore_index = True )
train_test.shape

(434589, 297)

In [3]:
# dates: break the quote date into its calendar components.
# The components are read from a standalone datetime Series, so no
# temporary 'Date' column is needed; the raw 'Original_Quote_Date' column
# is dropped and the four new features are appended after the remaining columns
quote_date = pd.to_datetime( train_test['Original_Quote_Date'] )
train_test = train_test.drop( 'Original_Quote_Date', axis = 1 ).assign(
    Day     = quote_date.dt.day,
    Year    = quote_date.dt.year,
    Month   = quote_date.dt.month,
    Weekday = quote_date.dt.dayofweek
)

In [4]:
# some other preprocessing, including stripping the commas from numbers and converting them back to int.
# Note: the builtin int is used for the cast; np.int was only an alias of it and
# has been removed from NumPy ( deprecated in 1.20, removed in 1.24 )
train_test['Field10'] = train_test['Field10'].str.replace( ',', '', regex = False ).astype(int)

# cells holding a single space are empty cells, encode them as -1
for blank_col in ( 'PropertyField37', 'GeographicField63' ):
    train_test[blank_col] = train_test[blank_col].apply( lambda x: -1 if x == ' ' else x )

# fill in the remaining NA values as -1
train_test = train_test.fillna(-1)

In [5]:
# dropped because of too many distinct levels
high_levels = [ "PersonalField16", "PersonalField17", "PersonalField18", "PersonalField19" ]

# rebind the result instead of mutating in place, the same way
# every other column drop in this notebook is written
train_test = train_test.drop( high_levels, axis = 1 )

In [6]:
# Counting each row's NA values ( in other cases, count outliers ) makes intuitive sense,
# since if an applicant has left many fields blank (or uncodable),
# then he or she is probably less serious about taking / accepting the quote.
# Empty and NA cells were encoded as -1 above, so the count is the number of
# cells equal to -1 in each row.
# Side note : the comparison produces a boolean frame of the same shape, and
# .sum( axis = 1 ) adds it up across the columns, giving one count per row.
# This is vectorized, so no python function has to be called for every single row
train_test['NaNCount'] = ( train_test == -1 ).sum( axis = 1 )

In [7]:
# undo the concatenation: the first ntrain rows are the training set,
# everything after them is the testing set; .copy() gives each part
# its own independent frame
train = train_test.iloc[:ntrain].copy()
test = train_test.iloc[ntrain:].copy()

In [8]:
# preview the first six rows of the preprocessed training set
train.head( n = 6 )

Unnamed: 0,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,CoverageField1B,CoverageField2A,...,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64,Day,Year,Month,Weekday,NaNCount
0,B,23,0.9403,0.0006,965,1.02,N,17,23,17,...,18,-1,10,N,CA,16,2013,8,4,15
1,F,7,1.0006,0.004,548,1.2433,N,6,8,6,...,17,-1,20,N,NJ,22,2014,4,1,15
2,F,7,1.0006,0.004,548,1.2433,N,7,12,7,...,11,-1,8,N,NJ,25,2014,8,0,15
3,J,10,0.9769,0.0004,1165,1.2665,N,3,2,3,...,9,-1,21,N,TX,15,2013,4,0,17
4,E,23,0.9472,0.0006,1487,1.3045,N,8,13,8,...,11,-1,12,N,IL,25,2014,1,5,12
5,E,14,0.9472,0.0006,1487,1.3045,N,13,20,13,...,25,-1,14,N,IL,18,2014,1,5,14
