In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabaz_score

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, mean_squared_log_error, make_scorer
from sklearn.pipeline import Pipeline

pd.set_option('display.max_columns', None)

# This folder contains the final model:
# a gradient boosting regressor that uses vector quantization features.

In [2]:
# Import dataset: read train.csv (path relative to the working directory) into `df`.
# Later cells drop the 'SalePrice' column from `df` before feeding it to the pipeline.
df = pd.read_csv("train.csv")

In [3]:
# Feature groups: each list names the columns that describe one aspect of a house.
lot = ["LotFrontage", "LotArea"]

major = ["OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd"]

basement = ["BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF"]

floors = ["1stFlrSF", "2ndFlrSF", "GrLivArea"]

rooms = ["BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces"]

bathrooms = ["BsmtFullBath", "FullBath", "HalfBath"]

garage = ["GarageYrBlt", "GarageCars"]

outdoors = [
    "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "ScreenPorch", "PoolArea", "MasVnrArea",
]

year = ["GarageYrBlt", "YearBuilt", "YearRemodAdd"]

# Catch-all group built by chaining lot, major, garage and year (a NumPy string array).
# NOTE: every column in `year` is already listed in `major` or `garage`, so
# GarageYrBlt, YearBuilt and YearRemodAdd each appear twice in `therest`.
therest = np.concatenate((lot, major, garage, year))


In [4]:
# Make a pipeline to prepare data 
from sklearn.base import BaseEstimator, TransformerMixin
class FillNanValues(BaseEstimator, TransformerMixin):
    """
    Gets a DataFrame describing houses and fills the NaNs with meaningful replacements.

    fit learns the medians of LotFrontage and MasVnrArea; transform fills those
    two numeric columns with the learned medians and fills the categorical
    columns listed in CATEGORICAL_FILLS with fixed labels.

    transform returns a new DataFrame and leaves its input untouched.  (The
    previous implementation filled each column with ``inplace=True``, which
    modified the caller's frame and relied on chained in-place assignment.)
    """

    # Column name -> label substituted for a missing value.
    CATEGORICAL_FILLS = {
        "MiscFeature": "NoShed",
        "Fence": "NoFence",
        "BsmtQual": "NoBasement",
        "BsmtCond": "NoBasement",
        "BsmtExposure": "NoBasement",
        "BsmtFinType1": "NoBasement",
        "BsmtFinType2": "NoBasement",
        "MasVnrType": "None",
        "Electrical": "SBrkr",
        "FireplaceQu": "NoFP",
        "GarageType": "Attchd",
        "GarageFinish": "Unf",
        "GarageQual": "TA",
        "GarageCond": "TA",
        "PoolQC": "NoPool",
    }

    def __init__(self): pass

    def fit(self, X, y=None):
        """Remember the medians of LotFrontage and MasVnrArea found in X; returns self."""
        self.LotFrontageMedian = X.LotFrontage.median()
        self.MasVnrAreaMedian = X.MasVnrArea.median()
        return self

    def transform(self, X):
        """Return a copy of X with the NaNs of the handled columns replaced."""
        fill_values = dict(self.CATEGORICAL_FILLS)
        fill_values["LotFrontage"] = self.LotFrontageMedian
        fill_values["MasVnrArea"] = self.MasVnrAreaMedian
        # DataFrame.fillna with a per-column dict builds a new frame in one pass.
        return X.fillna(value=fill_values)

class RemoveColumns(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that discards columns judged irrelevant, repetitive, or corrupt.
    The list of columns is based on the EDA found in ML_Final_EDA.ipynb
    """
    def __init__(self): pass

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        unwanted = [
            'GarageArea', 'BsmtHalfBath', 'MiscVal', 'MoSold',
            'YrSold', 'BsmtFinSF2', 'LowQualFinSF', '3SsnPorch',
            'Utilities', 'Exterior2nd', 'Alley', 'Id',
        ]
        # drop() hands back a new frame without the listed columns.
        reduced = X.drop(unwanted, axis=1)
        return reduced

class AlterAttributes(BaseEstimator, TransformerMixin):
    """
    Get a DataFrame describing houses and alter some attributes
    Decisions based on EDA found in ML_Final_EDA.ipynb

    Changes applied by transform:
      * GarageCars: the value 4 is merged into 3.
      * GarageYrBlt: NaNs are replaced by the median learned in fit.
      * KitchenAbvGr: 1 becomes 0, 2 and 3 become 1 (binary indicator).
      * PoolArea: any positive area becomes 1.

    transform works on a copy, so the caller's DataFrame is not modified.
    """
    def __init__(self): pass

    def fit(self, X, y=None):
        """Learn the GarageYrBlt median from X so every later transform uses the same value."""
        self.GarageYrBltMedian = X["GarageYrBlt"].median()
        return self

    def transform(self, X):
        """Return an altered copy of X (see the class docstring for the rules)."""
        X = X.copy()

        # Prefer the median learned in fit; fall back to the median of the frame
        # being transformed when transform is called on an unfitted instance
        # (fit used to be a no-op, so older callers may skip it).
        garage_year_median = getattr(self, "GarageYrBltMedian", None)
        if garage_year_median is None:
            garage_year_median = X["GarageYrBlt"].median()

        #Combine GarageCars == 3 and GarageCars == 4
        X["GarageCars"] = X["GarageCars"].map(lambda n: 3 if n == 4 else n)

        #Fill Nans in GarageYrBlt with median year
        X["GarageYrBlt"] = X["GarageYrBlt"].fillna(garage_year_median)

        #Combine KitchenAbvGr 1 and 0 AND 2 and 3 such that it is binary
        X["KitchenAbvGr"] = X["KitchenAbvGr"].map(
            lambda n: 0 if n == 1 else (1 if n in (2, 3) else n)
        )

        #Make Pool Area binary (x>0 or x==0)--> 1 if they have pool else 0
        X["PoolArea"] = X["PoolArea"].map(lambda area: 1 if area > 0 else area)
        return X

class GetDummies(BaseEstimator, TransformerMixin):
    """
    Get a DataFrame describing houses and convert all categorical attributes to one-hot encoded columns

    fit one-hot encodes the training frame and keeps both the encoded frame
    (PriorData) and its column index (columns).  transform encodes X, adds any
    training column that X did not produce (filled with that column's training
    median), and fills remaining NaNs with the median of the column in X.

    NOTE: columns produced only by X (categories unseen during fit) are kept in
    the output; they are not removed here.
    """
    def __init__(self): pass

    def fit(self, X, y=None):
        """One-hot encode X and remember the resulting frame and columns; returns self."""
        self.PriorData = pd.get_dummies(pd.DataFrame(X))
        self.columns = self.PriorData.columns
        return self

    def transform(self, X):
        """Return the one-hot encoded version of X, completed as described on the class."""
        encoded = pd.get_dummies(X)

        # Add training-time columns that are absent here.  The median is
        # computed once per column and broadcast, instead of once per row.
        for col in self.columns:
            if col not in encoded.columns:
                encoded[col] = float(self.PriorData[col].median())

        # Fill leftover NaNs with the median of the column in the frame being
        # transformed; assignment is used rather than a chained in-place fillna.
        for col in encoded.columns[encoded.isnull().any()]:
            encoded[col] = encoded[col].fillna(encoded[col].median())

        # Every column is already numeric, so no second get_dummies pass is needed.
        return encoded

class ScaleData(BaseEstimator, TransformerMixin):
    """
    Get a DataFrame, center & scale it using the mean and standard deviation

    fit computes the per-column mean and standard deviation of the training
    DataFrame once and stores them in mean_ and std_; transform applies
    (X - mean_) / std_.  The training frame itself is still kept in PriorData
    for code that reads that attribute.

    Columns whose training standard deviation is 0 are divided by 1 instead,
    so a constant column is only centered and does not turn into NaN/inf.
    """
    def __init__(self): pass

    def fit(self, X, y=None):
        """Store the column means and standard deviations of the DataFrame X; returns self."""
        self.PriorData = X
        self.mean_ = X.mean()
        # Guard against division by zero for constant columns.
        self.std_ = X.std().replace(0, 1)
        return self

    def transform(self, X):
        """Return X centered and scaled with the statistics learned in fit."""
        return (X - self.mean_) / self.std_

class GetDataFrame(BaseEstimator, TransformerMixin):
    """
    Wrap whatever data object arrives in a Pandas DataFrame.
    The column index seen during fit is reused for every transform, and NaNs become 0.
    """
    def __init__(self): pass

    def fit(self, X, y=None):
        # Remember the column index of the fitted data.
        fitted_frame = pd.DataFrame(X)
        self.columns = fitted_frame.columns
        return self

    def transform(self, X):
        # Build the frame with the remembered columns, then zero-fill the gaps.
        framed = pd.DataFrame(X, columns=self.columns)
        return framed.fillna(0)

class VectorQuantization(BaseEstimator, TransformerMixin):
    """
    Creates cluster models of groups of related numerical attributes
    Adds these clusters as categorical attributes to the train and test data in transform
    Grouping, reasoning, and descriptions of clusters shown in ML_Final_VQ.ipynb

    One KMeans model is fitted per feature group (module-level lists basement,
    outdoors, rooms, floors and therest).  transform prepends one 0/1 indicator
    column per cluster (b1..b3, o1..o5, r1..r3, f1..f4, t1..t2) to X.  The
    indicator columns are always emitted in that fixed order, including for
    clusters that no row of X is assigned to.

    Parameters
    ----------
    random_state : optional
        Passed to every KMeans model.  The default (None) keeps the previous
        unseeded behaviour; pass an int to make the clusters reproducible.
    """
    def __init__(self, random_state=None):
        self.random_state = random_state

    @staticmethod
    def _groups():
        """Return (column prefix, model attribute name, feature columns, n_clusters) per group."""
        return [
            ("b", "basement_cm", basement, 3),
            ("o", "outdoors_cm", outdoors, 5),
            ("r", "rooms_cm", rooms, 3),
            ("f", "floors_cm", floors, 4),
            ("t", "therest_cm", therest, 2),
        ]

    def fit(self, X, y=None):
        """Fit one KMeans model per feature group on X; returns self."""
        self.cols = []
        for prefix, attr, features, n_clusters in self._groups():
            model = KMeans(n_clusters=n_clusters, random_state=self.random_state)
            setattr(self, attr, model.fit(X[features]))
            self.cols.extend(prefix + str(i + 1) for i in range(n_clusters))
        return self

    def transform(self, X):
        """Return X with the cluster indicator columns of every group prepended."""
        parts = []
        for prefix, attr, features, n_clusters in self._groups():
            labels = getattr(self, attr).predict(X[features])
            names = [prefix + str(i + 1) for i in range(n_clusters)]
            # Row i of the identity matrix is the one-hot code of cluster i, so
            # indexing by the labels yields a full-width indicator block even
            # when some cluster is never predicted.
            indicators = np.eye(n_clusters, dtype=int)[labels]
            parts.append(pd.DataFrame(indicators, columns=names, index=X.index))
        parts.append(X)
        return pd.concat(parts, axis=1)
        

# Data-preparation pipeline: the steps run in the order listed.
Prep = Pipeline(steps=[
    ('fillnanvalues',   FillNanValues()),       # replace NaNs
    ('removecolumns',   RemoveColumns()),       # drop unused columns
    ('alterattributes', AlterAttributes()),     # recode selected attributes
    ('getdummies',      GetDummies()),          # one-hot encode categoricals
    ('scaledata',       ScaleData()),           # center and scale
    ('getdataframe',    GetDataFrame()),        # back to a DataFrame, NaN -> 0
    ('vq',              VectorQuantization()),  # add cluster indicator columns
])

In [5]:
# Test pipeline fit: fit on the first 1000 rows and check that no NaNs survive.
# The pipeline is fitted once and the result reused, so the NaN count and the
# displayed rows come from the same fit (fitting twice also doubled the work).
train_features = Prep.fit_transform(df.drop(['SalePrice'], axis=1)[0:1000])
print("Number of Nans: %d" % train_features.isnull().sum().sum())
train_features.head()

Number of Nans: 0


Unnamed: 0,b1,b2,b3,o1,o2,o3,o4,o5,r1,r2,r3,f1,f2,f3,f4,t1,t2,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,PoolArea,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywo
od,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Ex,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_NoBasement,BsmtQual_TA,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_NoBasement,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_NoBasement,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBasement,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NoBasement,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasA,Heating_GasW,Heating_Grav,Heating_Wall,HeatingQC_Ex,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_N,CentralAir_Y,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj1,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFP,FireplaceQu_Po,FireplaceQu_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_Fin,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PoolQC_Ex,PoolQC_Fa,PoolQC_NoPool,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_NoFence,MiscFeature_Gar2,MiscFeature_NoShed,MiscFeature_Othr,MiscFeature_Shed,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_Adj
Land,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0.073835,-0.235233,-0.196376,0.632583,-0.529353,1.04529,0.868153,0.463803,0.589487,-0.939202,-0.486584,-0.80208,1.157694,0.389437,1.086206,0.787209,1.229838,0.178127,-0.219476,0.935421,-0.938659,1.00523,0.320923,-0.782629,0.196077,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,-0.718981,-0.18464,-0.054827,0.777516,-0.201351,-0.19315,-0.163301,0.335014,-0.477779,-0.263532,-0.178773,-0.044744,0.629485,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,2.981961,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,0.399916,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,-0.993521,-0.077654,-0.089758,1.484225,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,1.353155,-0.408282,-0.139099,-0.110153,1.457009,-1.198993,-0.290713,-0.206664,-0.100454,1.42096,-1.271292,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,-0.878793,1.11171,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,1.14608,-0.156734,-0.871667,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,-0.418226,-0.327586,-0.286651,0.728603,-0.160048,-0.428075,-0.325712,1.563914,-0.243439,-0.156734,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,0.993521,-0.16964,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,1.23957,-1.011567,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.163301,-0.
575523,1.044489,-0.123342,-0.526177,-0.063341,0.72539,-0.09525,-0.252519,-0.077654,-0.601694,-0.56321,1.575384,-0.945011,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243
1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,-0.872768,0.475727,-0.095611,-0.090369,2.176029,0.145528,-0.454579,-0.575947,1.204322,-0.637972,0.475625,0.279964,-0.7926,-0.485057,-0.819419,0.787209,-0.773156,0.178127,-0.219476,-0.307663,0.600126,-0.11552,0.320923,1.613018,-0.701133,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,-0.718981,-0.18464,-0.054827,0.777516,-0.201351,-0.19315,-0.163301,0.335014,-0.477779,-0.263532,5.588095,-0.044744,-1.587012,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,11.12996,-0.18464,4.18342,-2.498025,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,1.005515,-0.077654,-0.089758,-0.673079,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,2.398124,-0.294736,-0.131441,-0.738275,-0.408282,-0.139099,-0.110153,-0.685651,0.833199,-0.290713,-0.206664,-0.100454,-0.703046,0.785815,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,1.136787,-0.898616,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,1.14608,-0.156734,-0.871667,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,-0.418226,3.049585,-0.286651,-1.371117,-0.160048,2.333701,-0.325712,-0.638782,-0.243439,-0.156734,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,0.993521,-0.16964,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,-0.805925,0.987577,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.1
63301,-0.575523,-0.956448,-0.123342,1.898601,-0.063341,0.72539,-0.09525,-0.252519,-0.077654,-0.601694,-0.56321,1.575384,-0.945011,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0.073835,-0.093041,0.048965,0.632583,-0.529353,0.978641,0.819163,0.283438,0.092194,-0.300774,-0.334906,-0.631511,1.185098,0.537789,1.086206,0.787209,1.229838,0.178127,-0.219476,-0.307663,0.600126,0.922212,0.320923,-0.782629,-0.083382,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,1.389467,-0.18464,-0.054827,-1.284861,-0.201351,-0.19315,-0.163301,0.335014,-0.477779,-0.263532,-0.178773,-0.044744,0.629485,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,2.981961,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,0.399916,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,-0.993521,-0.077654,-0.089758,1.484225,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,1.353155,-0.408282,-0.139099,-0.110153,1.457009,-1.198993,-0.290713,-0.206664,-0.100454,1.42096,-1.271292,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,-0.878793,1.11171,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,1.14608,-0.156734,-0.871667,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,-0.418226,-0.327586,3.485073,-1.371117,-0.160048,-0.428075,-0.325712,1.563914,-0.243439,-0.156734,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,0.993521,-0.16964,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,1.23957,-1.011567,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.163301,
-0.575523,-0.956448,-0.123342,1.898601,-0.063341,0.72539,-0.09525,-0.252519,-0.077654,-0.601694,-0.56321,1.575384,-0.945011,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243
3,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0.310486,-0.47222,-0.099992,0.632583,-0.529353,-1.887267,-0.748519,-0.575947,-0.518121,-0.062487,-0.723581,-0.522241,0.933889,0.403101,1.086206,-1.026637,-0.773156,0.178127,-0.219476,0.313879,0.600126,0.797684,1.675026,-0.782629,-0.18634,4.128523,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,1.389467,-0.18464,-0.054827,-1.284861,-0.201351,-0.19315,-0.163301,0.335014,2.090925,-0.263532,-0.178773,-0.044744,-1.587012,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,5.683397,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,0.399916,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,-0.993521,-0.077654,-0.089758,1.484225,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,-0.738275,2.446838,-0.139099,-0.110153,-0.685651,0.833199,-0.290713,-0.206664,-0.100454,-0.703046,0.785815,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,3.032306,-0.878793,-0.898616,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,-0.871667,-0.156734,1.14608,-0.181727,4.403266,-0.156734,-0.044744,-2.887464,-0.418226,-0.327586,-0.286651,0.728603,-0.160048,2.333701,-0.325712,-0.638782,-0.243439,-0.156734,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,-1.005515,-0.16964,2.1777,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,1.23957,-1.011567,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.1633
01,1.735813,-0.956448,-0.123342,-0.526177,-0.063341,-1.37719,-0.09525,-0.252519,-0.077654,1.660313,-0.56321,-0.634131,1.057131,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,3.700294,-0.063341,-0.100454,-0.119099,-2.084046,-0.316243
4,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0.073835,0.665317,0.312707,1.355536,-0.529353,0.945316,0.721183,1.28075,0.474205,-0.174887,0.198338,-0.031856,1.612153,1.342011,1.086206,0.787209,1.229838,1.40659,-0.219476,1.556963,0.600126,0.880702,1.675026,0.760875,0.534369,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,1.389467,-0.18464,-0.054827,-1.284861,-0.201351,-0.19315,-0.163301,0.335014,-0.477779,-0.263532,5.588095,-0.044744,-1.587012,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,6.241875,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,0.399916,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,-0.993521,-0.077654,-0.089758,1.484225,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,1.353155,-0.408282,-0.139099,-0.110153,1.457009,-1.198993,-0.290713,-0.206664,-0.100454,1.42096,-1.271292,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,-0.878793,1.11171,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,1.14608,-0.156734,-0.871667,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,2.38866,-0.327586,-0.286651,-1.371117,-0.160048,-0.428075,-0.325712,1.563914,-0.243439,-0.156734,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,0.993521,-0.16964,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,1.23957,-1.011567,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.163301,-0.57552
3,-0.956448,-0.123342,1.898601,-0.063341,0.72539,-0.09525,-0.252519,-0.077654,-0.601694,-0.56321,1.575384,-0.945011,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243


In [6]:
# Test pipeline transform: apply the pipeline fitted in the previous cell to the
# remaining rows (1000 onward) and check that no NaNs survive.
# The transform runs once and its result is reused for both the count and the preview.
holdout_features = Prep.transform(df.drop(['SalePrice'], axis=1)[1000:])
print("Number of Nans: %d" % holdout_features.isnull().sum().sum())
holdout_features.head()

Number of Nans: 0


Unnamed: 0,b1,b2,b3,o1,o2,o3,o4,o5,r1,r2,r3,f1,f2,f3,f4,t1,t2,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,PoolArea,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_MetalSd,Exterior1st_Plywo
od,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Ex,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_NoBasement,BsmtQual_TA,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_NoBasement,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_NoBasement,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_NoBasement,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NoBasement,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_GasA,Heating_GasW,Heating_Grav,Heating_Wall,HeatingQC_Ex,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_N,CentralAir_Y,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj1,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFP,FireplaceQu_Po,FireplaceQu_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageFinish_Fin,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,PoolQC_Ex,PoolQC_Fa,PoolQC_NoPool,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_NoFence,MiscFeature_Gar2,MiscFeature_NoShed,MiscFeature_Othr,MiscFeature_Shed,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_Adj
Land,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1000,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,-0.872768,0.191343,-0.042512,-2.259227,-2.332941,-0.65426,-1.630341,-0.575947,-1.006372,-1.276401,-2.51528,-0.567548,-0.7926,-1.105792,-0.819419,-1.026637,-0.773156,-1.050336,-0.219476,-1.550748,-0.938659,-0.945706,0.320923,-0.782629,-0.701133,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,-0.718981,-0.18464,-0.054827,0.777516,-0.201351,-0.19315,-0.163301,0.335014,2.090925,-0.263532,-0.178773,-0.044744,-1.587012,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,-0.175775,3.822352,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,0.399916,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,1.005515,-0.077654,-0.089758,-0.673079,-0.163301,-0.204022,9.477298,-1.915522,-0.089758,-0.482507,-0.077654,-7.181918,-0.031623,-0.031623,10.488136,-0.054827,-0.070853,-0.114709,31.591154,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,-0.738275,-0.408282,-0.139099,-0.110153,-0.685651,0.833199,-0.290713,-0.206664,-0.100454,-0.703046,0.785815,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,-0.878793,-0.898616,8.387976,-0.063341,-0.031623,-0.310475,-0.156734,-0.871667,6.373853,-0.871667,-0.181727,-0.226877,6.373853,-0.044744,-2.887464,-0.418226,-0.327586,-0.286651,-1.371117,6.241875,-0.428075,-0.325712,-0.638782,-0.243439,6.373853,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,6.241875,-0.178773,-2.45687,-7.382479,9.069234,-0.063341,-0.044744,-1.005515,5.888936,-0.458741,-0.031623,-0.644992,3.700294,-3.700294,-0.267842,7.181918,-0.054827,-0.031623,-3.178207,-0.280482,6.000083,-0.805925,-1.011567,-0.083918,-0.063341,6.66409,-0.149908,-0.110153,-0.031623,-3.700294
,-0.123342,-0.163301,-0.575523,1.044489,-0.123342,-0.526177,-0.063341,-1.37719,-0.09525,-0.252519,-0.077654,1.660313,-0.56321,-0.634131,1.057131,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,6.996499,-0.077654,-0.077654,-5.327603,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243
1001,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,-0.636117,-0.47222,-0.463623,-0.813322,0.372441,-1.720644,-1.728321,-0.575947,-1.006372,0.276959,-0.877629,-1.241827,-0.7926,-1.599647,-0.819419,-1.026637,-0.773156,-1.050336,-0.219476,-1.550748,-0.938659,-2.44004,-1.033181,-0.782629,-0.406966,1.1959,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,-0.718981,-0.18464,-0.054827,0.777516,-0.201351,-0.19315,-0.163301,0.335014,2.090925,-0.263532,-0.178773,-0.044744,-1.587012,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,3.412707,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,0.399916,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,1.005515,-0.077654,-0.089758,-0.673079,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,-0.738275,2.446838,-0.139099,-0.110153,-0.685651,0.833199,-0.290713,-0.206664,-0.100454,-0.703046,0.785815,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,3.032306,-0.878793,-0.898616,-0.119099,-0.063341,-0.031623,-0.310475,6.373853,-0.871667,-0.156734,-0.871667,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,-0.418226,-0.327586,-0.286651,0.728603,-0.160048,-0.428075,-0.325712,-0.638782,-0.243439,-0.156734,-0.312405,1.563914,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,0.993521,-0.16964,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,3.729805,-0.139099,-0.054827,-0.031623,-3.178207,3.561732,-0.166498,-0.805925,-1.011567,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123
342,-0.163301,-0.575523,1.044489,-0.123342,-0.526177,-0.063341,-1.37719,-0.09525,-0.252519,-0.077654,1.660313,-0.56321,-0.634131,1.057131,-0.054827,5.783535,-0.089758,-0.054827,-4.715247,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,3.921486,-0.146387,-3.344238,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,3.700294,-0.063341,-0.100454,-0.119099,-2.084046,-0.316243
1002,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,-0.872768,0.238741,0.110914,1.355536,-0.529353,1.145263,1.015123,-0.29479,-0.952122,2.20798,1.215056,1.111486,-0.7926,0.123965,-0.819419,0.787209,-0.773156,0.178127,-0.219476,0.313879,0.600126,1.129758,1.675026,0.374999,0.828536,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,1.389467,-0.18464,-0.054827,-1.284861,-0.201351,-0.19315,-0.163301,0.335014,-0.477779,-0.263532,-0.178773,-0.044744,0.629485,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,3.921486,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,-2.498025,-0.083918,-0.114709,-0.089758,7.600375,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,0.441076,-0.153356,-0.19315,-0.172732,-0.284605,-0.331313,-0.110153,1.005515,-0.077654,-0.089758,-0.673079,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,1.353155,-0.408282,-0.139099,-0.110153,1.457009,-1.198993,-0.290713,-0.206664,-0.100454,1.42096,-1.271292,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,-0.878793,1.11171,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,1.14608,-0.156734,-0.871667,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,-0.418226,-0.327586,-0.286651,0.728603,-0.160048,-0.428075,-0.325712,1.563914,-0.243439,-0.156734,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,0.993521,-0.16964,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,1.23957,-1.011567,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.163301,1
.735813,-0.956448,-0.123342,-0.526177,-0.063341,0.72539,-0.09525,-0.252519,-0.077654,-0.601694,-0.56321,1.575384,-0.945011,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243
1003,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0.783787,0.001754,0.070871,-0.813322,0.372441,0.145528,-0.454579,0.294048,-1.006372,2.500219,1.466273,1.39399,-0.7926,0.330877,-0.819419,0.787209,-0.773156,1.40659,4.551746,0.935421,-0.938659,-0.11552,0.320923,-0.782629,-0.701133,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,1.389467,-0.18464,-0.054827,-1.284861,-0.201351,-0.19315,-0.163301,0.335014,2.090925,-0.263532,-0.178773,-0.044744,-1.587012,0.241128,-0.214427,-0.10541,-0.10541,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,4.658921,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,4.18342,-2.498025,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,-9.069234,-0.031623,-0.044744,-0.044744,-2.264914,-0.153356,5.172137,-0.172732,-0.284605,-0.331313,-0.110153,1.005515,-0.077654,-0.089758,-0.673079,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,1.353155,-0.408282,-0.139099,-0.110153,1.457009,-1.198993,-0.290713,-0.206664,-0.100454,-0.703046,0.785815,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,1.136787,-0.898616,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,-0.871667,-0.156734,1.14608,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,-0.418226,-0.327586,-0.286651,0.728603,-0.160048,-0.428075,-0.325712,-0.638782,-0.243439,-0.156734,-0.312405,1.563914,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,-1.005515,5.888936,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,-0.805925,0.987577,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.1
63301,-0.575523,1.044489,-0.123342,-0.526177,-0.063341,-1.37719,-0.09525,-0.252519,-0.077654,1.660313,-0.56321,-0.634131,1.057131,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243
1004,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,1.493739,-1.277975,-0.657968,0.632583,-0.529353,1.111939,1.015123,-0.49107,-0.970206,1.713423,0.674703,0.924926,-0.7926,-0.012674,-0.819419,0.787209,-0.773156,-2.278798,-0.219476,0.313879,0.600126,1.088249,0.320923,0.471468,-0.701133,-0.352788,-0.271419,-0.044744,-0.089758,-0.229301,-0.10541,0.529272,-0.419874,-0.063341,0.063341,-0.718981,-0.18464,-0.054827,0.777516,-0.201351,-0.19315,-0.163301,0.335014,-0.477779,-0.263532,-0.178773,-0.044744,0.629485,0.241128,-0.214427,-0.10541,9.477298,-0.044744,-0.100454,-0.206664,-0.153356,-0.335014,-0.175775,-0.261357,-0.231705,-0.172732,-0.100454,-0.18464,-0.423162,-0.077654,-0.214427,-0.160048,-0.25475,-0.292729,-0.119099,-0.241128,-0.219476,-0.25475,-0.139099,-0.160048,-0.089758,-0.18464,-0.2388,0.399916,-0.083918,-0.114709,-0.089758,-0.131441,-0.044744,-0.063341,-0.031623,-0.077654,0.110153,-0.031623,-0.044744,-0.044744,-2.264914,-0.153356,-0.19315,-0.172732,3.510128,-0.331313,-0.110153,1.005515,-0.077654,-0.089758,-0.673079,-0.163301,-0.204022,-0.10541,0.521529,-0.089758,-0.482507,-0.077654,0.139099,-0.031623,-0.031623,-0.09525,-0.054827,-0.070853,-0.114709,-0.031623,-0.172732,-0.211866,-0.42644,-0.416576,-0.294736,-0.131441,1.353155,-0.408282,-0.139099,-0.110153,1.457009,-1.198993,-0.290713,-0.206664,-0.100454,1.42096,-1.271292,-0.044744,-0.149908,-0.338692,-0.031623,0.382939,-0.329452,-0.878793,1.11171,-0.119099,-0.063341,-0.031623,-0.310475,-0.156734,1.14608,-0.156734,-0.871667,-0.181727,-0.226877,-0.156734,-0.044744,0.345978,-0.418226,-0.327586,-0.286651,0.728603,-0.160048,-0.428075,-0.325712,1.563914,-0.243439,-0.156734,-0.312405,-0.638782,-0.131441,-0.163301,-0.110153,-0.178773,-0.160048,-0.178773,0.406615,0.13532,-0.110153,-0.063341,-0.044744,0.993521,-0.16964,-0.458741,-0.031623,-0.644992,-0.269979,0.269979,-0.267842,-0.139099,-0.054827,-0.031623,0.314328,-0.280482,-0.166498,1.23957,-1.011567,-0.083918,-0.063341,-0.149908,-0.149908,-0.110153,-0.031623,0.269979,-0.123342,-0.163
301,1.735813,-0.956448,-0.123342,-0.526177,-0.063341,0.72539,-0.09525,-0.252519,-0.077654,-0.601694,1.773761,-0.634131,-0.945011,-0.054827,-0.172732,-0.089758,-0.054827,0.211866,-0.044744,-0.142786,-0.077654,-0.077654,0.187514,-0.25475,-0.146387,0.298723,-0.031623,-0.031623,0.044744,-0.206664,-0.19315,-0.349589,-0.089758,0.490361,-0.031623,0.211866,-0.044744,-0.204022,-0.175775,-0.044744,-0.044744,-0.083918,-0.070853,-0.063341,-0.312405,-0.031623,0.403272,-0.269979,-0.063341,-0.100454,-0.119099,0.479356,-0.316243


In [10]:
# Grid-search the VQ pipeline over the number of boosting rounds and the
# learning rate, scoring each candidate with 3-fold mean squared log error.
GradB = Pipeline([
    ('fillnanvalues', FillNanValues()),
    ('removecolumns', RemoveColumns()),
    ('alterattributes', AlterAttributes()),
    ('getdummies', GetDummies()),
    ('scaledata', ScaleData()),
    ('getdataframe', GetDataFrame()),
    ('vq', VectorQuantization()),
    # NOTE(review): no random_state is passed, and subsample < 1 makes the
    # boosting stochastic, so scores will differ slightly between runs.
    ('bag', GradientBoostingRegressor(loss="huber", n_estimators=10000,
                                      learning_rate=0.01, subsample=0.7))
])

param_grid = [
    {
        'bag__n_estimators': [5000, 15000],
        'bag__learning_rate': [0.005, 0.01],
    }
]

# MSLE is an error (lower is better). make_scorer assumes greater_is_better=True
# by default, which would make GridSearchCV rank -- and refit -- the *worst*
# candidate as best_estimator_. Declaring the direction fixes the ranking.
grid_search2 = GridSearchCV(
    GradB, param_grid, cv=3,
    scoring=make_scorer(mean_squared_log_error, greater_is_better=False),
)
grid_search2.fit(df.drop(["SalePrice"], axis=1), df.SalePrice)

# The scorer now yields negated errors; flip the sign back so the listing shows
# plain MSLE. list(...) makes the pairs display under Python 3 as well.
list(zip(-grid_search2.cv_results_["mean_test_score"], grid_search2.cv_results_["params"]))

[(0.015508957574300937,
  {'bag__learning_rate': 0.005, 'bag__n_estimators': 5000}),
 (0.015314397324750441,
  {'bag__learning_rate': 0.005, 'bag__n_estimators': 15000}),
 (0.015185721327898272,
  {'bag__learning_rate': 0.01, 'bag__n_estimators': 5000}),
 (0.015908829112004856,
  {'bag__learning_rate': 0.01, 'bag__n_estimators': 15000})]

In [12]:
# Second sweep on the existing GradB pipeline: fewer boosting rounds and a
# smaller row subsample per tree.
param_grid = [
    {
        'bag__n_estimators': [3000, 5000, 8000],
        'bag__subsample': [0.5, 0.7],
    }
]

# MSLE is an error (lower is better). make_scorer assumes greater_is_better=True
# by default, which would make GridSearchCV rank -- and refit -- the *worst*
# candidate as best_estimator_. Declaring the direction fixes the ranking.
grid_search3 = GridSearchCV(
    GradB, param_grid, cv=3,
    scoring=make_scorer(mean_squared_log_error, greater_is_better=False),
)
grid_search3.fit(df.drop(["SalePrice"], axis=1), df.SalePrice)

# The scorer yields negated errors; flip the sign back so the listing shows
# plain MSLE. list(...) makes the pairs display under Python 3 as well.
list(zip(-grid_search3.cv_results_["mean_test_score"], grid_search3.cv_results_["params"]))

[(0.015004922601778773, {'bag__n_estimators': 3000, 'bag__subsample': 0.5}),
 (0.015481031870297698, {'bag__n_estimators': 3000, 'bag__subsample': 0.7}),
 (0.014808123055112418, {'bag__n_estimators': 5000, 'bag__subsample': 0.5}),
 (0.015158121147832514, {'bag__n_estimators': 5000, 'bag__subsample': 0.7}),
 (0.014842026574314115, {'bag__n_estimators': 8000, 'bag__subsample': 0.5}),
 (0.015445529997098369, {'bag__n_estimators': 8000, 'bag__subsample': 0.7})]

In [13]:
# Reference run: cross-validate a pipeline made of the `Prep` step (defined
# outside this cell) followed by a Huber-loss gradient boosting regressor, and
# report the per-fold MSLE together with its mean.
prior_booster = GradientBoostingRegressor(
    loss='huber',
    n_estimators=30000,
    learning_rate=0.003,
    subsample=0.4,
)
best_prior = Pipeline([('prep', Prep), ('bag', prior_booster)])

res = cross_val_score(
    best_prior,
    df.drop(["SalePrice"], axis=1),
    df["SalePrice"],
    scoring=make_scorer(mean_squared_log_error),
    cv=3,
)
(res, res.mean())

(array([ 0.01227163,  0.01775844,  0.01441157]), 0.014813881658951131)

In [14]:
# Finer sweep on the existing GradB pipeline around 5000 boosting rounds and a
# 0.5 row subsample.
param_grid = [
    {
        'bag__n_estimators': [4000, 5000, 6000],
        'bag__subsample': [0.4, 0.5, 0.6],
    }
]

# MSLE is an error (lower is better). make_scorer assumes greater_is_better=True
# by default, which would make GridSearchCV rank -- and refit -- the *worst*
# candidate as best_estimator_. Declaring the direction fixes the ranking.
grid_search3 = GridSearchCV(
    GradB, param_grid, cv=3,
    scoring=make_scorer(mean_squared_log_error, greater_is_better=False),
)
grid_search3.fit(df.drop(["SalePrice"], axis=1), df.SalePrice)

# The scorer yields negated errors; flip the sign back so the listing shows
# plain MSLE. list(...) makes the pairs display under Python 3 as well.
list(zip(-grid_search3.cv_results_["mean_test_score"], grid_search3.cv_results_["params"]))

[(0.015237653281931541, {'bag__n_estimators': 4000, 'bag__subsample': 0.4}),
 (0.015295136156809271, {'bag__n_estimators': 4000, 'bag__subsample': 0.5}),
 (0.015081905360496504, {'bag__n_estimators': 4000, 'bag__subsample': 0.6}),
 (0.015098785682537665, {'bag__n_estimators': 5000, 'bag__subsample': 0.4}),
 (0.015019082846065073, {'bag__n_estimators': 5000, 'bag__subsample': 0.5}),
 (0.015296365104728786, {'bag__n_estimators': 5000, 'bag__subsample': 0.6}),
 (0.015202639952146827, {'bag__n_estimators': 6000, 'bag__subsample': 0.4}),
 (0.014821759635148779, {'bag__n_estimators': 6000, 'bag__subsample': 0.5}),
 (0.015143890395072818, {'bag__n_estimators': 6000, 'bag__subsample': 0.6})]

In [16]:
# Rebuild the VQ pipeline with a 0.4 row subsample and grid-search very long
# boosting runs at a fixed learning rate of 0.005.
GradB = Pipeline([
    ('fillnanvalues', FillNanValues()),
    ('removecolumns', RemoveColumns()),
    ('alterattributes', AlterAttributes()),
    ('getdummies', GetDummies()),
    ('scaledata', ScaleData()),
    ('getdataframe', GetDataFrame()),
    ('vq', VectorQuantization()),
    ('bag', GradientBoostingRegressor(loss="huber", subsample=0.4))
])

param_grid = [
    {
        'bag__n_estimators': [30000, 40000],
        'bag__learning_rate': [0.005],
    }
]

# MSLE is an error (lower is better). make_scorer assumes greater_is_better=True
# by default, which would make GridSearchCV rank -- and refit -- the *worst*
# candidate as best_estimator_. Declaring the direction fixes the ranking.
grid_search2 = GridSearchCV(
    GradB, param_grid, cv=3,
    scoring=make_scorer(mean_squared_log_error, greater_is_better=False),
)
grid_search2.fit(df.drop(["SalePrice"], axis=1), df.SalePrice)

# The scorer yields negated errors; flip the sign back so the listing shows
# plain MSLE. list(...) makes the pairs display under Python 3 as well.
list(zip(-grid_search2.cv_results_["mean_test_score"], grid_search2.cv_results_["params"]))

[(0.015298638082986163,
  {'bag__learning_rate': 0.005, 'bag__n_estimators': 30000}),
 (0.01513798969914, {'bag__learning_rate': 0.005, 'bag__n_estimators': 40000})]

In [17]:
# Rebuild the VQ pipeline with a 0.4 row subsample and grid-search shorter
# boosting runs at a fixed learning rate of 0.01.
GradB = Pipeline([
    ('fillnanvalues', FillNanValues()),
    ('removecolumns', RemoveColumns()),
    ('alterattributes', AlterAttributes()),
    ('getdummies', GetDummies()),
    ('scaledata', ScaleData()),
    ('getdataframe', GetDataFrame()),
    ('vq', VectorQuantization()),
    ('bag', GradientBoostingRegressor(loss="huber", subsample=0.4))
])

param_grid = [
    {
        'bag__n_estimators': [6000, 5000],
        'bag__learning_rate': [0.01],
    }
]

# MSLE is an error (lower is better). make_scorer assumes greater_is_better=True
# by default, which would make GridSearchCV rank -- and refit -- the *worst*
# candidate as best_estimator_. Declaring the direction fixes the ranking.
grid_search2 = GridSearchCV(
    GradB, param_grid, cv=3,
    scoring=make_scorer(mean_squared_log_error, greater_is_better=False),
)
grid_search2.fit(df.drop(["SalePrice"], axis=1), df.SalePrice)

# The scorer yields negated errors; flip the sign back so the listing shows
# plain MSLE. list(...) makes the pairs display under Python 3 as well.
list(zip(-grid_search2.cv_results_["mean_test_score"], grid_search2.cv_results_["params"]))

[(0.014927500349581537,
  {'bag__learning_rate': 0.01, 'bag__n_estimators': 6000}),
 (0.014875054467291154,
  {'bag__learning_rate': 0.01, 'bag__n_estimators': 5000})]

# Try it on the test data and make a submission to Kaggle

In [23]:
# Fit the complete preprocessing + VQ + boosting pipeline on every row of `df`,
# then predict sale prices for the houses listed in test.csv.
final_booster = GradientBoostingRegressor(
    loss="huber",
    n_estimators=5000,
    learning_rate=0.01,
    subsample=0.5,
)
GradB = Pipeline([
    ('fillnanvalues', FillNanValues()),
    ('removecolumns', RemoveColumns()),
    ('alterattributes', AlterAttributes()),
    ('getdummies', GetDummies()),
    ('scaledata', ScaleData()),
    ('getdataframe', GetDataFrame()),
    ('vq', VectorQuantization()),
    ('bag', final_booster),
])

GradB.fit(df.drop(["SalePrice"], axis=1), df["SalePrice"])

df_test = pd.read_csv("test.csv")
test_predictions = GradB.predict(df_test)

# Summary statistics of the predicted prices as a quick sanity check.
pd.DataFrame(test_predictions).describe()

Unnamed: 0,0
count,1459.0
mean,178994.369959
std,76870.257315
min,29797.3381
25%,128703.06728
50%,159361.828896
75%,209153.641442
max,552710.57005


In [24]:
# Assemble the submission table column-wise: one integer Id column taken from
# the test set and one SalePrice column holding the model's predictions.
# Building it directly from named columns keeps Id as an integer throughout
# (no float round-trip via a transposed frame) and raises immediately if the
# number of predictions does not match the number of test rows.
df_test_predictions = pd.DataFrame(
    {'Id': df_test.Id.astype(int), 'SalePrice': test_predictions},
    columns=['Id', 'SalePrice'],
)
df_test_predictions.to_csv("Submission_4.csv", index=False)

In [25]:
# Preview only the first rows instead of rendering the whole prediction frame.
df_test_predictions.head(10)

Unnamed: 0,Id,SalePrice
0,1461,124775.032907
1,1462,160998.441022
2,1463,191384.275497
3,1464,195265.891045
4,1465,185887.363412
5,1466,176827.108412
6,1467,170561.388105
7,1468,168301.015634
8,1469,186598.901114
9,1470,131437.300694


In [26]:
# Retrain on the entire training dataset with more boosting rounds at a smaller
# learning rate, predict test.csv, and write the submission file.
GradB = Pipeline([
    ('fillnanvalues', FillNanValues()),
    ('removecolumns', RemoveColumns()),
    ('alterattributes', AlterAttributes()),
    ('getdummies', GetDummies()),
    ('scaledata', ScaleData()),
    ('getdataframe', GetDataFrame()),
    ('vq', VectorQuantization()),
    ('bag', GradientBoostingRegressor(loss="huber", n_estimators=15000,
                                      learning_rate=0.005, subsample=0.5))
])

GradB.fit(df.drop(['SalePrice'], axis=1), df.SalePrice)

df_test = pd.read_csv("test.csv")
test_predictions = GradB.predict(df_test)

# Build the submission column-wise so Id stays an integer throughout and a
# length mismatch between ids and predictions raises instead of padding NaNs.
df_test_predictions = pd.DataFrame(
    {'Id': df_test.Id.astype(int), 'SalePrice': test_predictions},
    columns=['Id', 'SalePrice'],
)

# NOTE(review): this writes to the same "Submission_4.csv" path as the earlier
# submission cell, so that file is overwritten by this model's predictions --
# confirm that is intended or pick a new file name.
df_test_predictions.to_csv("Submission_4.csv", index=False)

# Preview only the first rows instead of rendering the whole prediction frame.
df_test_predictions.head(10)

Unnamed: 0,Id,SalePrice
0,1461,126345.894856
1,1462,162078.950351
2,1463,192516.416455
3,1464,194003.475232
4,1465,181182.524853
5,1466,175102.195826
6,1467,170923.069909
7,1468,168364.628547
8,1469,183888.423042
9,1470,130344.707008
