In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
# get pandas
import pandas as pd
# we'll do some visual checks, get the necessary packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
# pickle is used to load the saved model and the category encoders
import pickle

In [23]:
# first load the model
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only point this at model files that come from a trusted source.
# The context manager makes sure the file handle is closed again.
with open('../models/gbt_combo_reg_v1.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [2]:
# then load the dictionary for the transforms
# NOTE(review): same caveat as for any pickle - only load trusted files.
# The context manager makes sure the file handle is closed again.
with open('../models/gbt_combo_dic_v1.pkl', 'rb') as dic_file:
    cat_dic = pickle.load(dic_file)

In [3]:
# read the test data that is going to be scored
df_raw = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [5]:
# wrapper function to perform some simple cleanup
def clean_df(df):
    """Drop sparsely populated columns and back-fill the garage build year.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the columns whose share of missing
        values is below 10%. When both ``GarageYrBlt`` and ``YearBuilt``
        survive that filter, missing ``GarageYrBlt`` entries are replaced by
        the ``YearBuilt`` value of the same row.
    """
    # keep only the columns with fewer than 10% missing values
    # NOTE(review): the filter is evaluated on the frame passed in, so the
    # surviving columns may differ from the ones the model was fitted on -
    # confirm against the training notebook.
    kept_columns = df.columns[df.isnull().mean() < 0.1]
    # explicit copy: the column subset would otherwise be treated as a slice
    # of the caller's frame and the assignment below would raise
    # SettingWithCopyWarning
    cleaned = df[kept_columns].copy()
    # looks like Garage built goes with Year built, replace NA using that;
    # skipped when either column is absent (e.g. removed by the filter above)
    if {'GarageYrBlt', 'YearBuilt'}.issubset(cleaned.columns):
        cleaned['GarageYrBlt'] = cleaned['GarageYrBlt'].fillna(cleaned['YearBuilt'])
    return cleaned

In [13]:
def cure_nas(df):
    """Fill missing values column by column and return the filled frame.

    Rules, applied in this order:

    * ``TotalBsmtSF`` nulls take the row's ``1stFlrSF`` value.
    * ``GarageArea`` nulls take half of the row's ``1stFlrSF`` value.
    * Remaining nulls in ``int64``/``float64`` columns become 0.
    * Nulls in ``object`` columns become that column's most frequent value.
      A column with no non-null value at all is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is not modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        The filled copy.
    """
    # work on a copy so the caller's frame stays as it was
    df = df.copy()
    # get list of categorical variables
    cat_cols = list(df.select_dtypes(include=['object']).columns)
    # get list of non-cat variables
    var_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
    # fill na in special cases with a rule; each rule is skipped when one of
    # the columns it needs is not in the frame
    if {'TotalBsmtSF', '1stFlrSF'}.issubset(df.columns):
        # use 1stFlrSF for bsmt when null
        df['TotalBsmtSF'] = df['TotalBsmtSF'].fillna(df['1stFlrSF'])
    if {'GarageArea', '1stFlrSF'}.issubset(df.columns):
        # similar for garage but add factor 1/2
        df['GarageArea'] = df['GarageArea'].fillna(df['1stFlrSF'] * 0.5)
    # fill rest of var_cols with 0
    if var_cols:
        df[var_cols] = df[var_cols].fillna(0)
    # fill na for cats with most frequent
    # NOTE(review): the most frequent value is taken from the frame passed in,
    # not from the training data - confirm this is intended.
    for col in cat_cols:
        counts = df[col].value_counts()
        # an all-null column has no most frequent value; leave it alone
        # instead of failing on an empty index
        if not counts.empty:
            df[col] = df[col].fillna(counts.index[0])
    return df

In [16]:
# run both cleaning stages back to back, then show summary statistics
df = cure_nas(clean_df(df_raw))
df.describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.378341,9819.161069,6.078821,5.553804,1971.357779,1983.662783,99.673749,438.902673,52.583276,...,472.767649,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,421.321334,42.74688,4955.517327,1.436812,1.11374,30.390071,21.130467,177.001792,455.257119,176.698671,...,216.97417,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,1461.0,20.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.0,0.0,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,11517.5,7.0,6.0,2001.0,2004.0,162.0,752.0,0.0,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,1526.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [17]:
# sanity check: how many nulls are left per column after cleaning
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
# the ten columns with the most missing entries
missing_data.head(10)

Unnamed: 0,Total,Percent
SaleCondition,0,0.0
YearRemodAdd,0,0.0
RoofMatl,0,0.0
Exterior1st,0,0.0
Exterior2nd,0,0.0
MasVnrType,0,0.0
MasVnrArea,0,0.0
ExterQual,0,0.0
ExterCond,0,0.0
Foundation,0,0.0


In [18]:
# now prep categorical variables
def cat_handler_df(df, cat_dic):
    """Encode every ``object`` column with its entry from ``cat_dic``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``object`` columns are to be encoded. It is not modified;
        the function works on a copy.
    cat_dic : dict
        Maps a column name to an object with a ``transform`` method that is
        called with the whole column.
        NOTE(review): presumably fitted scikit-learn LabelEncoders from the
        training notebook - confirm. If so, labels unseen during fitting will
        make ``transform`` raise.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with each ``object`` column replaced by the output of
        its encoder. Only the frame is returned.

    Raises
    ------
    KeyError
        If an ``object`` column has no entry in ``cat_dic``.
    """
    # work on a copy so the caller's frame stays as it was
    df = df.copy()
    # get list of categorical variables
    cat_cols = list(df.select_dtypes(include=['object']).columns)
    # fail early and name every column that has no encoder
    missing = [col for col in cat_cols if col not in cat_dic]
    if missing:
        raise KeyError('no encoder in cat_dic for columns: {}'.format(missing))
    # Encoding the variables
    for col in cat_cols:
        df[col] = cat_dic[col].transform(df[col])
    return df

In [20]:
# encode the categorical columns with the encoders loaded earlier
df = cat_handler_df(df=df, cat_dic=cat_dic)

In [21]:
# prep scikit
# Everything except the identifier is a model input. The prediction columns
# that later cells append to df are excluded as well, so re-running this cell
# after scoring does not feed the predictions back in as features.
# NOTE(review): the model presumably expects the same columns in the same
# order as at training time; that is not checked here - confirm against the
# training notebook.
non_feature_cols = {'Id', 'log_SalePrice', 'SalePrice'}
input_features = [col for col in df.columns if col not in non_feature_cols]
X = df[input_features].values

In [24]:
# score the feature matrix X with the unpickled model; the result is stored
# as log_SalePrice and back-transformed with exp(...) - 1 in the cells below
y = model.predict(X)

In [25]:
# store output: attach the raw model predictions to the frame as a new
# column (a later cell inverts the log scale with exp(...) - 1)
df['log_SalePrice'] = y

In [26]:
# undo the log transform to get back to the price itself
df['SalePrice'] = np.exp(df['log_SalePrice']).sub(1)

In [27]:
# the log-scale column is no longer needed once SalePrice exists
df = df.drop(columns=['log_SalePrice'])

In [28]:
# peek at the first five rows, including the new SalePrice column
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1461,20,2,11622,1,3,3,0,4,0,...,0,0,120,0,0,6,2010,8,4,122665.92859
1,1462,20,3,14267,1,0,3,0,0,0,...,0,0,0,0,12500,6,2010,8,4,153216.144029
2,1463,60,3,13830,1,0,3,0,4,0,...,0,0,0,0,0,3,2010,8,4,178963.489183
3,1464,60,3,9978,1,0,3,0,4,0,...,0,0,0,0,0,6,2010,8,4,188727.656278
4,1465,120,3,5005,1,0,1,0,4,0,...,0,0,144,0,0,1,2010,8,4,193942.886296


In [29]:
# write the submission file: only the Id and the predicted SalePrice
df.to_csv('../data/combo_submission_180225.csv', columns=['Id', 'SalePrice'], index=False)