In [178]:
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns
import csv
import re
import sys, os, random

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures

sys.path.insert(0, "/Users/schwalmdaniel/github/xgboost/python-package")
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

%matplotlib inline

# reproducible results
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

pd.set_option('display.max_rows',1000)

def rmse_cv(model, train,y):
    n_folds = 5
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y, scoring="neg_mean_squared_log_error", cv = kf))
    return(rmse)

root = '/Users/schwalmdaniel/github/kaggle/house_prices_kaggle'
#root = 'd:/dev/python/kaggle/titanic'

train=pd.read_csv(root + "/train.csv")
test=pd.read_csv(root + "/test.csv")

# have a look at the ds
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [179]:
def convertToOrdinal(df,colname,mapping,targetDf,targetColname,dropOriginalCol = True):
    mostFrequent = df[colname].value_counts().index[0]
    df[colname].fillna(mostFrequent,inplace=True)
    targetDf[targetColname] = df[colname].apply(lambda x: mapping[x])
    if dropOriginalCol:
        targetDf = df.drop([colname],axis=1) 
        
five_scale = {'Ex': 5,'Gd': 4,'TA':3,'Fa':2,'Po':1}
LandSlope_scale= {'Gtl': 3,'Mod': 2,'Sev':1}
Functional_scale = {'Typ':8,'Min1':7,'Min2':6, 'Mod':5,'Maj1':4,'Maj2':3,'Sev':2,'Sal':1}

In [180]:



convertToOrdinal(train,'BsmtQual',five_scale,train,'BsmtQualOrdinal')
convertToOrdinal(test,'BsmtQual',five_scale,test,'BsmtQualOrdinal')
convertToOrdinal(train,'BsmtCond',five_scale,train,'BsmtCondOrdinal')
convertToOrdinal(test,'BsmtCond',five_scale,test,'BsmtCondOrdinal')

convertToOrdinal(train,'ExterQual',five_scale,train,'ExterQualOrdinal')
convertToOrdinal(test,'ExterQual',five_scale,test,'ExterQualOrdinal')
convertToOrdinal(train,'ExterCond',five_scale,train,'ExterCondOrdinal')
convertToOrdinal(test,'ExterCond',five_scale,test,'ExterCondOrdinal')

convertToOrdinal(train,'KitchenQual',five_scale,train,'KitchenQualOrdinal')
convertToOrdinal(test,'KitchenQual',five_scale,test,'KitchenQualOrdinal')

convertToOrdinal(train,'LandSlope',LandSlope_scale,train,'LandSlopeOrdinal')
convertToOrdinal(test,'LandSlope',LandSlope_scale,test,'LandSlopeOrdinal')

convertToOrdinal(train,'Functional',Functional_scale,train,'FunctionalOrdinal')
convertToOrdinal(test,'Functional',Functional_scale,test,'FunctionalOrdinal')



In [181]:
# to unify values to dummify in both train and test set, we concatenate them
train_objs_num = len(train)
dataset = pd.concat(objs=[train, test], axis=0)
dataset = pd.get_dummies(dataset, columns = ['MSSubClass', 'MSZoning', 'Street','Alley','LotShape','LandContour','Utilities',
                          'LotConfig','Neighborhood','Condition1','Condition2','BldgType','HouseStyle',
                         'RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType',
                         'Foundation','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating',
                         'HeatingQC','CentralAir','Electrical','FireplaceQu','GarageType',
                         'GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC',
                         'Fence','MiscFeature','MoSold','SaleType', 'SaleCondition'                 
                         #,'BsmtQual','BsmtCond','ExterQual','ExterCond','KitchenQual' ,'LandSlope','Functional'                     
                         ],  # which columns to dummify 
               prefix_sep='__')
train = dataset[:train_objs_num]
test = dataset[train_objs_num:]
#'BsmtQual'
train.shape


(1460, 299)

In [182]:
train['GarageYrBlt'].fillna(0,inplace=True)
test['GarageYrBlt'].fillna(0,inplace=True)
train['GarageYrBlt'].fillna(0,inplace=True)
test['GarageYrBlt'].fillna(0,inplace=True)
train['MasVnrArea'].fillna(0,inplace=True)
test['MasVnrArea'].fillna(0,inplace=True)
train['LotFrontage'].fillna(train['LotFrontage'].mean(),inplace=True)
test['LotFrontage'].fillna(train['LotFrontage'].mean(),inplace=True)
train['BsmtFinSF1'].fillna(train['BsmtFinSF1'].mean(),inplace=True)
test['BsmtFinSF1'].fillna(train['BsmtFinSF1'].mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [183]:
X = train.drop(['Id','SalePrice','BsmtQual','BsmtCond','ExterQual','ExterCond','KitchenQual',
                'LandSlope','Functional'],axis=1,errors='ignore') 
y = train['SalePrice']
X_test = test.drop(['Id','SalePrice','BsmtQual','BsmtCond', 'ExterQual','ExterCond','KitchenQual',
                'LandSlope','Functional'],axis=1,errors='ignore')

In [184]:
def _get_xgb_feat_importances(clf, feature_names):

        try:
            # Handles case when clf has been created by calling
            # xgb.XGBClassifier.fit() or xgb.XGBRegressor().fit()
            fscore = clf.get_booster().get_fscore()
        except:
            # Handles case when clf has been created by calling xgb.train.
            # Thus, clf is an instance of xgb.Booster.
            fscore = clf.get_fscore()

        trained_feature_names = feature_names #self._get_trained_feature_names()

        feat_importances = []
        
        #print(fscore.items())

        features = {}
        
        for k, v in fscore.items():
            features[k] = v
            
        return sorted(features.items(), key=lambda x: x[1])[-50:]

In [186]:
model = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,#2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

#pipeline = Pipeline([('imputer', Imputer(strategy='mean')), ('classify', model)])
pipeline = Pipeline([('classify', model)])
pipeline.fit(X,y)
score = rmse_cv(pipeline, X, y)
print("\pipeline score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

preds = pipeline.predict(X_test)

predicted = pd.DataFrame()
predicted['Id'] = test['Id']
predicted['SalePrice'] = preds
predicted[['Id', 'SalePrice']] = predicted[['Id', 'SalePrice']]
predicted.to_csv(root + '/submission.csv', index=False,quoting=csv.QUOTE_NONNUMERIC)

predicted.head()
#last one: \pipeline score: 0.1222 (0.0088) only basement quality ordinal
#\pipeline score: 0.1233 (0.0110) - exterqual, extercond ordinal


\pipeline score: 0.1230 (0.0100)



Unnamed: 0,Id,SalePrice
0,1461,128431.789062
1,1462,166526.609375
2,1463,189956.453125
3,1464,191607.609375
4,1465,181430.3125


In [187]:
str(list(_get_xgb_feat_importances(model, X.columns.tolist())))

"[('BsmtExposure__No', 46), ('SaleType__WD', 46), ('ExterCondOrdinal', 46), ('GarageFinish__RFn', 46), ('LandSlopeOrdinal', 48), ('BsmtExposure__Av', 48), ('MasVnrType__BrkFace', 50), ('LandContour__HLS', 53), ('LotShape__Reg', 53), ('SaleType__New', 55), ('MoSold__7', 55), ('MoSold__10', 56), ('SaleCondition__Abnorml', 57), ('3SsnPorch', 58), ('Neighborhood__StoneBr', 59), ('BsmtExposure__Gd', 61), ('FunctionalOrdinal', 61), ('ExterQualOrdinal', 62), ('MoSold__12', 64), ('FullBath', 69), ('SaleCondition__Normal', 73), ('BsmtFullBath', 81), ('GarageCars', 82), ('Fireplaces', 83), ('BsmtFinSF2', 86), ('KitchenQualOrdinal', 90), ('SaleCondition__Family', 90), ('BsmtQualOrdinal', 98), ('ScreenPorch', 111), ('BedroomAbvGr', 131), ('EnclosedPorch', 146), ('OverallCond', 159), ('TotRmsAbvGrd', 160), ('YrSold', 172), ('OverallQual', 240), ('YearRemodAdd', 296), ('WoodDeckSF', 332), ('2ndFlrSF', 362), ('GarageYrBlt', 381), ('OpenPorchSF', 385), ('MasVnrArea', 387), ('YearBuilt', 388), ('LotFro

In [None]:
# ordinal: OverallQual, OverallCond
# nan: MasVnrType, MasVnrArea, BsmtQual, BsmtCond, BsmtExposure, Electrical, FireplaceQu
# numeric: MasVnrArea, BsmtFinSF1, BsmtFinSF2, 'BsmtUnfSF', 'TotalBsmtSF','1stFlrSF', 
#         '2ndFlrSF', 'LowQualFinSF', 'GrLivArea'