In [None]:
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns
import csv
import re
import sys, os, random

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures

#sys.path.insert(0, "/Users/schwalmdaniel/github/xgboost/python-package")
sys.path.insert(0, "e:/xgboost/python-package")
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

%matplotlib inline

# reproducible results
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

pd.set_option('display.max_rows',1000)

def rmse_cv(model, train, y):
    """Cross-validate ``model`` and return one root-mean-squared-log-error per fold.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    train : DataFrame of features; its ``.values`` array is what gets split.
    y : target values aligned with the rows of ``train``.

    Returns
    -------
    numpy.ndarray
        One score per fold (5 folds).
    """
    n_folds = 5
    # Pass the KFold object itself. Calling ``.get_n_splits()`` on it returns
    # the plain integer 5, which makes cross_val_score build its own
    # unshuffled folds and silently discards shuffle=True / random_state=42.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    # The scorer reports negated mean squared log errors, hence the sign flip
    # before taking the square root.
    neg_msle = cross_val_score(model, train.values, y,
                               scoring="neg_mean_squared_log_error", cv=kf)
    return np.sqrt(-neg_msle)

# Directory holding train.csv / test.csv. The default is the original Windows
# location; set the HOUSE_PRICES_DIR environment variable to run the notebook
# on another machine (e.g. '/Users/schwalmdaniel/github/kaggle/house_prices_kaggle').
root = os.environ.get('HOUSE_PRICES_DIR', 'e:/kaggle/house_prices_kaggle')

train = pd.read_csv(root + "/train.csv")
test = pd.read_csv(root + "/test.csv")

# have a look at the ds
train.head()

In [None]:
def convertToOrdinal(df, colname, mapping, targetDf, targetColname, dropOriginalCol=True):
    """Encode the categorical column ``df[colname]`` as numbers in ``targetDf[targetColname]``.

    Missing values are first replaced by the most frequent label of the
    column, then every label is translated through ``mapping``.

    Parameters
    ----------
    df : DataFrame holding the source column; it is modified in place.
    colname : name of the categorical column to encode.
    mapping : dict translating each label to its ordinal value.
    targetDf : DataFrame receiving the encoded column (may be ``df`` itself).
    targetColname : name of the encoded column.
    dropOriginalCol : when True, ``colname`` is removed from ``df`` afterwards.

    Raises
    ------
    KeyError
        If ``colname`` is absent and has not been encoded yet, or if the
        column contains a label that ``mapping`` does not cover.
    """
    if colname not in df.columns:
        # The column was already encoded and dropped by an earlier call
        # (e.g. the cell was run twice): nothing is left to do.
        if targetColname in targetDf.columns:
            return
        raise KeyError(colname)

    mostFrequent = df[colname].value_counts().index[0]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on ``df[colname]``: with pandas Copy-on-Write the chained in-place call
    # only changes a temporary Series and leaves ``df`` untouched.
    df[colname] = df[colname].fillna(mostFrequent)
    targetDf[targetColname] = df[colname].apply(lambda label: mapping[label])
    if dropOriginalCol:
        # Drop in place. The earlier ``targetDf = df.drop(...)`` merely
        # rebound the local name, so the source column was never removed.
        df.drop([colname], axis=1, inplace=True)
        
# Label -> rank lookup tables. Labels are listed from the highest rank down
# to rank 1.
five_scale = dict(zip(('Ex', 'Gd', 'TA', 'Fa', 'Po'), range(5, 0, -1)))
LandSlope_scale = dict(zip(('Gtl', 'Mod', 'Sev'), range(3, 0, -1)))
Functional_scale = dict(zip(
    ('Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'),
    range(8, 0, -1),
))

In [None]:
# (source column, scale) pairs, listed in the order in which the encoded
# '<column>Ordinal' columns are appended to each frame. KitchenQual used to be
# converted twice; it is listed once here.
ordinal_columns = [
    ('BsmtQual', five_scale),
    ('BsmtCond', five_scale),
    ('ExterQual', five_scale),
    ('ExterCond', five_scale),
    ('KitchenQual', five_scale),
    ('LandSlope', LandSlope_scale),
    ('Functional', Functional_scale),
    ('HeatingQC', five_scale),
    ('FireplaceQu', five_scale),
    ('GarageQual', five_scale),
    ('GarageCond', five_scale),
    ('PoolQC', five_scale),
]

for frame in (train, test):
    for colname, scale in ordinal_columns:
        convertToOrdinal(frame, colname, scale, frame, colname + 'Ordinal')

    # Encode CentralAir as 0 for 'N' and 1 for anything else. Skip the frame
    # when the column is already numeric: re-running the encoding on 0/1
    # values would turn every row into 1.
    if not pd.api.types.is_numeric_dtype(frame['CentralAir']):
        frame['CentralAir'] = frame['CentralAir'].apply(lambda x: 0 if x == 'N' else 1)

In [None]:
# HasShed is 1 where MiscFeature equals 'Shed' and 0 everywhere else.
for frame in (train, test):
    frame['HasShed'] = frame['MiscFeature'].apply(lambda feature: 1 if feature == 'Shed' else 0)

In [None]:
def getStoryCount(name):
    """Translate a HouseStyle label into a story count.

    Returns 1.0, 2.0, 1.5 or 2.5 for the labels listed below and 0.5 for any
    other value.
    """
    story_groups = (
        (('1Story',), 1.0),
        (('2Story',), 2.0),
        (('1.5Fin', '1.5Unf'), 1.5),
        (('2.5Fin', '2.5Unf'), 2.5),
    )
    for labels, stories in story_groups:
        if name in labels:
            return stories
    # Every label not listed above falls back to 0.5.
    return 0.5

# Derive StoryCount from HouseStyle in both frames.
for frame in (train, test):
    frame['StoryCount'] = frame['HouseStyle'].apply(getStoryCount)

In [None]:
# LotLivingRatio = 1 - 1stFlrSF / LotArea, computed for both frames.
for frame in (train, test):
    first_floor_share = frame['1stFlrSF'] / frame['LotArea']
    frame['LotLivingRatio'] = 1 - first_floor_share

In [None]:
# To give the train and test sets identical dummy columns, concatenate them,
# encode once, and split them again by position.
train_objs_num = len(train)

# Columns expanded into dummy variables.
dummy_columns = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'Electrical', 'GarageType',
    'GarageFinish', 'PavedDrive',
    'Fence', 'MoSold', 'SaleType', 'SaleCondition',
]

dataset = pd.concat(objs=[train, test], axis=0)
dataset = pd.get_dummies(dataset, columns=dummy_columns, prefix_sep='__')

# .copy() gives each half its own data. Later cells modify train and test;
# doing that on a bare slice of `dataset` raises SettingWithCopyWarning and
# leaves it unclear which object is actually changed.
train = dataset[:train_objs_num].copy()
test = dataset[train_objs_num:].copy()
train.shape


In [None]:
# Columns whose missing values are replaced by 0.
train_zero_fill = ['GarageYrBlt']
test_zero_fill = ['GarageYrBlt', 'BsmtFullBath', 'BsmtFinSF2', 'BsmtHalfBath',
                  'BsmtUnfSF', 'GarageArea', 'GarageCars', 'TotalBsmtSF']
# Columns whose missing values are replaced by the training-set mean, in both
# frames.
mean_fill = ['MasVnrArea', 'LotFrontage', 'BsmtFinSF1']

# Means are taken from train only (mean() skips NaN) and reused for test.
fill_means = train[mean_fill].mean().to_dict()

# Build new frames with DataFrame.fillna(dict) instead of calling
# fillna(inplace=True) on df[col]: with pandas Copy-on-Write the chained
# in-place call changes only a temporary Series and leaves the frame as is.
train = train.fillna({**dict.fromkeys(train_zero_fill, 0), **fill_means})
test = test.fillna({**dict.fromkeys(test_zero_fill, 0), **fill_means})

In [None]:
# Columns removed from both feature matrices. errors='ignore' lets the same
# list be used on a frame that lacks some of them.
excluded_columns = [
    'Id', 'SalePrice', 'BsmtQual', 'BsmtCond', 'ExterQual', 'ExterCond', 'KitchenQual',
    'LandSlope', 'Functional', 'HeatingQC', 'GarageQual', 'GarageCond', 'PoolQC', 'FireplaceQu',
    'MiscFeature', 'GarageArea',
]

X = train.drop(excluded_columns, axis=1, errors='ignore')
y = train['SalePrice']
X_test = test.drop(excluded_columns, axis=1, errors='ignore')

In [None]:
def _get_xgb_feat_importances(clf, feature_names):

        try:
            # Handles case when clf has been created by calling
            # xgb.XGBClassifier.fit() or xgb.XGBRegressor().fit()
            fscore = clf.get_booster().get_fscore()
        except:
            # Handles case when clf has been created by calling xgb.train.
            # Thus, clf is an instance of xgb.Booster.
            fscore = clf.get_fscore()

        trained_feature_names = feature_names #self._get_trained_feature_names()

        feat_importances = []
        
        #print(fscore.items())

        features = {}
        
        for k, v in fscore.items():
            features[k] = v
            
        return sorted(features.items(), key=lambda x: x[1])[-50:]

In [None]:
model = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                     learning_rate=0.05, max_depth=3,
                     min_child_weight=1.7817, n_estimators=2200,
                     reg_alpha=0.4640, reg_lambda=0.8571,
                     subsample=0.5213, silent=1,
                     random_state=7, nthread=-1)

#pipeline = Pipeline([('imputer', Imputer(strategy='mean')), ('classify', model)])
pipeline = Pipeline([('classify', model)])
pipeline.fit(X, y)
score = rmse_cv(pipeline, X, y)
# "\n", not "\p": the latter is an invalid escape sequence that printed a
# literal backslash in front of the label.
print("\npipeline score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

preds = pipeline.predict(X_test)

# Submission frame: the Id column of the test set next to its predictions.
predicted = pd.DataFrame({'Id': test['Id'], 'SalePrice': preds})
predicted.to_csv(root + '/submission.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

predicted.head()
# Scores recorded from earlier runs:
# last one: 0.1222 (0.0088) - only basement quality ordinal
# 0.1233 (0.0110) - exterqual, extercond ordinal


In [None]:
# Show the highest-scoring features of the fitted model as a single string.
top_features = _get_xgb_feat_importances(model, X.columns.tolist())
str(list(top_features))

In [None]:
# Notes on column groups:
# ordinal: OverallQual, OverallCond
# contain NaN: MasVnrType, MasVnrArea, BsmtQual, BsmtCond, BsmtExposure, Electrical, FireplaceQu
# numeric: MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF,
#          2ndFlrSF, LowQualFinSF, GrLivArea