In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
train = pd.read_csv('documents/kaggle/train.csv')

test = pd.read_csv('documents/kaggle/test.csv')

In [3]:
train.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [4]:
y_train = train['SalePrice']
X_train = train.drop(['Id', 'SalePrice'], axis=1)

id_test = test['Id']
X_test = test.drop(['Id'], axis=1)

In [5]:
# one-hot encode the categorical features
cat_attribs = ['MSZoning'
               , 'Street'
               , 'Alley'
               , 'LotShape'
               , 'LandContour'
               , 'Utilities'
               , 'LotConfig'
               , 'LandSlope'
               , 'Neighborhood'
               , 'Condition1'
               , 'Condition2'
               , 'BldgType'
               , 'HouseStyle'
               , 'RoofStyle'
               , 'RoofMatl'
               , 'Exterior1st'
               , 'Exterior2nd'
               , 'MasVnrType'
               , 'ExterQual'
               , 'ExterCond'
               , 'Foundation'
               , 'BsmtQual'
               , 'BsmtCond'
               , 'BsmtExposure'
               , 'BsmtFinType1'
               , 'BsmtFinType2'
               , 'Heating'
               , 'HeatingQC'
               , 'CentralAir'
               , 'Electrical'
               , 'KitchenQual'
               , 'Functional'
               , 'FireplaceQu'
               , 'GarageType'
               , 'GarageFinish'
               , 'GarageQual'
               , 'GarageCond'
               , 'PavedDrive'
               , 'PoolQC'
               , 'Fence'
               , 'MiscFeature'
               , 'SaleType'
               , 'SaleCondition']

full_pipeline = ColumnTransformer([('cat', 
                                    OneHotEncoder(handle_unknown='ignore'), 
                                    cat_attribs)], 
                                  remainder='passthrough')

encoder = full_pipeline.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [6]:
mod = xgb.XGBRegressor(
    gamma=1,                 
    learning_rate=0.01,
    max_depth=5,
    n_estimators=10000,                                                                    
    subsample=0.8,
    random_state=684
) 

In [7]:
mod.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10000, n_jobs=8, num_parallel_tree=1,
             random_state=684, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.8, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [8]:
predictions = mod.predict(X_test)

In [9]:
predictions_df = pd.DataFrame(predictions)

In [10]:
predictions_final = pd.concat([id_test,
                               predictions_df.reset_index(
                                   drop=True).rename(
                                   columns = {'0' : 'SalePrice'})],
                              axis=1)

In [11]:
predictions_final.head(3)

Unnamed: 0,Id,0
0,1461,126232.742188
1,1462,160070.734375
2,1463,185691.34375


In [12]:
predictions_final.to_csv('documents/kaggle/submission.csv')