In [30]:
import numpy as np 
import pandas as pd 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Additional libs.
import warnings
warnings.filterwarnings(action = 'ignore')
import os
print(os.listdir("."))

['sample_submission.csv', 'HousePricesPrediction.ipynb', '.ipynb_checkpoints', 'data_description.txt', 'train.csv', 'test.csv', 'submission.csv']


In [31]:
house_train_dataset = pd.read_csv('./train.csv', index_col="Id")
X_test_full = pd.read_csv('./test.csv', index_col="Id")

y = house_train_dataset.SalePrice
X_full = house_train_dataset.drop('SalePrice', axis=1)

In [32]:
X_full.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [33]:
num_columns = [col for col in X_full.columns if X_full[col].dtype in ['int32', 'int64', 'float32', 'float64']]
low_cardinality_columns = [col for col in X_full.columns if X_full[col].nunique() <= 10]

features = num_columns + low_cardinality_columns

X = X_full[features]
X_test = X_test_full[features]

In [34]:
numerical_transformer = SimpleImputer(strategy='mean')

categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[("num", numerical_transformer, num_columns),
                                               ("cat", categorical_transformer, low_cardinality_columns)])

# model = RandomForestRegressor(n_estimators=100, random_state=0)
model = XGBRegressor(n_estimators=500, early_stopping_rounds=6, n_jobs=4)

In [35]:
my_pipeline = Pipeline(steps = [('preprocessor', preprocessor), ('model', model)])

In [36]:
scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print("Average MAE score across all experiments: ", scores.mean())

Average MAE score across all experiments:  16188.401752461472


In [37]:
my_pipeline.fit(X, y)
test_preds = my_pipeline.predict(X_test)
output = pd.DataFrame({"Id": X_test.index, "SalePrice": test_preds})
output.to_csv('submission.csv', index=False)

