In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/test.csv
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/test.csv.gz


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split


In [3]:
### Load the training and test sets from the Kaggle input directory

X_full, X_test_full = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-data-for-ml-course/train.csv',
        '../input/home-data-for-ml-course/test.csv',
    )
)


In [4]:
## Our goal is to predict the SalePrice of each house,
## so first separate the target column from the features.

# The guard keeps this cell re-runnable: once SalePrice has been split off,
# running the cell again is a no-op instead of raising a KeyError.
if 'SalePrice' in X_full.columns:
    # Rows with a missing target value cannot be used for training.
    X_full = X_full.dropna(axis=0, subset=['SalePrice'])

    # Store the target column in y ...
    y = X_full['SalePrice']

    # ... and rebind X_full to the features only (no in-place mutation).
    X_full = X_full.drop(['SalePrice'], axis=1)



In [5]:
## Split the data: 80% for training, 20% held out for validation

X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0
)

# "Cardinality" means the number of unique values in a column.
# Walk the columns once and sort them into two groups:
#   * categorical: object-typed columns with fewer than 10 distinct values
#     (a convenient but arbitrary threshold)
#   * numerical: int64 / float64 columns
categorical_cols = []
numerical_cols = []
for cname in X_train_full.columns:
    column = X_train_full[cname]
    if column.nunique() < 10 and column.dtype == "object":
        categorical_cols.append(cname)
    if column.dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

# Restrict every split (and the test set) to the selected columns,
# categorical columns first.
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [6]:
# Summarize the selected training columns: dtype and non-null count per column.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 618 to 684
Data columns (total 77 columns):
MSZoning         1168 non-null object
Street           1168 non-null object
Alley            71 non-null object
LotShape         1168 non-null object
LandContour      1168 non-null object
Utilities        1168 non-null object
LotConfig        1168 non-null object
LandSlope        1168 non-null object
Condition1       1168 non-null object
Condition2       1168 non-null object
BldgType         1168 non-null object
HouseStyle       1168 non-null object
RoofStyle        1168 non-null object
RoofMatl         1168 non-null object
MasVnrType       1162 non-null object
ExterQual        1168 non-null object
ExterCond        1168 non-null object
Foundation       1168 non-null object
BsmtQual         1140 non-null object
BsmtCond         1140 non-null object
BsmtExposure     1140 non-null object
BsmtFinType1     1140 non-null object
BsmtFinType2     1139 non-null object
Heating          116

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
X_train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1168.0,1168.0,956.0,1168.0,1168.0,1168.0,1168.0,1168.0,1162.0,1168.0,...,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,738.685788,56.605308,69.614017,10589.672945,6.086473,5.572774,1970.890411,1984.692637,103.481067,439.890411,...,473.632705,94.498288,48.044521,23.02226,3.218322,14.528253,2.118151,50.936644,6.30137,2007.819349
std,421.609683,42.172322,22.946069,10704.180793,1.367472,1.116908,30.407486,20.684612,182.676225,435.106803,...,209.44232,127.312017,68.619199,63.153093,27.916593,54.009608,36.482294,550.380636,2.725977,1.335971
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,373.75,20.0,59.0,7589.5,5.0,5.0,1953.75,1966.0,0.0,0.0,...,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,749.5,50.0,69.0,9512.5,6.0,5.0,1972.0,1993.0,0.0,379.5,...,477.5,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,1108.75,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,167.75,716.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,2260.0,...,1390.0,736.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [8]:
# Peek at the first few rows of the training split.
X_train.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,843,468,81,0,0,0,0,0,1,2006


In [9]:
## Columns of X_train that contain at least one missing value; these need handling
has_missing = X_train.isnull().any()
print(has_missing[has_missing].index.tolist())


['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt']


In [10]:
# Show the selected categorical columns, then the numerical ones, one list per line.
print(categorical_cols, numerical_cols, sep='\n')


['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'P

In [20]:
### We will now preprocess our data using a pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet


# Feature scaling, applied to the preprocessed feature matrix.
scaler = StandardScaler()

# Numerical columns: replace missing entries with a constant fill value
# (no fill_value is given, so SimpleImputer's default is used).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' tolerates categories not seen during fit.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm the installed version before upgrading.
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Candidate estimators; only the gradient-boosted model is wired into the
# pipeline below.
model_rf = RandomForestRegressor(n_estimators=100, random_state=0)
model_en = ElasticNet(random_state=0)
model_gb = XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=0)

# Full modelling pipeline: preprocess -> scale -> gradient boosting.
my_pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('sc', scaler),
    ('gb', model_gb),
])

# Grid of boosting rounds to search over: 100, 200, ..., 900.
parameters = {'gb__n_estimators': list(range(100, 1000, 100))}







In [21]:
# Select n_estimators by 5-fold cross-validation on the training split.
# Score candidates with (negated) mean absolute error so that model selection
# optimises the same metric that is reported on the validation split; without
# `scoring`, GridSearchCV falls back to the estimator's default score
# (R^2 for regressors).
cv = GridSearchCV(my_pipeline, parameters, cv=5,
                  scoring='neg_mean_absolute_error')

In [13]:
# Run the grid search on the training split.
# NOTE(review): the saved output below reports cv=10 while `cv` is constructed
# with cv=5, and the execution counts are out of order -- restart the kernel
# and run all cells to refresh the outputs.
cv.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('pre',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                                       fill_value=None,
                                                                                       missing_values=nan,
                                                               

In [14]:
# Predict sale prices for the held-out validation split using the fitted search object.
preds = cv.predict(X_valid)

In [15]:
# Validation error: mean absolute difference between actual and predicted prices.
mae = mean_absolute_error(y_true=y_valid, y_pred=preds)
mae

16879.00852151113

In [16]:
# Parameter setting chosen by the grid search.
cv.best_params_

{'gb__n_estimators': 700}

In [17]:
## For the final submission the test data must go through the same
## preprocessing as the training data. `cv.predict` already does this, because
## the fitted pipeline inside `cv` contains the preprocessing steps.
##
## The transformers must NOT be fitted on the test set: that would learn
## imputation / encoding / scaling statistics from test data and re-fit the
## shared `preprocessor` and `scaler` objects. Instead, reuse the steps that
## were fitted on the training data during the search.

fitted_steps = cv.best_estimator_.named_steps

test_pipeline = Pipeline([
    ('preprocessor', fitted_steps['pre']),
    ('sc', fitted_steps['sc'])
])

# Sanity check: the already-fitted steps transform the test set row for row.
X_test_prepared = test_pipeline.transform(X_test)
assert X_test_prepared.shape[0] == len(X_test)

test_pipeline

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'Ove

In [18]:
# Predict sale prices for the test set; X_test goes through the pipeline fitted inside `cv`.
preds_test = cv.predict(X_test)


In [19]:
# Assemble the submission table (one predicted price per test Id) and write it
# to the working directory without the index column.
submission_columns = {
    'Id': X_test['Id'],
    'SalePrice': preds_test,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)