# Building a first model for the House Price Prediction

First we will import the necessary.

In [15]:
import pandas as pd
import awswrangler as wr
import boto3
import numpy as np

from housing.jobs.prepare import fill_missing_string_values_none, fill_missing_float_values_zero, fill_missing_int_values_zero, fill_columns_most_common_value, fill_missing_values_functional_column, add_feature_total_area,convert_categorical_values_to_dummies, remove_outliers, log_transform_saleprice, label_encode_categorical_values, fill_missing_lot_frontage, correct_skewed_numerical_features 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from scipy.stats import skew
from scipy.special import boxcox1p
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor

## Feature Engineering

We will start by using only a limited subset of features based on our exploration:

In [2]:
boto3.setup_default_session()
#ssm = boto3.client('ssm')
#parameter = ssm.get_parameter(Name='/datafy-samples/bucket/name')
#bucket = parameter['Parameter']['Value']
bucket='titanic-samples-uflwdn'
df = wr.s3.read_csv(path=f"s3://{bucket}/housing/raw/train.csv")

fill_missing_string_values_none(df)
fill_missing_float_values_zero(df)
fill_missing_int_values_zero(df)
fill_columns_most_common_value(df)
fill_missing_values_functional_column(df)
fill_missing_lot_frontage(df)

add_feature_total_area(df)
df = df.drop(
        ['Utilities', 'MSZoning', 'LandContour','LotConfig','Neighborhood','Condition1','Condition2','BldgType',
        'HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Foundation', 'Heating',
        'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition'], axis=1
    )

df = log_transform_saleprice(df)
label_encode_categorical_values(df)
df = convert_categorical_values_to_dummies(df)
correct_skewed_numerical_features(df)
    
df

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,...,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal,MoSold,YrSold,SalePrice,TotalSF
0,0.730463,2.055642,5.831328,19.212182,0.730463,0.730463,1.540963,0.0,2.440268,1.820334,...,0.0,0.0,0.0,1.540963,1.820334,0.000000,0.730463,1.194318,35.190995,14.976591
1,1.194318,0.000000,6.221214,19.712205,0.730463,0.730463,1.540963,0.0,2.259674,2.440268,...,0.0,0.0,0.0,1.540963,1.820334,0.000000,1.820334,0.730463,34.329249,14.923100
2,1.540963,2.055642,5.914940,20.347241,0.730463,0.730463,0.000000,0.0,2.440268,1.820334,...,0.0,0.0,0.0,1.540963,1.820334,0.000000,2.602594,1.194318,35.629466,15.149678
3,1.820334,2.259674,5.684507,19.691553,0.730463,0.730463,0.000000,0.0,2.440268,1.820334,...,0.0,0.0,0.0,1.540963,1.820334,0.000000,0.730463,0.000000,32.763482,14.857121
4,2.055642,2.055642,6.314735,21.325160,0.730463,0.730463,0.000000,0.0,2.602594,1.820334,...,0.0,0.0,0.0,1.540963,1.820334,0.000000,3.011340,1.194318,36.346360,15.852312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,13.213850,2.055642,5.744420,18.960528,0.730463,0.730463,1.540963,0.0,2.259674,1.820334,...,0.0,0.0,0.0,1.540963,1.820334,0.000000,2.440268,0.730463,34.105597,15.019350
1456,13.215896,0.000000,6.337529,20.994868,0.730463,0.730463,1.540963,0.0,2.259674,2.055642,...,0.0,0.0,0.0,1.540963,1.194318,0.000000,0.730463,1.820334,35.236027,16.118017
1457,13.217941,2.259674,5.859551,19.476345,0.730463,0.730463,1.540963,0.0,2.440268,2.602594,...,0.0,0.0,0.0,1.540963,0.000000,14.892193,1.820334,1.820334,36.760707,16.000045
1458,13.219985,0.000000,5.914940,19.760176,0.730463,0.730463,1.540963,0.0,2.055642,2.055642,...,0.0,0.0,0.0,1.540963,1.820334,0.000000,1.540963,1.820334,32.852681,14.418948


We have our dataset and separate the target variable from the dataset.

In [3]:
target = 'SalePrice'
features = [ col for col in df.columns if col != target ]
X, y = df[features], df[target]

Now its time to initialize our model. After initializing I am calculating the feature importances predicted by each model.

Note that I am passing a same random state to all the models. But why?

Using this seed parameter makes sure that anyone who re-runs your code will get the exact same outputs which is extremely important concept in data science.

In [7]:
reg = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                       max_depth=3, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10,
                                       loss='huber', random_state =5)
reg.fit(X,y)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_features='sqrt',
                          min_samples_leaf=15, min_samples_split=10,
                          n_estimators=3000, random_state=5)

Now we take a look at the feature importance calculated.

In [10]:
feature_df = pd.DataFrame(reg.feature_importances_)
feature_df.insert(0, 'features', features)

feature_df

Unnamed: 0,features,0
0,Id,0.002196
1,MSSubClass,0.002108
2,LotFrontage,0.007704
3,LotArea,0.021092
4,Street,0.0
5,Alley,0.000375
6,LotShape,0.000562
7,LandSlope,0.00015
8,OverallQual,0.120331
9,OverallCond,0.015635


From the above data, we can see that GrLivArea, TotalSF and YearBuilt and OveralQual play an important role in predicting the target SalePrice.

## Model training and evaluation

Now we import train_test_split from sklearn package for splitting the data into train and test sets.

Here I am taking 20% of the data for testing and the rest 80% for training.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
reg = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                       max_depth=3, max_features='sqrt',
                                       min_samples_leaf=15, min_samples_split=10,
                                       loss='huber', random_state =5)
reg.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_features='sqrt',
                          min_samples_leaf=15, min_samples_split=10,
                          n_estimators=3000, random_state=5)

In [18]:
pred = reg.predict(X_test)
pred

array([33.03975472, 32.5003107 , 34.88040008, 29.37667907, 33.77593672,
       30.23280641, 30.56508925, 31.6377975 , 32.97180169, 38.89953958,
       30.52522376, 32.84299754, 33.2485337 , 31.85482195, 30.3058821 ,
       36.55903226, 29.70823219, 35.31508495, 30.02320518, 32.64316392,
       33.84719372, 36.95558894, 36.68110949, 34.64823367, 34.06593533,
       31.5813126 , 35.24517709, 40.60714675, 33.08379319, 34.85122015,
       32.39292311, 37.30024412, 40.2756697 , 31.75757856, 33.95328905,
       33.43554152, 30.00102892, 32.63986921, 33.76330793, 36.62436495,
       34.93427786, 32.65477602, 34.59816272, 34.89193006, 34.54515298,
       33.34475066, 33.06722737, 32.51977891, 31.83346972, 30.56715628,
       35.40620689, 36.67510869, 32.97293404, 32.59942669, 31.20407521,
       37.09500579, 33.75674607, 34.19568004, 35.83067187, 32.62401292,
       32.75293036, 34.73018345, 34.0939499 , 33.50015915, 31.10115869,
       32.74787756, 30.48517192, 32.4813332 , 33.22573885, 33.19

Now we fit the data and then compare our predictions with actual data to get accuracy.

In [20]:
score = np.sqrt(-cross_val_score(reg, X_test.values, y_test.values, scoring="neg_mean_squared_error"))
score.mean()
score.std()

0.11314303544208815

We get an accuracy of 79.32% which isn’t great. But that’s okay. Progress happens in small increments. Let's stop the experiment and save the data.