# House Price Prediction 

Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, our goal is to predict the price of each house.

# Basic Data Analysis Library

In [37]:
# Import necessary python libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import norm
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Data preprocessing

In [38]:
# Loading  data
def load_data(df):
    return pd.read_csv(df)

#Storing Id
def store_id(df):
    return df['Id']

# Dropping Id
def drop_id(df):
    return df.drop("Id", axis = 1, inplace = True)

# merging the data 
def full_data(df1,df2):
    return pd.concat(objs=[df1, df2], axis=0).reset_index(drop=True)

# Imputing missing value
def miss_val(df):
    for col in miss_col1:
        if df[col].dtype=='O':
            df[col]=df[col].fillna("None")
        else:
            df[col]=df[col].fillna(0)
    
    df['LotFrontage']=df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))
    df['Electrical']=df['Electrical'].fillna(df['Electrical'].mode()[0]) 
    
    return df

# creating features
def new_features(df):
    df['TotSF']=df['TotalBsmtSF']+df['1stFlrSF']+df['2ndFlrSF']
    df['TotArea']=df['GarageArea']+df['GrLivArea']
    
    return df

#numerical to categorical col, some columns are numerical but actually they are categorical
def num_to_cat(df):
    for col in cols:
        df[col] = df[col].apply(str)
    return df


# label Encoding for ordinal data
from sklearn.preprocessing import LabelEncoder
label_encode=LabelEncoder()

def convert_data(df):
    for col in ordinal_cat:
        df[col]=label_encode.fit_transform(df[col])
    return pd.get_dummies(df)

# Defining data

In [39]:
# Train and Test data
df_train=load_data('train.csv')
df_test=load_data('test.csv')

# store id
id_train= df_train['Id']
Id_test = df_test['Id']

# Droping Id
drop_id(df_train)
drop_id(df_test)

# remove suspicious point
df_train=df_train[df_train['SalePrice']<700000]

# Log transformation
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

# Removing suspicious outliers
df_train = df_train.drop(df_train[(df_train['GrLivArea']>4000) & (df_train['SalePrice']<300000)].index).reset_index(drop=True)
df_train=df_train.drop(df_train[(df_train['LotFrontage']>250) & (df_train['SalePrice']<300000)].index).reset_index(drop=True)
df_train=df_train.drop(df_train[(df_train['BsmtFinSF1']>1400) & (df_train['SalePrice']<400000)].index).reset_index(drop=True)
df_train=df_train.drop(df_train[(df_train['TotalBsmtSF']>5000) & (df_train['SalePrice']<300000)].index).reset_index(drop=True)
df_train=df_train.drop(df_train[(df_train['1stFlrSF']>4000) & (df_train['SalePrice']<300000)].index).reset_index(drop=True)

# Length of the training data
train_len = len(df_train)

# Merging train and test data
data=full_data(df_train, df_test)


# Filling missing value
miss_col1=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageCond', 'GarageType', 'GarageYrBlt',
           'GarageFinish', 'GarageQual', 'BsmtExposure','BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual', 
           'MasVnrArea', 'MasVnrType','SaleType','MSZoning','Utilities','Functional','Exterior1st','Exterior2nd',
           'BsmtFinSF1','BsmtFinSF2','TotalBsmtSF','GarageArea','KitchenQual','GarageCars','BsmtFullBath',
           'BsmtHalfBath','BsmtUnfSF']

# Remving missing value
clean_data=miss_val(data)

# Creating new features
clean_data=new_features(clean_data)

# Columns that are numerical but actually they are categorical
cols=['MSSubClass','OverallCond','YrSold','MoSold']
clean_data=num_to_cat(clean_data)


# Ordinal features
ordinal_cat=['OverallCond','KitchenQual','YrSold','MoSold','Fence','PoolQC','FireplaceQu','GarageQual', 
             'GarageCond','LotShape','LandSlope','HouseStyle','ExterQual','ExterCond','BsmtQual', 
             'BsmtCond','BsmtExposure','BsmtFinType1', 'BsmtFinType2','HeatingQC','KitchenQual','CentralAir',
             'MSSubClass']

# converting categorical to numerical features
clean_data=convert_data(clean_data)

# Preparing data
df_target=clean_data['SalePrice']
df_feature=clean_data.drop(columns=['SalePrice'])
X_train=df_feature[:train_len]
Y_train=df_target[:train_len]
X_test=df_feature[train_len:]

# Model building

Import Machine Learning Libraries

In [40]:
# Import MLlibraries
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.utils import shuffle

from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso, Ridge, BayesianRidge, LassoLarsIC

In [41]:
# Basic models
Basic_model=[
    Lasso(),
    RandomForestRegressor(),
    ElasticNet(),
    KernelRidge(),
    GradientBoostingRegressor()
]

# Root Mean Square Error

In [42]:
# Function to calculate root mean squared error with default parameters
n_folds = 5 # five fold cross validation
def rmse_cv(models):
    
    model_name=[]
    RMSE=[]
    for model in models:
        kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train)
        rmse= np.sqrt(-cross_val_score(model, X_train, Y_train, scoring="neg_mean_squared_error", cv = kf))
        mean_RMSE=np.mean(rmse)
        
        model_name.append(model.__class__.__name__)
        RMSE.append(mean_RMSE)
    
    result=pd.DataFrame({'Model':model_name,'RMSE':RMSE})
    
    result=result.sort_values(by='RMSE',ascending=True).reset_index()
    print("\nResult:\n", result)
    
    min_RMSE=min(result['RMSE'])
    best_model=result['Model'][0]# first row and first column
    
    print('The minimum RMSE is {} and the best model is {}'.format(min_RMSE,best_model))
        
rmse_cv(Basic_model)


Result:
    index                      Model      RMSE
0      3                KernelRidge  0.120578
1      4  GradientBoostingRegressor  0.122217
2      1      RandomForestRegressor  0.143142
3      2                 ElasticNet  0.161444
4      0                      Lasso  0.169176
The minimum RMSE is 0.12057848837200343 and the best model is KernelRidge


In [43]:
# Grid search Cv method to find the best parameters
from sklearn.model_selection import GridSearchCV
Lasso_params = {'alpha': [0.0001,0.0002,0.0003,0.0004,0.0005,0.0006]}
RandomForest_params = {'n_estimators': list(range(50, 200, 25)), 'max_features': ['auto', 'sqrt', 'log2'], 
         'min_samples_leaf': list(range(50, 200, 50))}
ElasticNet_params = {'alpha': [0.0001,0.0002,0.0003,0.0004,0.0005,0.0006]}
Ridge_params = {'alpha': [0.0001,0.0002,0.0003,0.0004,0.0005,0.0006]}
Grad_boosting_params = {'n_estimators': [1000,2000,3000,4000,5000,6000]}

# Grid Search for Lasso 
grid_search_cv = GridSearchCV(Lasso(random_state=42), Lasso_params, n_jobs=-1)
grid_search_cv.fit(X_train, Y_train)

print('Print best estimator for lasso:', grid_search_cv.best_estimator_)

# Grid Search for RandomForest

grid_search_cv = GridSearchCV(RandomForestRegressor(random_state=42), RandomForest_params, n_jobs=-1)
grid_search_cv.fit(X_train, Y_train)

print('Print best estimator for RandomForest:', grid_search_cv.best_estimator_)

# Grid Search for ElasticNet
grid_search_cv = GridSearchCV(ElasticNet(random_state=42), ElasticNet_params, n_jobs=-1)
grid_search_cv.fit(X_train, Y_train)

print('Print best estimator for ElasticNet:', grid_search_cv.best_estimator_)

# Grid Search for KernelRidge
grid_search_cv = GridSearchCV(KernelRidge(), Ridge_params, n_jobs=-1)
grid_search_cv.fit(X_train, Y_train)

print('Print best estimator for KernelRidge:', grid_search_cv.best_estimator_)

# Grid Search for GradientBoosting Regressor
grid_search_cv = GridSearchCV(GradientBoostingRegressor(), Grad_boosting_params, n_jobs=-1)
grid_search_cv.fit(X_train, Y_train)

print('Print best estimator for GradientBoostingRegressor:', grid_search_cv.best_estimator_)

Print best estimator for lasso: Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False)
Print best estimator for RandomForest: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
Print best estimator for ElasticNet: ElasticNet(alpha=0.0006, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False)
Print best estimator for KernelRidge: KernelRidge(alpha=0.0004, coef0=1, degree=3, gamma=None, kerne

# RMSE with best parameters

In [44]:
# Function to calculate root mean squared error with best parameters
model_best_params=[
    Lasso(alpha =0.0005, random_state=42),
    RandomForestRegressor(min_samples_leaf=50, min_samples_split=2, n_estimators=150, random_state=42),
    ElasticNet(alpha=0.0006, l1_ratio=.5, random_state=42),
    KernelRidge(alpha=0.004, kernel='linear', degree=3, coef0=1.0),
    GradientBoostingRegressor(alpha=0.9, n_estimators=1000, learning_rate=0.1,
                                   max_depth=3, max_features='sqrt',
                                   min_samples_leaf=1, min_samples_split=2, 
                                   loss='huber', random_state =42)
]

def rmse_cv(models):
    
    model_name=[]
    RMSE=[]
    for model in models:
        kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train)
        rmse= np.sqrt(-cross_val_score(model, X_train, Y_train, scoring="neg_mean_squared_error", cv = kf))
        mean_RMSE=np.mean(rmse)
        
        model_name.append(model.__class__.__name__)
        RMSE.append(mean_RMSE)
    
    result=pd.DataFrame({'Model':model_name,'RMSE':RMSE})
    
    result=result.sort_values(by='RMSE',ascending=True).reset_index()
    print("\nResult:\n", result)
    
    min_RMSE=min(result['RMSE'])
    best_model=result['Model'][0]# first row and first column
    
    print('The best model and RMSE for best parameter are {} and {} respectively'.format(best_model, min_RMSE))
        
rmse_cv(model_best_params)


Result:
    index                      Model      RMSE
0      0                      Lasso  0.113286
1      2                 ElasticNet  0.113730
2      4  GradientBoostingRegressor  0.115949
3      3                KernelRidge  0.129147
4      1      RandomForestRegressor  0.174109
The best model and RMSE for best parameter are Lasso and 0.11328604728238068 respectively


In [45]:
# prediction with lasso
lasso_fit= Lasso(alpha =0.0005, random_state=42).fit(X_train,Y_train)
y_pred=np.expm1(lasso_fit.predict(X_test))
test_prediction = pd.Series(y_pred, name="SalePrice")

Final_prediction= pd.concat([Id_test,test_prediction],axis=1)
Final_prediction.to_csv("test_score.csv", index=False)
Final_result.head()

The End