In [194]:
#import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
import xgboost as XGB

In [157]:
!pip install --upgrade sklearn


Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
Using legacy 'setup.py install' for sklearn, since package 'wheel' is not installed.
Installing collected packages: sklearn
  Running setup.py install for sklearn ... [?25ldone
[?25hSuccessfully installed sklearn-0.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [6]:
#import train test dataset
train=pd.read_csv('housing_train.csv')
X_test=pd.read_csv('housing_test.csv')

In [8]:
X=train.drop("SalePrice",axis=1)
y=train.SalePrice
X_train,X_valid,y_train,y_valid=train_test_split(X,y,test_size=0.3,random_state=1432)

In [13]:
X_train.describe().T.iloc[:10]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1022.0,724.247554,422.880003,1.0,359.25,725.5,1094.5,1460.0
MSSubClass,1022.0,56.575342,41.751723,20.0,20.0,50.0,70.0,190.0
LotFrontage,845.0,70.494675,25.170349,21.0,60.0,69.0,80.0,313.0
LotArea,1022.0,10179.463796,7022.041771,1477.0,7663.75,9525.0,11477.25,159000.0
OverallQual,1022.0,6.099804,1.396028,1.0,5.0,6.0,7.0,10.0
OverallCond,1022.0,5.561644,1.123002,1.0,5.0,5.0,6.0,9.0
YearBuilt,1022.0,1971.307241,30.545185,1875.0,1954.0,1973.0,2001.0,2009.0
YearRemodAdd,1022.0,1985.526419,20.439222,1950.0,1968.0,1994.0,2004.0,2010.0
MasVnrArea,1015.0,107.17931,181.032017,0.0,0.0,0.0,171.5,1600.0
BsmtFinSF1,1022.0,444.810176,466.656741,0.0,0.0,383.5,705.75,5644.0


In [29]:
X_train.describe(include=object).T.iloc[:10]

Unnamed: 0,count,unique,top,freq
MSZoning,1022,5,RL,809
Street,1022,2,Pave,1018
Alley,70,2,Grvl,37
LotShape,1022,4,Reg,653
LandContour,1022,4,Lvl,921
Utilities,1022,2,AllPub,1021
LotConfig,1022,5,Inside,718
LandSlope,1022,3,Gtl,966
Neighborhood,1022,24,NAmes,158
Condition1,1022,9,Norm,880


In [36]:
#checks for total null values in each feature
X_train.isnull().sum()[X_train.isnull().sum()>0]

LotFrontage      177
Alley            952
MasVnrType         7
MasVnrArea         7
BsmtQual          25
BsmtCond          25
BsmtExposure      26
BsmtFinType1      25
BsmtFinType2      26
Electrical         1
FireplaceQu      485
GarageType        62
GarageYrBlt       62
GarageFinish      62
GarageQual        62
GarageCond        62
PoolQC          1016
Fence            823
MiscFeature      979
dtype: int64

In [45]:
#We have 19 features with null values
X_train.isnull().sum()[X_train.isnull().sum()>0].count()

19

In [89]:
#numerical features
numeric_features=X_train.select_dtypes(include='number').columns.tolist()
print(f'There are {len(numerical_features)} numerical features: \n {numeric_features} \n')

#Categorical features
non_numeric_features=X_train.select_dtypes(exclude='number').columns.tolist()
print(f'There are {len(non_numerical_features)} non_numerical features: \n {non_numeric_features}')

There are 37 numerical features: 
 ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'] 

There are 43 non_numerical features: 
 ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', '

In [75]:
#Using SimpleImputer to fill missing values
#Minmax scaler to normalize numeric data
#onehoeencoder to encode non numeric data
numeric_pipeline=Pipeline(
    steps=
    [('impute',SimpleImputer(strategy='mean')),
     ('scale',MinMaxScaler())])
non_numeric_pipeline=Pipeline(
    steps=[('imputer',SimpleImputer(strategy='most_frequent')),
          ('onehot',OneHotEncoder(handle_unknown='ignore',sparse='False'))])


In [79]:
numeric_pipeline.fit_transform(X_train.select_dtypes(include='number'))

array([[0.68471556, 0.        , 0.14726027, ..., 0.        , 0.09090909,
        1.        ],
       [0.81768334, 0.58823529, 0.16950231, ..., 0.        , 0.45454545,
        0.75      ],
       [0.34338588, 0.23529412, 0.18493151, ..., 0.        , 0.09090909,
        0.5       ],
       ...,
       [0.60383825, 0.17647059, 0.07876712, ..., 0.        , 0.27272727,
        0.25      ],
       [0.16929404, 0.        , 0.18493151, ..., 0.        , 0.45454545,
        0.        ],
       [0.46264565, 0.82352941, 0.01027397, ..., 0.        , 0.27272727,
        0.75      ]])

In [92]:
#we are going to use column transformer to combine both numerical and non numerical data 
#And use fit_transform to get array like data

full_processor=ColumnTransformer(
    transformers=[('number',numeric_pipeline,numeric_features),
                  ('non_numeric',non_numeric_pipeline,non_numeric_features)])

In [95]:
full_processor.fit_transform(X_train)

<1022x287 sparse matrix of type '<class 'numpy.float64'>'
	with 68822 stored elements in Compressed Sparse Row format>

In [103]:
//We are using lasso model to train 
lasso=Lasso(alpha=0.1)
lasso_pipeline=Pipeline(steps=[('preprocessor',full_processor),('model',lasso)])

In [104]:
lasso_pipeline.fit(X_train,y_train)

  model = cd_fast.sparse_enet_coordinate_descent(


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('number',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   MinMaxScaler())]),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                       

In [138]:
predict=lasso_pipeline.predict(X_valid)

In [139]:
mean_absolute_error(y_valid,predict)

16569.590845266077

In [159]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
MAPE(y_valid,predict)

9.579150244615624

In [140]:
lasso_pipeline.score(X_valid,y_valid)

0.9034760093296286

In [160]:
y_valid

1392    123000
1085    147000
786     139000
884     100000
1155    218000
         ...  
1044    278000
910     154300
661     402000
463     188700
118     320000
Name: SalePrice, Length: 438, dtype: int64

In [162]:
#lets use gridseach to find optimal hyperparameters for the model
param_dict={'model__alpha':np.arange(0,100,5)}

search=GridSearchCV(
    lasso_pipeline,
    param_dict,
    scoring="neg_mean_absolute_error")


In [163]:
search.fit(X_train,y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.sparse_enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.sparse_enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.sparse_enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.sparse_enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('number',
                                                                         Pipeline(steps=[('impute',
                                                                                          SimpleImputer()),
                                                                                         ('scale',
                                                                                          MinMaxScaler())]),
                                                                         ['Id',
                                                                          'MSSubClass',
                                                                          'LotFrontage',
                                                                          'LotArea',
                                                                          'OverallQual',
              

In [178]:
print(f'best score {abs(search.best_score_)}')

best score 18182.84650484882


In [179]:
print(f'best params {search.best_params_}')

best params {'model__alpha': 90}


In [180]:
lasso=Lasso(alpha=0.90)
lasso_pipeline=Pipeline(steps=[('preprocessor',full_processor),('model',lasso)])

In [181]:
lasso_pipeline.fit(X_train,y_train)

  model = cd_fast.sparse_enet_coordinate_descent(


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('number',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   MinMaxScaler())]),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'OverallCond',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'MasVnrArea', 'BsmtFinSF1',
                                                   'BsmtFinSF2', 'BsmtUnfSF',
                                                   'TotalBsmtSF', '1stFlrSF',
                                       

In [183]:
mean_absolute_error(y_valid,predict)

16471.060895143808

In [184]:
lasso_pipeline.score(X_valid,y_valid)

0.9040468269073632

In [186]:
(predict.sum()/y_valid.sum())*100

99.23763342294906

In [193]:
r2_score(y_valid, predict)

0.9040468269073632

In [197]:
#we are using XGB regressor to train our data
XGBR=XGB.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, random_state =7, nthread = -1)
XGBR_pipeline=Pipeline(steps=[('preprocessor',full_processor),('model',XGBR)])
XGBR_pipeline.fit(X_train,y_train)
XGBR_predict=XGBR_pipeline.predict(X_valid)

In [198]:
XGBR_pipeline.score(X_valid,y_valid)

0.9283436470280353

In [200]:
mean_absolute_error(y_valid,XGBR_predict)

14074.000472674086

In [201]:
#Comaparitively, XGB regressor perform better than Lasso.