## Housing Price Regression Walkthrough
> let start with some goal to achive on this data_set :
1. Understand and clean the data to ensure it is ready for analysis and modeling.
2. Explore dependance, Data analysis 
3. Basic Data Engineering
4. Experiment with various regression models and tune their hyperparameters.
5. Implement cross-validation to ensure the model generalizes well.
6. Feature Engineering 
7. Conduct error analysis to identify and address the model's shortcomings.
8. Ensembling 
9. Submit the model 

In [3]:
%pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
import os
import seaborn as sns
from IPython.display import display, HTML
SEED = 42

In [5]:
ls = os.path.abspath('H_data_set')

In [6]:
train_df = pd.read_csv(ls + '/train.csv')
test_df = pd.read_csv(ls + '/test.csv')

In [7]:
def scrollable_table(train_df, title,table_id):
    html = f'<h2>{title}</h2>'
    html += f'<div id="{table_id}" style="height:300px; overflow:auto;">'
    html += train_df.to_html()
    html += '</div>'
    return html

In [8]:
df_num = train_df.select_dtypes(include = ['float64', 'int64'])
df_num.describe().T
html_numerical = scrollable_table(df_num.describe().T, 'Numerical Features Summary', 'Summary statistics for numerical features')
display(HTML(html_numerical))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [9]:
df_cat = train_df.select_dtypes(include = ['object'])
df_cat.describe().T
html_numerical = scrollable_table(df_cat.describe().T, 'Categorical Features Summary', 'Summary statistics for categorical features')
display(HTML(html_numerical))

Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [10]:
null_values = (train_df.isnull().sum()/ len(train_df)*100)
html_null = scrollable_table(null_values.to_frame(), 'Null Values', 'Null values in the dataset')
display(HTML(html_null))


Unnamed: 0,0
Id,0.0
MSSubClass,0.0
MSZoning,0.0
LotFrontage,17.739726
LotArea,0.0
Street,0.0
Alley,93.767123
LotShape,0.0
LandContour,0.0
Utilities,0.0


In [11]:
hist_data = go.Histogram(x=train_df['SalePrice'], nbinsx=50, name="Histogram",opacity=0.75, histnorm='probability density',marker=dict(color='purple'))
x_norm = np.linspace(train_df['SalePrice'].min(), train_df['SalePrice'].max(), 100)

y_norm = stats.norm.pdf(x_norm, train_df['SalePrice'].mean(), train_df['SalePrice'].std())
norm_data = go.Scatter(x=x_norm, y=y_norm, mode='lines', name='Normal Distribution')
fig = go.Figure(data=[hist_data, norm_data])

fig.update_layout(
    title='Sale Price Distribution',
    xaxis_title='Sale Price', 
    yaxis_title='Frequency',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='white')
)
fig.show()

In [12]:
qq_data=stats.probplot(train_df['SalePrice'], dist="norm")
qq_fig = px.scatter(x=qq_data[0][0], y=qq_data[0][1], labels={'x':'Theoretical Quantiles', 'y':'Ordered Values'}, color_discrete_sequence=['purple'])
qq_fig.update_layout(
    title='Q-Q Plot for Sale Price',
    xaxis_title='Theoretical Quantiles', 
    yaxis_title='Ordered Values',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='white')
)

slope, intercept, r_value, p_value, std_err = stats.linregress(qq_data[0][0], qq_data[0][1])
line_x = np.array([qq_data[0][0].min(), qq_data[0][0].max()])
line_y = intercept + slope * line_x
line_data = go.Scatter(x=line_x, y=line_y, mode='lines', name='Linear Regression', line=dict(color='green', width=2))
qq_fig.add_trace(line_data)
qq_fig.show()

In [13]:
dwellings_type = train_df['BldgType'].value_counts()
dwellings_price = train_df.groupby('BldgType')['SalePrice'].mean()
formatted_price = dwellings_price.apply(lambda x: f'${x:,.2f}')

In [14]:
#Bar char for dwelling type
fig1 = go.Figure(data = [go.Bar(
    x=dwellings_type.index, 
    y=dwellings_type.values, 
    name='Dwellings Type', 
    marker=dict(color='purple'))]
    )
fig1.update_layout(
    title='Dwellings Type Distribution',
    xaxis_title='Dwellings Type', 
    yaxis_title='count',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='white')
)

In [15]:
#Bar char for dwelling price
fig2 = go.Figure(data = [go.Bar(
    x=dwellings_price.index, 
    y=dwellings_price.values, 
    name='Dwellings Price', 
    marker=dict(color='green'))]
    )
fig2.update_layout(
    title='Dwellings Price Distribution',
    xaxis_title='Dwellings Type', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='white')
)

In [16]:
#Zoning impact on price
zoning_price = train_df.groupby('MSZoning')['SalePrice'].mean()
zoning_type = train_df['MSZoning'].value_counts()
formatted_price = zoning_price.apply(lambda x: f'${x:,.2f}')
fig3 = go.Figure(data = [go.Bar(
    x=zoning_price.index, 
    y=zoning_price.values, 
    name='Zoning Price', 
    marker=dict(color='purple'))]
    )
fig3.update_layout(
    title='Zoning Impact on Price',
    xaxis_title='Zoning Type', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='white')
)
fig3.show()

In [17]:
fig4 = go.Figure(data = [go.Bar(
    x=zoning_type.index, 
    y=zoning_type.values, 
    name='Zoning Type', 
    marker=dict(color='green'))]
    )
fig4.update_layout(
    title='Zoning Distribution',
    xaxis_title='Zoning Type', 
    yaxis_title='count',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='white')
)

In [18]:
#Street and alley impact on price 
street_price = train_df.groupby('Street')['SalePrice'].mean()
alley_price = train_df.groupby('Alley')['SalePrice'].mean()
formatted_street_price = street_price.apply(lambda x: f'${x:,.2f}')
formatted_alley_price = alley_price.apply(lambda x: f'${x:,.2f}')

In [19]:
fig5 = go.Figure(data = [go.Bar(
    x=street_price.index, 
    y=street_price.values, 
    name='Street Price', 
    marker=dict(color='purple'))]
    )
fig5.update_layout(
    title='Street Impact on Price',
    xaxis_title='Street Type', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='silver')
)

In [20]:
fig6 = go.Figure(data = [go.Bar(
    x=alley_price.index, 
    y=alley_price.values, 
    name='Alley Price', 
    marker=dict(color='green'))]
    )
fig6.update_layout(
    title='Alley Impact on Price',
    xaxis_title='Alley Type', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='silver')
)

In [21]:
#Average price per property shape
prop_shape_price = train_df.groupby('LotShape')['SalePrice'].mean()
prop_contour_price = train_df.groupby('LandContour')['SalePrice'].mean()
formatted_prop_shape_price = prop_shape_price.apply(lambda x: f'${x:,.2f}')
formatted_prop_contour_price = prop_contour_price.apply(lambda x: f'${x:,.2f}')

In [22]:
fig7 = go.Figure(data = [go.Bar(
    x=prop_shape_price.index, 
    y=prop_shape_price.values, 
    name='Lot Shape Price', 
    marker=dict(color='purple'))]
    )
fig7.update_layout(
    title='Lot Shape Impact on Price',
    xaxis_title='Lot Shape Type', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='silver')
)


In [23]:
fig8 = go.Figure(data=[go.Bar(
    x=prop_contour_price.index, 
    y=prop_contour_price.values, 
    name='Land Contour Price', 
    marker=dict(color='green'))]
    )
fig8.update_layout(
    title='Land Contour Impact on Price',
    xaxis_title='Land Contour Type', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='silver')
)

In [24]:
#property age impact on price
train_df['PropertyAge'] = train_df['YrSold'] - train_df['YearBuilt']
age_price = train_df.groupby('PropertyAge')['SalePrice'].mean()
fig9 = go.Figure(data =[go.Scatter(
    x=age_price.index, 
    y=age_price.values, 
    name='Property Age Price', 
    mode='lines', 
    marker=dict(color='purple'))]
    )
fig9.update_layout(
    title='Property Age Impact on Price',
    xaxis_title='Property Age', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='silver')
)

In [25]:
age_price_corr = train_df['PropertyAge'].corr(train_df['SalePrice'])
print (f'Correlation betwenn Property Age and Sale Price: {age_price_corr}')

Correlation betwenn Property Age and Sale Price: -0.523350417546816


In [26]:
#Living area impact on price
living_area_price = train_df.groupby('GrLivArea')['SalePrice'].mean()
fig10 = go.Figure(data =[go.Scatter(
    x=living_area_price.index, 
    y=living_area_price.values, 
    name='Living Area Price', 
    mode='lines', 
    marker=dict(color='purple'))]
    )
fig10.update_layout(
    title='Living Area Impact on Price',
    xaxis_title='Living Area', 
    yaxis_title='Average Price',
    legend_title='Data Distribution',
    plot_bgcolor='rgba(32,32,32,1)',
    paper_bgcolor='rgba(32,32,32,1)',
    font=dict(color='silver')
)

In [27]:
living_area_price_corr = train_df['GrLivArea'].corr(train_df['SalePrice'])
print (f'Correlation betwenn Living Area and Sale Price: {living_area_price_corr}')

Correlation betwenn Living Area and Sale Price: 0.7086244776126522


### Data pipeline

In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [29]:
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [30]:
cat_columns = train_df.select_dtypes(include=['object', 'category']).columns
num_columns = train_df.select_dtypes(include=['int64', 'float64']).columns

num_columns = num_columns.drop('SalePrice')
prepocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
        ],remainder='passthrough')
Pipeline = Pipeline(steps=[('preprocessor', prepocessor)])

X= train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']
X_preprocessed = Pipeline.fit_transform(X)

### Model processing 


In [31]:
#Decision tree regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=SEED)
#Define our models

models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=SEED),
    'RandomForest': RandomForestRegressor(random_state=SEED),
    'XGBoost': XGBRegressor(random_state=SEED)
}
#hyperparameters
params = {
    'LinearRegression': {},
    'DecisionTree': {'max_depth': [3, 5, 7, 9, 11],
                    
                     'min_samples_split': [2, 3, 10, 15],
                     },
    'RandomForest': {'n_estimators': [100, 200, 500], 
                     'max_depth': [3, 5, 7, 9, 11],
                     'min_samples_split': [2, 3, 4, 5]
                     }, 
    'XGBoost': {'n_estimators': [100, 200, 500], 
                'max_depth': [3, 5, 7, 9, 11],
                'learning_rate': [0.01, 0.1, 0.3],
                }
}
# 3-fold cross validation
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)
#train and tune models
grid = {}
for name, model in models.items():
    print(f'Training {name}')
    grid[name] = GridSearchCV(model, param_grid=params[name], cv=kf,n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')
    grid[name].fit(X_train, y_train)
    best_params = grid[name].best_params_
    best_score = np.sqrt(-1*grid[name].best_score_)
    print(f'Best parameters for {name} : {best_params}')
    print(f'Best RMSE for  {name} : {best_score}\n')

Training LinearRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters for LinearRegression : {}
Best RMSE for  LinearRegression : 45167.754032055964

Training DecisionTree
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters for DecisionTree : {'max_depth': 9, 'min_samples_split': 15}
Best RMSE for  DecisionTree : 40119.3405114763

Training RandomForest
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Best parameters for RandomForest : {'max_depth': 11, 'min_samples_split': 3, 'n_estimators': 200}
Best RMSE for  RandomForest : 31920.938143817602

Training XGBoost
Fitting 3 folds for each of 45 candidates, totalling 135 fits
Best parameters for XGBoost : {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Best RMSE for  XGBoost : 29361.722647415674



In [33]:
from sklearn.neural_network import MLPRegressor
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

mlp = MLPRegressor(n_iter_no_change=3, max_iter=10000, random_state=SEED, learning_rate_init=0.01)
params = {
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (25)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'solver': ['adam', 'lbfgs'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

grid_search_mlp = GridSearchCV(mlp, param_grid=params, cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')
grid_search_mlp.fit(X_train_scaled, y_train)

print (f'Best parameters for MLP: {grid_search_mlp.best_params_}')
best_score = np.sqrt(-1*grid_search_mlp.best_score_)
print (f'Best RMSE for MLP: {best_score}')


Fitting 3 folds for each of 288 candidates, totalling 864 fits
Best parameters for MLP: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10, 10, 10), 'learning_rate': 'constant', 'solver': 'adam'}
Best RMSE for MLP: 35173.025796732596


In [34]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
for i in grid.keys():
   print( i + '', np.sqrt(mean_squared_error(y_test, grid[i].predict(X_test))))
   

LinearRegression 65338.546930128585
DecisionTree 38782.54208819472
RandomForest 29405.753817161414
XGBoost 25103.062282150586


In [35]:
print( str(np.sqrt(mean_squared_error(grid_search_mlp.predict(X_test_scaled), y_test))) + ' MLP RMSE')

30617.94015948992 MLP RMSE


In [36]:
var_explore = train_df[['Fence','Alley','MiscFeature','PoolQC','FireplaceQu','GarageCond','GarageQual','GarageFinish','GarageType','BsmtExposure','BsmtFinType2',
                        'BsmtFinType1','BsmtCond','BsmtQual','MasVnrType','Electrical','MSZoning','Utilities','Exterior1st','Exterior2nd','KitchenQual','Functional',
                        'SaleType','LotFrontage','GarageYrBlt','MasVnrArea','BsmtFullBath','BsmtHalfBath','GarageCars','GarageArea','TotalBsmtSF']]
display(HTML(scrollable_table(var_explore, 'var_explore', 'List of variables to explore')))

Unnamed: 0,Fence,Alley,MiscFeature,PoolQC,FireplaceQu,GarageCond,GarageQual,GarageFinish,GarageType,BsmtExposure,BsmtFinType2,BsmtFinType1,BsmtCond,BsmtQual,MasVnrType,Electrical,MSZoning,Utilities,Exterior1st,Exterior2nd,KitchenQual,Functional,SaleType,LotFrontage,GarageYrBlt,MasVnrArea,BsmtFullBath,BsmtHalfBath,GarageCars,GarageArea,TotalBsmtSF
0,,,,,,TA,TA,RFn,Attchd,No,Unf,GLQ,TA,Gd,BrkFace,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,65.0,2003.0,196.0,1,0,2,548,856
1,,,,,TA,TA,TA,RFn,Attchd,Gd,Unf,ALQ,TA,Gd,,SBrkr,RL,AllPub,MetalSd,MetalSd,TA,Typ,WD,80.0,1976.0,0.0,0,1,2,460,1262
2,,,,,TA,TA,TA,RFn,Attchd,Mn,Unf,GLQ,TA,Gd,BrkFace,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,68.0,2001.0,162.0,1,0,2,608,920
3,,,,,Gd,TA,TA,Unf,Detchd,No,Unf,ALQ,Gd,TA,,SBrkr,RL,AllPub,Wd Sdng,Wd Shng,Gd,Typ,WD,60.0,1998.0,0.0,1,0,3,642,756
4,,,,,TA,TA,TA,RFn,Attchd,Av,Unf,GLQ,TA,Gd,BrkFace,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,84.0,2000.0,350.0,1,0,3,836,1145
5,MnPrv,,Shed,,,TA,TA,Unf,Attchd,No,Unf,GLQ,TA,Gd,,SBrkr,RL,AllPub,VinylSd,VinylSd,TA,Typ,WD,85.0,1993.0,0.0,1,0,2,480,796
6,,,,,Gd,TA,TA,RFn,Attchd,Av,Unf,GLQ,TA,Ex,Stone,SBrkr,RL,AllPub,VinylSd,VinylSd,Gd,Typ,WD,75.0,2004.0,186.0,1,0,2,636,1686
7,,,Shed,,TA,TA,TA,RFn,Attchd,Mn,BLQ,ALQ,TA,Gd,Stone,SBrkr,RL,AllPub,HdBoard,HdBoard,TA,Typ,WD,,1973.0,240.0,1,0,2,484,1107
8,,,,,TA,TA,Fa,Unf,Detchd,No,Unf,Unf,TA,TA,,FuseF,RM,AllPub,BrkFace,Wd Shng,TA,Min1,WD,51.0,1931.0,0.0,0,0,2,468,952
9,,,,,TA,TA,Gd,RFn,Attchd,No,Unf,GLQ,TA,TA,,SBrkr,RL,AllPub,MetalSd,MetalSd,TA,Typ,WD,50.0,1939.0,0.0,1,0,1,205,991


In [37]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 82 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [38]:
from sklearn.preprocessing import FunctionTransformer

def custom_features(df):
    df_out = df.copy()
    df_out['PropertyAge'] = df_out['YrSold'] - df_out['YearBuilt']
    df_out['TotalSF'] = df_out['TotalBsmtSF'] + df_out['1stFlrSF'] + df_out['2ndFlrSF']
    df_out['TotalBath'] = df_out['FullBath'] + 0.5 * df_out['HalfBath'] + df_out['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
    df_out['HasRemodeled'] = (df_out['YearRemodAdd'] != df_out['YearBuilt']).astype(object)
    df_out['Has2ndFloor'] = (df_out['2ndFlrSF'] > 0).astype(object)
    df_out['HasGarage'] = (df_out['GarageArea'] > 0).astype(object)
    df_out['YrSold_cat'] = df_out['YrSold'].astype(object)
    df_out['MoSold_cat'] = df_out['MoSold'].astype(object)
    df_out['YearBuilt_cat'] = df_out['YearBuilt'].astype(object)
    df_out['MSSubClass_cat'] = df_out['MSSubClass'].astype(object)
    
    return df_out

feature_engineering_transformer = FunctionTransformer(custom_features)

In [39]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca = PCA()
new_cols_categorical = pd.Index(['HasRemodeled', 'Has2ndFloor', 'HasGarage'])
new_cols_numeric = pd.Index(['PropertyAge', 'TotalSF', 'TotalBath', 'YrSold_cat', 'MoSold_cat', 'YearBuilt_cat', 'MSSubClass_cat'])

pipeline_fe = Pipeline(steps=[
    ('feature_engineering', feature_engineering_transformer), 
    ('preprocessor', prepocessor),
    ('pca', pca)])

X = train_df.drop('SalePrice', axis=1)
y = np.log(train_df['SalePrice'])
X_preprocessed = pipeline_fe.fit_transform(X)

In [40]:
from sklearn.model_selection import train_test_split
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X_preprocessed, y, test_size=0.2, random_state=SEED)

models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=SEED),
    'RandomForest': RandomForestRegressor(random_state=SEED),
    'XGBoost': XGBRegressor(random_state=SEED)
}

params = {
    'LinearRegression': {},
    'DecisionTree': {'max_depth': [3, 5, 7, 9, 11],
                     'min_samples_split': [2, 3, 10, 15],
                     },
    'RandomForest': {'n_estimators': [100, 200, 500], 
                     'max_depth': [3, 5, 7, 9, 11],
                     'min_samples_split': [2, 3, 4, 5]
                     }, 
    'XGBoost': {'n_estimators': [100, 200, 500], 
                'max_depth': [3, 5, 7, 9, 11],
                'learning_rate': [0.01, 0.1, 0.3],
                }
}
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)
grid_fe = {}

for name, model in models.items():
    print(f'Training {name}')
    grid_fe[name] = GridSearchCV(model, param_grid=params[name], cv=kf,n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')
    grid_fe[name].fit(X_train, y_train)
    best_params = grid_fe[name].best_params_
    best_score = np.sqrt(-1*grid_fe[name].best_score_)
    print(f'Best parameters for {name} : {best_params}')
    print(f'Best RMSE for  {name} : {best_score}\n')

Training LinearRegression
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters for LinearRegression : {}
Best RMSE for  LinearRegression : 45167.754032055964

Training DecisionTree
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters for DecisionTree : {'max_depth': 9, 'min_samples_split': 15}
Best RMSE for  DecisionTree : 40119.3405114763

Training RandomForest
Fitting 3 folds for each of 60 candidates, totalling 180 fits
Best parameters for RandomForest : {'max_depth': 11, 'min_samples_split': 3, 'n_estimators': 200}
Best RMSE for  RandomForest : 31920.938143817602

Training XGBoost
Fitting 3 folds for each of 45 candidates, totalling 135 fits
Best parameters for XGBoost : {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Best RMSE for  XGBoost : 29361.722647415674



In [41]:
X_train_scaled_fe = X_train_fe.copy()
X_test_scaled_fe = X_test_fe.copy()


from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=42, max_iter=10000, n_iter_no_change=3)

param_grid = {
    'hidden_layer_sizes': [(10,), (10, 10), (10, 25)],
    'activation': ['relu', 'tanh', 'sigmoid'],
    'solver': ['adam', 'sgd'],
    'alpha': [.1, .5, 1, 10, 100],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init' : [0.1]
}


from sklearn.model_selection import GridSearchCV
grid_search_mlp_fe = GridSearchCV(mlp, param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=1)

grid_search_mlp_fe.fit(X_train_scaled_fe, y_train_fe)


print("Best parameters found: ", grid_search_mlp_fe.best_params_)


best_score = np.sqrt(-1 * grid_search_mlp_fe.best_score_)
print("Test score: ", best_score)

Fitting 3 folds for each of 270 candidates, totalling 810 fits




365 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
95 fits failed with the following error:
Traceback (most recent call last):
  File "d:\ANACONDA\envs\jupyter-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\ANACONDA\envs\jupyter-env\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\ANACONDA\envs\jupyter-env\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 751, in fit
    return self._fit(X, y, incremental=False)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Best parameters found:  {'activation': 'tanh', 'alpha': 100, 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.1, 'solver': 'sgd'}
Test score:  0.264683809168366


In [42]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
for i in grid_fe.keys():
   print( i + '', np.sqrt(mean_squared_error(y_test, grid_fe[i].predict(X_test))))

LinearRegression 65338.546930128585
DecisionTree 38782.54208819472
RandomForest 29405.753817161414
XGBoost 25103.062282150586


In [43]:
test_df = pd.read_csv(ls + '/test.csv')

In [44]:
%pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [45]:
test_processed = pipeline_fe.transform(test_df)

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values