In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.ticker as mtick
plt.style.use('ggplot')
import seaborn as sns
pd.options.display.float_format = '{:,.0f}'.format
import dataframe_image as dfi
from scipy import stats
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
lm = LinearRegression()
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
encoder = OneHotEncoder(sparse_output=False)
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
#Import Imputeed Ames Data
Ames = pd.read_csv('Ames_HousePrice_Imputed.csv')

#Update with new Ames Data
Ames_Columns_Numeric = Ames[[
    '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath',
    'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr',
    'LotArea', 'LotFrontage', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'MoSold',
    'OpenPorchSF', 'PoolArea', 'ScreenPorch', 'TotalBsmtSF', 'TotRmsAbvGrd', 'WoodDeckSF',
    'YearBuilt', 'YearRemodAdd', 'YrSold'
]]
Ames_Columns_Ordinal = Ames[[ 
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual',
    'Electrical', 'ExterCond', 'ExterQual', 'Fence', 'FireplaceQu', 'Functional',
    'GarageCond', 'GarageFinish', 'GarageQual', 'HeatingQC', 'KitchenQual',
    'LandSlope', 'LotShape', 'OverallCond', 'OverallQual', 'PavedDrive',
    'PoolQC', 'Utilities'
]]
Ames_Columns_Nominal = Ames[[ 
    'Alley', 'BldgType', 'CentralAir', 'Condition1', 'Condition2',
    'Exterior1st', 'Exterior2nd', 'Foundation', 'GarageType', 'Heating',
    'HouseStyle', 'LandContour', 'LotConfig', 'MasVnrType', 'MiscFeature',
    'MSSubClass', 'MSZoning', 'Neighborhood', 'RoofMatl', 'RoofStyle',
    'SaleCondition', 'SaleType', 'Street'
]].astype(str)

#Combination Table of Numeric & Ordinal Columns
Ames_Columns_Numeric_Ordinal = Ames[
    list(Ames_Columns_Numeric.columns) + list(Ames_Columns_Ordinal.columns)
]
#Combination Table of Nominal & Ordinal Columns
Ames_Columns_Nominal_Ordinal = Ames[
    list(Ames_Columns_Nominal.columns) + list(Ames_Columns_Ordinal.columns)
]

In [4]:
Y = Ames['SalePrice']
lm = LinearRegression()
encoder = OneHotEncoder(drop='first', sparse_output=False)

In [5]:
encoder_nominal = OneHotEncoder(drop='first', sparse_output=False)
encoder_ordinal = OneHotEncoder(drop='first', sparse_output=False)
Nominal_cols = Ames_Columns_Nominal.columns
Ordinal_cols = Ames_Columns_Ordinal.columns
Numeric_cols = Ames_Columns_Numeric.columns

# --- One-Hot Encode Nominal ---
Ames_Optimal_Nominal_One_Hot = encoder_nominal.fit_transform(Ames[Nominal_cols])
Ames_Optimal_Nominal_One_Hot_df = pd.DataFrame(
    Ames_Optimal_Nominal_One_Hot,
    columns=encoder_nominal.get_feature_names_out(Nominal_cols),
    index=Ames.index
)
#One-Hot Encode Ordinal
Ames_Optimal_Ordinal_One_Hot = encoder_ordinal.fit_transform(Ames[Ordinal_cols])
Ames_Optimal_Ordinal_One_Hot_df = pd.DataFrame(
    Ames_Optimal_Ordinal_One_Hot,
    columns=encoder_ordinal.get_feature_names_out(Ordinal_cols),
    index=Ames.index
)

#Merge Nominal & Ordinal Encodings
Ames_Optimal_Categorical_One_Hot = pd.merge(
    Ames_Optimal_Nominal_One_Hot_df,
    Ames_Optimal_Ordinal_One_Hot_df,
    left_index=True,
    right_index=True,
    how='left'
)
#Merge with Numeric Features
Ames_MLR_Optimal_Columns = pd.merge(
    Ames[Numeric_cols],
    Ames_Optimal_Categorical_One_Hot,
    left_index=True,
    right_index=True,
    how='left'
)

In [6]:
scaler_Y = StandardScaler()
scaler_X = StandardScaler()
Y_scaled = Y  #scaler_Y.fit_transform(Y.values.reshape(-1, 1)).flatten()
X_scaled = pd.DataFrame(
    scaler_X.fit_transform(Ames_MLR_Optimal_Columns),
    columns=Ames_MLR_Optimal_Columns.columns,
    index=Ames.index
)
#X_scaled = pd.concat([X_numeric_scaled, Ames_Optimal_Categorical_One_Hot], axis=1)

In [7]:
kf = KFold(n_splits=5, shuffle=True, random_state=2)
pipelines = {
    'Ridge': Pipeline(steps=[('regressor', Ridge())]),
    'Lasso': Pipeline(steps=[('regressor', Lasso(max_iter=100))]),
    'ElasticNet': Pipeline(steps=[('regressor', ElasticNet(max_iter=100))])
}
cv_results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X_scaled, Y_scaled, cv=kf, scoring='r2', n_jobs=-1)
    cv_results[name] = {
        'mean_R2': float(round(scores.mean(), 4)),
        'fold_R2': [float(round(s, 4)) for s in scores]
    }
print('Pipeline Cross_Val_Score Results (alpha=1)')
cv_results

Pipeline Cross_Val_Score Results (alpha=1)


{'Ridge': {'mean_R2': 0.8955,
  'fold_R2': [0.8993, 0.926, 0.8076, 0.9217, 0.9227]},
 'Lasso': {'mean_R2': 0.8972,
  'fold_R2': [0.8985, 0.9265, 0.8156, 0.9218, 0.9237]},
 'ElasticNet': {'mean_R2': 0.8994,
  'fold_R2': [0.8928, 0.92, 0.8519, 0.9115, 0.9206]}}

In [8]:
#Ridge Regression- Optimal Alpha via GridSearchCV
'''
ridge = Ridge()
alpha_ridge_grid = np.linspace(1, 200, 100)

# Setup Grid Search for Ridge
ridge_grid = GridSearchCV(estimator=pipelines['Ridge'],
                          param_grid={'regressor__alpha': alpha_ridge_grid},
                          verbose=1)
ridge_grid.fit(X_scaled, Y_scaled)
 
#Best alpha & score
print('\nRidge Pipeline GridSearchCV Results\n')
ridge_best_alpha = ridge_grid.best_params_['regressor__alpha']
ridge_best_score = ridge_grid.best_score_
print(f"Best Alpha for Ridge: {ridge_best_alpha}")
print(f"Best CV Score: {round(ridge_best_score, 4)}")
'''

'\nridge = Ridge()\nalpha_ridge_grid = np.linspace(1, 200, 100)\n\n# Setup Grid Search for Ridge\nridge_grid = GridSearchCV(estimator=pipelines[\'Ridge\'],\n                          param_grid={\'regressor__alpha\': alpha_ridge_grid},\n                          verbose=1)\nridge_grid.fit(X_scaled, Y_scaled)\n\n#Best alpha & score\nprint(\'\nRidge Pipeline GridSearchCV Results\n\')\nridge_best_alpha = ridge_grid.best_params_[\'regressor__alpha\']\nridge_best_score = ridge_grid.best_score_\nprint(f"Best Alpha for Ridge: {ridge_best_alpha}")\nprint(f"Best CV Score: {round(ridge_best_score, 4)}")\n'

In [9]:
#Ridge Regression- Optimal Alpha via RidgeCV
'''
ridge_cv = RidgeCV(alphas=np.linspace(1, 1000, 100), cv=kf, scoring='r2')
ridge_cv.fit(X_scaled, Y_scaled)

print('RidgeCV Results\n')
alpha_opt_ridge = ridge_cv.alpha_
print(f"Optimal Ridge Alpha: {alpha_opt_ridge:.4f}")

#Fit Ridge with optimal alpha
ridge.set_params(alpha=alpha_opt_ridge)
ridge.fit(X_scaled, Y_scaled)
# Coefficients as a Series with feature names
ridge_coefs = pd.Series(ridge.coef_, index=X_scaled.columns)
ridge.score(X_scaled, Y_scaled)

#Scoring & Coefficients
cv_score_ridge = cross_val_score(ridge, X_scaled, Y_scaled, cv=kf, scoring='r2')
print(f"Mean R²: {cv_score_ridge.mean():.4f};  ", f"Fold R² scores: {', '.join([f'{s:.4f}' for s in cv_score_ridge])}")

ridge_coefs_formatted = ridge_coefs.sort_values(key=abs, ascending=False)
pd.set_option('display.float_format', lambda x: f'{x:.4f}')
print("\nTop 20 largest coefficients (by absolute value):")
print(ridge_coefs_formatted.head(20))

#Zero Coefficients
zero_coefs_ridge = ridge_coefs[ridge_coefs == 0].index.tolist()
print(f"\nNumber of features with zero coefficient: {len(zero_coefs_ridge)}")
print("Zero coefficient features:")
print(zero_coefs_ridge)
'''

'\nridge_cv = RidgeCV(alphas=np.linspace(1, 1000, 100), cv=kf, scoring=\'r2\')\nridge_cv.fit(X_scaled, Y_scaled)\n\nprint(\'RidgeCV Results\n\')\nalpha_opt_ridge = ridge_cv.alpha_\nprint(f"Optimal Ridge Alpha: {alpha_opt_ridge:.4f}")\n\n#Fit Ridge with optimal alpha\nridge.set_params(alpha=alpha_opt_ridge)\nridge.fit(X_scaled, Y_scaled)\n# Coefficients as a Series with feature names\nridge_coefs = pd.Series(ridge.coef_, index=X_scaled.columns)\nridge.score(X_scaled, Y_scaled)\n\n#Scoring & Coefficients\ncv_score_ridge = cross_val_score(ridge, X_scaled, Y_scaled, cv=kf, scoring=\'r2\')\nprint(f"Mean R²: {cv_score_ridge.mean():.4f};  ", f"Fold R² scores: {\', \'.join([f\'{s:.4f}\' for s in cv_score_ridge])}")\n\nridge_coefs_formatted = ridge_coefs.sort_values(key=abs, ascending=False)\npd.set_option(\'display.float_format\', lambda x: f\'{x:.4f}\')\nprint("\nTop 20 largest coefficients (by absolute value):")\nprint(ridge_coefs_formatted.head(20))\n\n#Zero Coefficients\nzero_coefs_ridge =

In [16]:
#Lasso Regression- Optimal Alpha via GridSearchCV
lasso = Lasso()
alpha_lasso_grid = np.linspace(150, 300, 100)
 
# Setup Grid Search for Ridge
lasso_grid = GridSearchCV(estimator=pipelines['Lasso'],
                          param_grid={'regressor__alpha': alpha_lasso_grid},
                          verbose=1)
lasso_grid.fit(X_scaled, Y_scaled)
print('\nLasso Pipeline GridSearchCV Results\n')

#Best alpha & score
lasso_best_alpha = lasso_grid.best_params_['regressor__alpha']
lasso_best_score = lasso_grid.best_score_
print(f"Best Alpha for lasso: {lasso_best_alpha:.6f}")
print(f"Best CV Score {round(lasso_best_score, 4)}")

lasso_cv_results_df = pd.DataFrame(lasso_grid.cv_results_)
lasso_best_row = lasso_cv_results_df.loc[lasso_cv_results_df['param_regressor__alpha'] == lasso_best_alpha]
lasso_fold_scores = [lasso_best_row[f'split{i}_test_score'].values[0] for i in range(5)]
lasso_results_df = pd.DataFrame({
    'Engineering': ['Pre'],
    'Model_Category': ['Penalized_MLR'],
	'Model_Scaling': ['X_Numeric, X_Categorical'],
	'Model': ['Lasso'],
    'Alpha': [lasso_best_alpha],
    'Mean_R²': [lasso_best_score],
    **{f'Fold_{i+1}_R²': [lasso_fold_scores[i]] for i in range(5)}
})

Fitting 5 folds for each of 100 candidates, totalling 500 fits

Lasso Pipeline GridSearchCV Results

Best Alpha for lasso: 281.818182
Best CV Score 0.9161


In [29]:
#Lasso Regression- Optimal Alpha via LassoCV
lasso_cv = LassoCV(alphas=np.linspace(150, 300, 100), cv=kf, n_jobs=-1, max_iter=5000)
lasso_cv.fit(X_scaled, Y_scaled)

print('LassoCV Results\n')
alpha_opt_lasso = lasso_cv.alpha_
print(f"Optimal Lasso Alpha: {alpha_opt_lasso:.4f}")

#Fit Lasso with optimal alpha
lasso = Lasso(alpha=alpha_opt_lasso, max_iter=5000)
lasso.fit(X_scaled, Y_scaled)

lasso_coefs = pd.Series(lasso.coef_, index=X_scaled.columns)
lasso_coefs_formatted = lasso_coefs.sort_values(key=abs, ascending=False)
pd.set_option('display.float_format', lambda x: f'{x:.10f}')

#Scores & Coefficients
cv_scores_lasso = cross_val_score(lasso, X_scaled, Y_scaled, cv=kf, scoring='r2')
print(f"\nMean R² Scores: {cv_scores_lasso.mean():.4f};  ", f"Fold R² scores: {', '.join([f'{s:.4f}' for s in cv_scores_lasso])}")

print("\nTop 20 largest coefficients:")
print(lasso_coefs_formatted.head(20))

#Zero Coefficients
zero_coefs_lasso = lasso_coefs[lasso_coefs == 0].index.tolist()
non_zero_coefs_lasso = lasso_coefs[~lasso_coefs.index.isin(zero_coefs_lasso)]
print(f"\nNumber of features with Non-Zero coefficient: {len(non_zero_coefs_lasso)}")
print(f"\nNumber of features with Zero coefficient: {len(zero_coefs_lasso)}")
print("Zero coefficient features:")
print(zero_coefs_lasso)

LassoCV Results

Optimal Lasso Alpha: 233.3333

Mean R² Scores: 0.9071;   Fold R² scores: 0.8997, 0.9333, 0.8434, 0.9285, 0.9304

Top 20 largest coefficients:
GrLivArea               25841.8335939107
YearBuilt                9204.5546206589
OverallQual_8            8955.0619107594
BsmtFinSF1               8144.4303571773
OverallQual_7            6913.2053214350
TotalBsmtSF              6653.9642813660
OverallQual_9            6508.3724408799
Condition2_PosN         -4591.9942230389
Neighborhood_NoRidge     4532.6847605861
Neighborhood_StoneBr     4386.9566462503
LotArea                  4318.3694827937
Neighborhood_Somerst     4317.7652148681
BsmtQual_5               3886.7832616795
Fireplaces               3562.9301419227
SaleCondition_Partial    3539.8965860514
Neighborhood_NridgHt     3426.2846110220
BsmtExposure_4           3281.4804350486
MasVnrArea               3225.0278478731
ExterQual_3              3186.5739840040
Neighborhood_Crawfor     3162.0888096254
dtype: float64

Numbe

In [39]:
#ElasticNet Regression- Optimal Alpha via GridSearchCV
l1_ratio = np.linspace(0.05, 1, 20)
alpha_elastic_grid = np.linspace(200, 350, 100)

#Setup Grid Search for Ridge
elasticnet_grid = GridSearchCV(estimator=pipelines['ElasticNet'],
                               param_grid={'regressor__alpha': alpha_elastic_grid,
                                           'regressor__l1_ratio': l1_ratio},
                               verbose=1)
elasticnet_grid.fit(X_scaled, Y_scaled)


print('\nElasticNet Pipeline GridSearchCV Results\n')

#Best alpha & score
elasticnet_best_rho = elasticnet_grid.best_params_['regressor__l1_ratio']
elasticnet_best_alpha = elasticnet_grid.best_params_['regressor__alpha']
elasticnet_best_score = elasticnet_grid.best_score_
print(f"Best Rho for ElasticNet: {elasticnet_best_rho:.4f}")
print(f"Best Alpha for ElasticNet: {elasticnet_best_alpha:.6f}")
print(f"Best CV Score {round(elasticnet_best_score, 4)}")

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits

ElasticNet Pipeline GridSearchCV Results

Best Rho for ElasticNet: 1.0000
Best Alpha for ElasticNet: 281.818182
Best CV Score 0.9161


In [37]:
#ElasticNet Regression- Optimal Alpha via ElasticNetCV
elastic_cv = ElasticNetCV(
    l1_ratio=np.linspace(0.05, 1, 20),
    alphas=np.linspace(.001, 1000, 100),
    cv=5, max_iter=5000, n_jobs=-1
)
elastic_cv.fit(X_scaled, Y_scaled)

print('ElasticNetCV Results\n')
best_alpha_elastic = elastic_cv.alpha_
best_l1_elastic = elastic_cv.l1_ratio_
print(f"Optimal ElasticNet l1_ratio: {best_l1_elastic:.2f}")
print(f"Optimal ElasticNet Alpha: {best_alpha_elastic:.4f}")


#Build final ElasticNet model using best parameters
final_en = ElasticNet(alpha=best_alpha_elastic, l1_ratio=best_l1_elastic, max_iter=10000)
final_en.fit(X_scaled, Y_scaled)

#Scires
cv_scores_elasticnet = cross_val_score(final_en, X_scaled, Y_scaled, cv=kf, scoring='r2')
print(f"Mean CV R²: {cv_scores_elasticnet.mean():.4f};  ", f"Fold R² scores: {', '.join([f'{s:.4f}' for s in cv_scores_elasticnet])}\n")

#Coefficient Series
coef_elastic = pd.Series(final_en.coef_, index=X_scaled.columns)
zero_features_elastic = coef_elastic[coef_elastic == 0].index.tolist()
nonzero_features_elastic = coef_elastic[coef_elastic != 0]

top20_elastic = nonzero_features_elastic.reindex(nonzero_features_elastic.abs().sort_values(ascending=False).index).head(20)
pd.set_option('display.float_format', lambda x: f'{x:.4f}')
print("Top 20 largest non-zero coefficients (abs, 4 decimals):")
print(top20_elastic)

#Zero Coefficients
print(f"\nNumber of features with Non-Zero coefficient: {len(nonzero_features_elastic)}")
print(f"\nNumber of features with Zero coefficient: {len(zero_features_elastic)}")
print("Zero coefficient features:")
print(zero_features_elastic)

ElasticNetCV Results

Optimal ElasticNet l1_ratio: 1.00
Optimal ElasticNet Alpha: 282.8290
Mean CV R²: 0.9070;   Fold R² scores: 0.8997, 0.9331, 0.8428, 0.9286, 0.9308

Top 20 largest non-zero coefficients (abs, 4 decimals):
GrLivArea               25870.3623
YearBuilt                8998.7778
OverallQual_8            8960.0450
BsmtFinSF1               7937.8698
OverallQual_7            6945.5436
TotalBsmtSF              6787.1084
OverallQual_9            6547.3435
Condition2_PosN         -4527.7164
Neighborhood_NoRidge     4474.2833
Neighborhood_StoneBr     4290.8286
Neighborhood_Somerst     4170.6844
LotArea                  4138.4414
BsmtQual_5               3943.0908
Fireplaces               3556.6063
SaleCondition_Partial    3434.4722
Neighborhood_NridgHt     3332.2320
BsmtExposure_4           3316.1914
ExterQual_3              3223.9510
Neighborhood_Crawfor     3184.5965
MasVnrArea               3147.3182
dtype: float64

Number of features with Non-Zero coefficient: 162

Number o

In [42]:
#ElasticNet.5 Regression where Rho = .5- Optimal Alpha via GridSearchCV
l1_ratio_5 = [.5]
alpha_elastic5_grid = np.linspace(.01, 1, 100)
 
#Setup Grid Search for Ridge
elasticnet5_grid = GridSearchCV(estimator=pipelines['ElasticNet'],
                               param_grid={'regressor__alpha': alpha_elastic5_grid,
                                           'regressor__l1_ratio': l1_ratio_5},
                               verbose=1)
elasticnet5_grid.fit(X_scaled, Y_scaled)

print('\nElasticNet Pipeline GridSearchCV Results (Rho = .5)\n')

#Best alpha & score
elasticnet5_best_rho = elasticnet5_grid.best_params_['regressor__l1_ratio']
elasticnet5_best_alpha = elasticnet5_grid.best_params_['regressor__alpha']
elasticnet5_best_score = elasticnet5_grid.best_score_
print(f"Best Rho for ElasticNet: {elasticnet5_best_rho:.4f}")
print(f"Best Alpha for ElasticNet: {elasticnet5_best_alpha:.6f}")
print(f"Best CV Score {round(elasticnet5_best_score, 4)}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits

ElasticNet Pipeline GridSearchCV Results (Rho = .5)

Best Rho for ElasticNet: 0.5000
Best Alpha for ElasticNet: 0.160000
Best CV Score 0.9154


In [41]:
#ElasticNetCV where Rho = .5
elastic5_cv = ElasticNetCV(
    l1_ratio=.5,
    alphas=np.linspace(.01, 2, 100),
    cv=5,
    max_iter=5000,
    n_jobs=-1
)
elastic5_cv.fit(X_scaled, Y_scaled)

print('ElasticNetCV Results (Rho = .5)\n')
best_alpha_elastic5 = elastic5_cv.alpha_
best_l1_elastic5 = elastic5_cv.l1_ratio_
print(f"Optimal ElasticNet.5 l1_ratio: {best_l1_elastic5:.2f}")
print(f"Optimal ElasticNet.5 Alpha: {best_alpha_elastic5:.4f}")

#Build final ElasticNet model using best parameters
final_en = ElasticNet(alpha=best_alpha_elastic5, l1_ratio=best_l1_elastic5, max_iter=10000)
final_en.fit(X_scaled, Y_scaled)  #Fit final model

#Scores
cv_scores_elasticnet5 = cross_val_score(final_en, X_scaled, Y_scaled, cv=kf, scoring='r2')
print(f"Mean CV R²: {cv_scores_elasticnet5.mean():.4f};  ", f"Fold R² scores: {', '.join([f'{s:.4f}' for s in cv_scores_elasticnet5])}\n")

#Coefficients as a Series
coef_elastic5 = pd.Series(final_en.coef_, index=X_scaled.columns)

#Summary of coefficients
zero_features_elastic5 = coef_elastic5[coef_elastic5 == 0].index.tolist()
nonzero_features_elastic5 = coef_elastic5[coef_elastic5 != 0]

top20_elastic5 = nonzero_features_elastic5.reindex(nonzero_features_elastic5.abs().sort_values(ascending=False).index).head(20)
pd.set_option('display.float_format', lambda x: f'{x:.4f}')
print("Top 20 largest non-zero coefficients (abs, 4 decimals):")
print(top20_elastic5)

#Zero coefficients
print(f"\nNumber of features with Non-Zero coefficient: {len(nonzero_features_elastic5)}")
print(f"\nNumber of features with Zero coefficient: {len(zero_features_elastic5)}")
print("Zero coefficient features:")
print(zero_features_elastic5)

ElasticNetCV Results (Rho = .5)

Optimal ElasticNet.5 l1_ratio: 0.50
Optimal ElasticNet.5 Alpha: 0.1708
Mean CV R²: 0.9060;   Fold R² scores: 0.8994, 0.9303, 0.8491, 0.9233, 0.9278

Top 20 largest non-zero coefficients (abs, 4 decimals):
GrLivArea              11188.2996
TotalBsmtSF             7281.9911
1stFlrSF                7026.9246
BsmtFinSF1              6902.2835
OverallQual_8           6768.0131
2ndFlrSF                6694.8054
OverallQual_9           5216.9920
YearBuilt               5112.6371
LotArea                 4534.1325
Neighborhood_NoRidge    4505.6109
OverallQual_7           4491.5422
Neighborhood_StoneBr    4211.6765
Fireplaces              4165.7695
Condition2_PosN        -3969.5079
MasVnrArea              3924.8683
BsmtExposure_4          3525.2195
BsmtQual_5              3524.4313
Neighborhood_Somerst    3372.6825
ExterQual_3             3254.7440
OverallQual_4          -3232.0272
dtype: float64

Number of features with Non-Zero coefficient: 296

Number of featu

In [43]:
#Set optimal hyperparameters
pipelines['Ridge'].set_params(regressor__alpha=ridge_best_alpha)
pipelines['Lasso'].set_params(regressor__alpha=lasso_best_alpha)
pipelines['ElasticNet'].set_params(regressor__alpha=elasticnet5_best_alpha, regressor__l1_ratio=0.5)

#Cross-validation
cv_results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X_scaled, Y_scaled, cv=kf, scoring='r2', n_jobs=-1)
    cv_results[name] = {
        'mean_R2': float(round(scores.mean(), 4)),
        'fold_R2': [float(round(s, 4)) for s in scores]
    }
print('Pipeline Cross_Val_Score Results (Optimal Alpha)')
cv_results

Pipeline Cross_Val_Score Results (Optimal Alpha)


{'Ridge': {'mean_R2': 0.906,
  'fold_R2': [0.8995, 0.9304, 0.8489, 0.9234, 0.9278]},
 'Lasso': {'mean_R2': 0.9069,
  'fold_R2': [0.8997, 0.933, 0.8429, 0.9284, 0.9308]},
 'ElasticNet': {'mean_R2': 0.906,
  'fold_R2': [0.8995, 0.9304, 0.8489, 0.9235, 0.9278]}}

In [17]:
lasso_results_df

Unnamed: 0,Engineering,Model_Category,Model_Scaling,Model,Alpha,Mean_R²,Fold_1_R²,Fold_2_R²,Fold_3_R²,Fold_4_R²,Fold_5_R²
0,Pre,Penalized_MLR,"X_Numeric, X_Categorical",Lasso,282,1,1,1,1,1,1


In [18]:
lasso_results_df.to_csv('Penalized_MLR_Scaling_XNumeric_XCategorical.csv', index=False)