House Price Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lazypredict.Supervised import LazyRegressor 
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score


In [5]:
# Load the California housing dataset. as_frame=True is requested so the
# returned object exposes the combined `.frame` that the next cell reads.
california_housing = fetch_california_housing(as_frame=True)

In [9]:
# Single DataFrame holding the eight predictors plus the MedHouseVal target.
df = california_housing["frame"]

In [10]:
# Preview the frame; the rendered output below shows the first and last rows.
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.33,41.00,6.98,1.02,322.00,2.56,37.88,-122.23,4.53
1,8.30,21.00,6.24,0.97,2401.00,2.11,37.86,-122.22,3.58
2,7.26,52.00,8.29,1.07,496.00,2.80,37.85,-122.24,3.52
3,5.64,52.00,5.82,1.07,558.00,2.55,37.85,-122.25,3.41
4,3.85,52.00,6.28,1.08,565.00,2.18,37.85,-122.25,3.42
...,...,...,...,...,...,...,...,...,...
20635,1.56,25.00,5.05,1.13,845.00,2.56,39.48,-121.09,0.78
20636,2.56,18.00,6.11,1.32,356.00,3.12,39.49,-121.21,0.77
20637,1.70,17.00,5.21,1.12,1007.00,2.33,39.43,-121.22,0.92
20638,1.87,18.00,5.33,1.17,741.00,2.12,39.43,-121.32,0.85


In [12]:
# Missing-value count per column (all zeros means no imputation is needed).
df.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [13]:
# Pairwise Pearson correlations between every column, target included.
df.corr(method="pearson")

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
MedInc,1.0,-0.12,0.33,-0.06,0.0,0.02,-0.08,-0.02,0.69
HouseAge,-0.12,1.0,-0.15,-0.08,-0.3,0.01,0.01,-0.11,0.11
AveRooms,0.33,-0.15,1.0,0.85,-0.07,-0.0,0.11,-0.03,0.15
AveBedrms,-0.06,-0.08,0.85,1.0,-0.07,-0.01,0.07,0.01,-0.05
Population,0.0,-0.3,-0.07,-0.07,1.0,0.07,-0.11,0.1,-0.02
AveOccup,0.02,0.01,-0.0,-0.01,0.07,1.0,0.0,0.0,-0.02
Latitude,-0.08,0.01,0.11,0.07,-0.11,0.0,1.0,-0.92,-0.14
Longitude,-0.02,-0.11,-0.03,0.01,0.1,0.0,-0.92,1.0,-0.05
MedHouseVal,0.69,0.11,0.15,-0.05,-0.02,-0.02,-0.14,-0.05,1.0


In [14]:
# Count, mean, spread and quartiles for each numeric column.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.87,28.64,5.43,1.1,1425.48,3.07,35.63,-119.57,2.07
std,1.9,12.59,2.47,0.47,1132.46,10.39,2.14,2.0,1.15
min,0.5,1.0,0.85,0.33,3.0,0.69,32.54,-124.35,0.15
25%,2.56,18.0,4.44,1.01,787.0,2.43,33.93,-121.8,1.2
50%,3.53,29.0,5.23,1.05,1166.0,2.82,34.26,-118.49,1.8
75%,4.74,37.0,6.05,1.1,1725.0,3.28,37.71,-118.01,2.65
max,15.0,52.0,141.91,34.07,35682.0,1243.33,41.95,-114.31,5.0


In [15]:
# Separate the regression target from the predictor columns.
y = df['MedHouseVal']
x = df.drop(columns='MedHouseVal')

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# LazyPredict benchmark runner: fits its whole catalogue of regressors with
# default hyper-parameters so they can be ranked against each other.
#
# predictions=True is required for fit() to return a per-model prediction
# frame as its second value. With the library default (False) the score table
# is returned twice, so the `regPredictions` name unpacked from fit() would
# not actually contain predictions.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=True,
)

In [None]:
# Fit every LazyPredict regressor on the training split and evaluate it on the
# held-out split. The first returned value is the score table printed below.
# NOTE(review): the second value only holds per-model predictions when the
# LazyRegressor was constructed with predictions=True; confirm against the
# lazypredict documentation.
regModels, regPredictions = reg.fit(x_train, x_test, y_train, y_test)

 24%|██▍       | 10/42 [00:11<00:41,  1.29s/it]

![Executed in Colab](hsp_lazypredict_regression.png)

In [None]:
print(regModels)

# The leaderboard's first row is treated as the winner: its index label is
# the model name and its 'R-Squared' column is the score that gets reported.
top_row = regModels.iloc[0]
bestModel = top_row.name
bestModel_r2 = top_row['R-Squared']

print(f"\nLazyPredict's Best Regression Model: {bestModel} with R-squared: {bestModel_r2:.4f}")

# Results from the Colab run
'''                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
LGBMRegressor                                0.84       0.84  0.46        0.34
HistGradientBoostingRegressor                0.84       0.84  0.46        0.45
XGBRegressor                                 0.83       0.83  0.47        0.46
ExtraTreesRegressor                          0.81       0.81  0.50        6.38
RandomForestRegressor                        0.80       0.81  0.51       18.26
BaggingRegressor                             0.78       0.78  0.53        1.78
MLPRegressor                                 0.78       0.78  0.54       19.70
GradientBoostingRegressor                    0.78       0.78  0.54        4.97
NuSVR                                        0.73       0.73  0.59       19.30
SVR                                          0.73       0.73  0.60       15.86
KNeighborsRegressor                          0.67       0.67  0.66        0.38
DecisionTreeRegressor                        0.62       0.62  0.70        0.34
SGDRegressor                                 0.58       0.58  0.74        0.09
OrthogonalMatchingPursuitCV                  0.58       0.58  0.74        0.04
ElasticNetCV                                 0.58       0.58  0.74        0.26
LassoCV                                      0.58       0.58  0.74        0.26
LarsCV                                       0.58       0.58  0.74        0.06
LassoLarsCV                                  0.58       0.58  0.74        0.09
BayesianRidge                                0.58       0.58  0.75        0.05
Ridge                                        0.57       0.58  0.75        0.02
RidgeCV                                      0.57       0.58  0.75        0.09
Lars                                         0.57       0.58  0.75        0.03
TransformedTargetRegressor                   0.57       0.58  0.75        0.03
LassoLarsIC                                  0.57       0.58  0.75        0.03
LinearRegression                             0.57       0.58  0.75        0.04
HuberRegressor                               0.56       0.56  0.76        0.12
LinearSVR                                    0.56       0.56  0.76        1.22
ExtraTreeRegressor                           0.54       0.54  0.77        0.16
AdaBoostRegressor                            0.53       0.53  0.78        0.80
OrthogonalMatchingPursuit                    0.46       0.46  0.84        0.02
PoissonRegressor                             0.44       0.44  0.86        0.03
RANSACRegressor                              0.40       0.40  0.89        0.25
TweedieRegressor                             0.39       0.39  0.90        0.05
GammaRegressor                               0.37       0.37  0.91        0.04
ElasticNet                                   0.20       0.20  1.02        0.02
DummyRegressor                              -0.00      -0.00  1.14        0.02
LassoLars                                   -0.00      -0.00  1.14        0.02
Lasso                                       -0.00      -0.00  1.14        0.03
QuantileRegressor                           -0.05      -0.05  1.17       11.68
PassiveAggressiveRegressor                  -0.27      -0.27  1.29        0.04
KernelRidge                                 -2.72      -2.71  2.21       62.89
GaussianProcessRegressor                 -1868.07   -1864.45 49.44      174.08

LazyPredict's Best Regression Model: LGBMRegressor with R-squared: 0.8386
'''

In [1]:
from lightgbm import LGBMRegressor

In [None]:
# Base LightGBM regressor handed to the grid search below; the fixed
# random_state keeps its fits repeatable between runs.
lgbm_regressor = LGBMRegressor(random_state=42)

In [None]:
# Hyper-parameter search space for LGBMRegressor (3 * 2 * 2 * 2 * 2 = 48 candidates).
#
# max_depth and num_leaves interact: a tree of depth d can have at most 2**d
# leaves. The previous depths [1, 5] capped trees at 2 and 32 leaves, so
# switching num_leaves between 31 and 50 barely changed the model and many
# candidates were near-duplicates. The depths below are:
#   -1 -> no depth limit, so num_leaves alone controls tree size
#    6 -> at most 64 leaves, so both num_leaves values remain reachable
grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 50],
    'max_depth': [-1, 6],
    'min_child_samples': [20, 30],
}

In [None]:
# Exhaustive 5-fold search over `grid`. Candidates are ranked by negated MSE,
# so a higher score means a lower error.
lgbmGS = GridSearchCV(
    estimator=lgbm_regressor,
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=8,
    verbose=1,
)

In [None]:
# Run the cross-validated search on the training split only; the test split
# stays untouched until the final evaluation.
lgbmGS.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947

In [None]:
# Pull the winning configuration and its fitted estimator out of the
# finished grid search.
bestParameters = lgbmGS.best_params_
bestModel = lgbmGS.best_estimator_

print("Best Parameters for LGBMRegressor:", bestParameters)
print("\nBest Model for LGBMRegressor:", bestModel)


In [None]:
# Predict the target for the held-out split with the estimator selected by
# the grid search.
y_pred = bestModel.predict(x_test)   

In [None]:
# Held-out error metrics: RMSE is the square root of the test MSE, and R² is
# computed from the same predictions.
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse_test:.4f}")
print(f"Test R²: {r2_test:.4f}")


In [None]:
# For a regressor, .score() returns the coefficient of determination (R²) on
# the given data. It is not a classification accuracy, so the printed label
# now names the metric correctly.
final_r2 = bestModel.score(x_test, y_test)
accuracy = final_r2  # legacy name kept so any later cell reading it still works
print(f"Final R² (bestModel.score): {final_r2:.4f}")