In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tabulate import tabulate
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the dataset using Pandas
file_path = r"E:\Deployment of House price\archive\boston.csv"
boston_df = pd.read_csv(file_path)

# Create a new DataFrame 'dataset' using the entire 'boston_df'
dataset = boston_df.copy()

X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [3]:
import pickle

# Assuming 'scaler' is the StandardScaler object
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [4]:
models = [
    
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Support Vector Machine', SVR()),
    ('XGBoost', XGBRegressor()),
    ('LightGBM', LGBMRegressor()),
    ('K-Nearest Neighbors', KNeighborsRegressor())
]

In [5]:
results = []

for name, model in models:
    # Fit the model directly
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    evaluation = (mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))

    results.append([name, evaluation[0], evaluation[1]])


# Display results in a table
headers = ["Model", "MSE", "R-Squared"]
print(tabulate(results, headers=headers))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 880
[LightGBM] [Info] Number of data points in the train set: 354, number of used features: 13
[LightGBM] [Info] Start training from score 23.015819
Model                        MSE    R-Squared
----------------------  --------  -----------
Linear Regression       21.5174      0.711226
Ridge Regression        21.5487      0.710807
Lasso Regression        26.5325      0.643922
ElasticNet              27.4901      0.63107
Decision Tree           19.6668      0.736062
Random Forest            9.60433     0.871106
Support Vector Machine  25.9572      0.651643
XGBoost                  9.46656     0.872954
LightGBM                11.7014      0.842962
K-Nearest Neighbors     18.835       0.747225


In [6]:
# Display results for all models
headers = ["Model", "MSE", "R-Squared"]
print(tabulate(results, headers=headers))

# Sort models by closeness to RMSE value of 1
results.sort(key=lambda x: abs(np.sqrt(x[1]) - 1))

# Filter models with RMSE closest to 1
close_models = [(name, mse, r_squared) for name, mse, r_squared in results]

# Print models with RMSE closest to 1
if close_models:
    print("\nModels with RMSE closest to 1:")
    print(tabulate(close_models[:3], headers=headers))
else:
    print("No models found within the specified RMSE threshold.")


Model                        MSE    R-Squared
----------------------  --------  -----------
Linear Regression       21.5174      0.711226
Ridge Regression        21.5487      0.710807
Lasso Regression        26.5325      0.643922
ElasticNet              27.4901      0.63107
Decision Tree           19.6668      0.736062
Random Forest            9.60433     0.871106
Support Vector Machine  25.9572      0.651643
XGBoost                  9.46656     0.872954
LightGBM                11.7014      0.842962
K-Nearest Neighbors     18.835       0.747225

Models with RMSE closest to 1:
Model               MSE    R-Squared
-------------  --------  -----------
XGBoost         9.46656     0.872954
Random Forest   9.60433     0.871106
LightGBM       11.7014      0.842962


## Stack the models to obtain better results

In [7]:
# Define the base models
base_models = [
    ('random_forest', RandomForestRegressor()),
    ('xgboost', XGBRegressor())
]

# Create a stacking regressor with the specified base models and a final estimator (Linear Regression)
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())

# Define a more limited parameter grid for tuning
param_grid = {
    'random_forest__n_estimators': [100, 150],
    'random_forest__max_depth': [10, 20],
    'xgboost__n_estimators': [100, 150],
    'xgboost__max_depth': [3, 5],
    'xgboost__learning_rate': [0.1, 0.01],
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(stacking_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [8]:
# Get the best combination based on the mean_test_score
best_combination_index = grid_search.best_index_
best_combination_params = grid_search.cv_results_['params'][best_combination_index]
best_combination_mean_test_score = grid_search.cv_results_['mean_test_score'][best_combination_index]

# Set the best parameters to the stacking regressor
stacking_regressor.set_params(**best_combination_params)

# Fit the stacking regressor with the best parameters on the entire training set
stacking_regressor.fit(X_train, y_train)

# Predict on the test set with the best stacking regressor
y_pred_stacking = stacking_regressor.predict(X_test)

# Calculate final metrics
mse_stacking = mean_squared_error(y_test, y_pred_stacking)
r2_stacking = r2_score(y_test, y_pred_stacking)

# Print the best combination and its metrics
print("\nBest Combination:")
print(f"Parameters: {best_combination_params}")
print(f"Mean Test Score (neg MSE): {best_combination_mean_test_score}")
print("Final Metrics for the Best Combination:")
print("MSE:", mse_stacking)
print("R^2 Score:", r2_stacking)



Best Combination:
Parameters: {'random_forest__max_depth': 10, 'random_forest__n_estimators': 150, 'xgboost__learning_rate': 0.1, 'xgboost__max_depth': 3, 'xgboost__n_estimators': 150}
Mean Test Score (neg MSE): -12.121211076219595
Final Metrics for the Best Combination:
MSE: 8.203276172449613
R^2 Score: 0.8899082623003092


In [9]:

# Choose the best combination of hyperparameters
best_params = grid_search.best_params_

# Set the best hyperparameters to the stacking regressor
stacking_regressor.set_params(**best_params)

# Fit the stacking regressor on the entire training set
stacking_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_stacking = stacking_regressor.predict(X_test)

# Compare predictions with original values
predictions_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_stacking})
print(predictions_comparison)


     Actual  Predicted
173    23.6  23.543390
274    32.4  31.446933
491    13.6  16.085124
72     22.8  23.998505
452    16.1  17.633155
..      ...        ...
441    17.1  13.017324
23     14.5  14.107064
225    50.0  44.615786
433    14.3  16.167190
447    12.6  17.039876

[152 rows x 2 columns]


## Pickling The Model file For Deployment

In [10]:

import sklearn
print(sklearn.__version__)

1.3.0


In [11]:
# Choose the best combination of hyperparameters
best_params = grid_search.best_params_

# Set the best hyperparameters to the stacking regressor
stacking_regressor.set_params(**best_params)

# Fit the stacking regressor on the entire training set
stacking_regressor.fit(X_train, y_train)

# Save the model with the best parameters
pickle.dump(stacking_regressor, open('newregmodel.pkl', 'wb'))


In [12]:

import sklearn
print(sklearn.__version__)

1.3.0


In [13]:
# Load the StackingRegressor model from the existing pickled file
loaded_stacking_regressor = pickle.load(open('newregmodel.pkl', 'rb'))

# Make predictions on the validation set
y_pred_validation = loaded_stacking_regressor.predict(X_test)

# Save the new model with a different pickle name
new_pickle_name = 'new_regmodel.pkl'
pickle.dump(loaded_stacking_regressor, open(new_pickle_name, 'wb'))

# Print the predictions (y_pred_validation) or use them as needed
print("Predictions on Validation Set:", y_pred_validation)


Predictions on Validation Set: [23.49598379 31.4182662  16.05124193 24.01286022 17.64187457 22.51098589
 18.26764595 14.27383529 21.05971863 21.03739103 20.22354749 18.85673811
  6.9694728  21.76658815 19.83380524 24.85896574 19.34245913  9.07041211
 46.00216477 16.51095906 24.33633689 25.10950472 13.75779361 20.74196481
 15.14265098 16.08307783 22.03020445 14.03847566 20.04026945 21.3449875
 19.67877351 23.56386576 21.097059   20.39144489 14.60713992 16.08644247
 33.66550887 19.3815015  21.4945998  23.86614816 17.38989579 29.77024116
 45.2879043  20.28471442 22.54351007 14.2707451  16.32685676 23.69170576
 18.12952695 27.69445317 20.94045625 37.32333368 16.27878451 25.81558923
 48.4746817  21.79225483 15.73243844 32.28191825 21.8934844  18.2913748
 22.36834136 34.3924179  31.27821493 19.06186594 24.02787128 18.4568207
 13.95346818 23.74226321 28.44770162 14.71200015 21.30304427 26.04858185
 11.36688259 21.50144945 22.44399049  5.00696281 20.78995092 45.32479879
 10.82939753 12.5044796