In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tabulate import tabulate
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV


In [17]:
# Load the dataset using Pandas
file_path = r"E:\Deployment of House price\archive\boston.csv"
boston_df = pd.read_csv(file_path)

# Create a new DataFrame 'dataset' using the entire 'boston_df'
dataset = boston_df.copy()

X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [18]:
import pickle

# Assuming 'scaler' is the StandardScaler object
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [19]:
models = [
    ('Neural Network', Sequential([
        Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1)
    ])),
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Support Vector Machine', SVR()),
    ('XGBoost', XGBRegressor()),
    ('LightGBM', LGBMRegressor()),
    ('K-Nearest Neighbors', KNeighborsRegressor())
]

In [20]:
results = []

for name, model in models:
    if name == 'Neural Network':
        model.compile(optimizer=Adam(lr=0.0001), loss='mean_squared_error', metrics=['mae', 'mse'])
        history = model.fit(X_train, y_train, epochs=2000, batch_size=128, validation_data=(X_test, y_test), verbose=0)

        evaluation = model.evaluate(X_test, y_test)
        y_pred = model.predict(X_test)
        r_squared = r2_score(y_test, y_pred)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        evaluation = (mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))

    results.append([name, evaluation[0], evaluation[1]])

# Display results in a table
headers = ["Model", "MSE", "R-Squared"]
print(tabulate(results, headers=headers))

  super().__init__(name, **kwargs)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000095 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 880
[LightGBM] [Info] Number of data points in the train set: 354, number of used features: 13
[LightGBM] [Info] Start training from score 23.015819
Model                        MSE    R-Squared
----------------------  --------  -----------
Neural Network          11.3703      2.2062
Linear Regression       21.5174      0.711226
Ridge Regression        21.5487      0.710807
Lasso Regression        26.5325      0.643922
ElasticNet              27.4901      0.63107
Decision Tree           18.5411      0.75117
Random Forest            9.71771     0.869584
Support Vector Machine  25.9572      0.651643
XGBoost                  9.57647     0.871479
LightGBM                11.7014      0.842962
K-Nearest Neighbors     18.835       0.747225


In [21]:
# Display results for all models
headers = ["Model", "MSE", "R-Squared"]
print(tabulate(results, headers=headers))

# Sort models by closeness to RMSE value of 1
results.sort(key=lambda x: abs(np.sqrt(x[1]) - 1))

# Filter models with RMSE closest to 1
close_models = [(name, mse, r_squared) for name, mse, r_squared in results]

# Print models with RMSE closest to 1
if close_models:
    print("\nModels with RMSE closest to 1:")
    print(tabulate(close_models[:3], headers=headers))
else:
    print("No models found within the specified RMSE threshold.")


Model                        MSE    R-Squared
----------------------  --------  -----------
Neural Network          11.3703      2.2062
Linear Regression       21.5174      0.711226
Ridge Regression        21.5487      0.710807
Lasso Regression        26.5325      0.643922
ElasticNet              27.4901      0.63107
Decision Tree           18.5411      0.75117
Random Forest            9.71771     0.869584
Support Vector Machine  25.9572      0.651643
XGBoost                  9.57647     0.871479
LightGBM                11.7014      0.842962
K-Nearest Neighbors     18.835       0.747225

Models with RMSE closest to 1:
Model                MSE    R-Squared
--------------  --------  -----------
XGBoost          9.57647     0.871479
Random Forest    9.71771     0.869584
Neural Network  11.3703      2.2062


## Stack the models to obtain better results

In [22]:
# Define the base models
base_models = [
    ('random_forest', RandomForestRegressor()),
    ('xgboost', XGBRegressor())
]

# Create a stacking regressor with the specified base models and a final estimator (Linear Regression)
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())

# Define a more limited parameter grid for tuning
param_grid = {
    'random_forest__n_estimators': [100, 150],
    'random_forest__max_depth': [10, 20],
    'xgboost__n_estimators': [100, 150],
    'xgboost__max_depth': [3, 5],
    'xgboost__learning_rate': [0.1, 0.01],
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(stacking_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [23]:
# Get the best combination based on the mean_test_score
best_combination_index = grid_search.best_index_
best_combination_params = grid_search.cv_results_['params'][best_combination_index]
best_combination_mean_test_score = grid_search.cv_results_['mean_test_score'][best_combination_index]

# Set the best parameters to the stacking regressor
stacking_regressor.set_params(**best_combination_params)

# Fit the stacking regressor with the best parameters on the entire training set
stacking_regressor.fit(X_train, y_train)

# Predict on the test set with the best stacking regressor
y_pred_stacking = stacking_regressor.predict(X_test)

# Calculate final metrics
mse_stacking = mean_squared_error(y_test, y_pred_stacking)
r2_stacking = r2_score(y_test, y_pred_stacking)

# Print the best combination and its metrics
print("\nBest Combination:")
print(f"Parameters: {best_combination_params}")
print(f"Mean Test Score (neg MSE): {best_combination_mean_test_score}")
print("Final Metrics for the Best Combination:")
print("MSE:", mse_stacking)
print("R^2 Score:", r2_stacking)



Best Combination:
Parameters: {'random_forest__max_depth': 20, 'random_forest__n_estimators': 100, 'xgboost__learning_rate': 0.1, 'xgboost__max_depth': 3, 'xgboost__n_estimators': 150}
Mean Test Score (neg MSE): -11.929834509741003
Final Metrics for the Best Combination:
MSE: 9.024429949021195
R^2 Score: 0.8788880010923491


In [24]:

# Choose the best combination of hyperparameters
best_params = grid_search.best_params_

# Set the best hyperparameters to the stacking regressor
stacking_regressor.set_params(**best_params)

# Fit the stacking regressor on the entire training set
stacking_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_stacking = stacking_regressor.predict(X_test)

# Compare predictions with original values
predictions_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_stacking})
print(predictions_comparison)


     Actual  Predicted
173    23.6  23.550480
274    32.4  32.198074
491    13.6  15.699641
72     22.8  23.610766
452    16.1  17.561221
..      ...        ...
441    17.1  12.889954
23     14.5  14.074192
225    50.0  44.318905
433    14.3  15.563411
447    12.6  16.478708

[152 rows x 2 columns]


## Pickling The Model file For Deployment

In [25]:
# Choose the best combination of hyperparameters
best_params = grid_search.best_params_

# Set the best hyperparameters to the stacking regressor
stacking_regressor.set_params(**best_params)

# Fit the stacking regressor on the entire training set
stacking_regressor.fit(X_train, y_train)

# Save the model with the best parameters
pickle.dump(stacking_regressor, open('newregmodel.pkl', 'wb'))


In [26]:
# Load the StackingRegressor model from the existing pickled file
loaded_stacking_regressor = pickle.load(open('newregmodel.pkl', 'rb'))

# Make predictions on the validation set
y_pred_validation = loaded_stacking_regressor.predict(X_test)

# Save the new model with a different pickle name
new_pickle_name = 'new_regmodel.pkl'
pickle.dump(loaded_stacking_regressor, open(new_pickle_name, 'wb'))

# Print the predictions (y_pred_validation) or use them as needed
print("Predictions on Validation Set:", y_pred_validation)


Predictions on Validation Set: [23.52552528 32.36962568 15.67104895 23.57480966 17.5137104  22.54282983
 18.19964208 13.99038576 20.8701912  21.21487172 20.45106833 18.55533192
  8.96299899 21.90066038 20.14704567 25.02905101 19.27730862  9.93028659
 46.59943392 16.23673783 24.20440479 25.03789791 13.80104415 20.81110842
 15.95545536 16.53801477 21.97741932 13.31371236 19.91073333 21.391
 19.65848025 23.70723487 23.60437502 20.50604325 14.38435921 16.86322676
 33.51466483 19.62476095 21.82105274 23.50097488 17.71965469 30.02671107
 45.16668515 19.93343015 22.55530281 15.24299576 16.23594869 23.58214381
 18.44870189 27.02727137 20.66684782 36.7665558  16.75196108 25.87198678
 48.86774704 21.68607571 16.61201876 32.13197971 22.27939734 18.75395745
 22.75552239 35.37543731 32.4971282  20.03329676 25.52665909 16.92402613
 14.43613425 23.34770096 29.08857576 14.3602627  20.88033245 28.95765304
  9.9204531  20.85717982 22.45091039  6.97027702 20.69558889 45.57353054
 11.37700939 11.97922385 