In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tabulate import tabulate
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the dataset using Pandas
file_path = r"E:\Deployment of House price\archive\boston.csv"
boston_df = pd.read_csv(file_path)

# Create a new DataFrame 'dataset' using the entire 'boston_df'
dataset = boston_df.copy()

X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [3]:
import pickle

# Assuming 'scaler' is the StandardScaler object
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [5]:
models = [
    
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Support Vector Machine', SVR()),
    ('XGBoost', XGBRegressor()),
    ('LightGBM', LGBMRegressor()),
    ('K-Nearest Neighbors', KNeighborsRegressor())
]

In [6]:
results = []

for name, model in models:
    if name == 'Neural Network':
        model.compile(optimizer=Adam(lr=0.0001), loss='mean_squared_error', metrics=['mae', 'mse'])
        history = model.fit(X_train, y_train, epochs=2000, batch_size=128, validation_data=(X_test, y_test), verbose=0)

        evaluation = model.evaluate(X_test, y_test)
        y_pred = model.predict(X_test)
        r_squared = r2_score(y_test, y_pred)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        evaluation = (mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))

    results.append([name, evaluation[0], evaluation[1]])

# Display results in a table
headers = ["Model", "MSE", "R-Squared"]
print(tabulate(results, headers=headers))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000595 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 880
[LightGBM] [Info] Number of data points in the train set: 354, number of used features: 13
[LightGBM] [Info] Start training from score 23.015819
Model                        MSE    R-Squared
----------------------  --------  -----------
Linear Regression       21.5174      0.711226
Ridge Regression        21.5487      0.710807
Lasso Regression        26.5325      0.643922
ElasticNet              27.4901      0.63107
Decision Tree           11.062       0.851542
Random Forest            9.11007     0.877739
Support Vector Machine  25.9572      0.651643
XGBoost                  9.46656     0.872954
LightGBM                11.7014      0.842962
K-Nearest Neighbors     18.835       0.747225


In [7]:
# Display results for all models
headers = ["Model", "MSE", "R-Squared"]
print(tabulate(results, headers=headers))

# Sort models by closeness to RMSE value of 1
results.sort(key=lambda x: abs(np.sqrt(x[1]) - 1))

# Filter models with RMSE closest to 1
close_models = [(name, mse, r_squared) for name, mse, r_squared in results]

# Print models with RMSE closest to 1
if close_models:
    print("\nModels with RMSE closest to 1:")
    print(tabulate(close_models[:3], headers=headers))
else:
    print("No models found within the specified RMSE threshold.")


Model                        MSE    R-Squared
----------------------  --------  -----------
Linear Regression       21.5174      0.711226
Ridge Regression        21.5487      0.710807
Lasso Regression        26.5325      0.643922
ElasticNet              27.4901      0.63107
Decision Tree           11.062       0.851542
Random Forest            9.11007     0.877739
Support Vector Machine  25.9572      0.651643
XGBoost                  9.46656     0.872954
LightGBM                11.7014      0.842962
K-Nearest Neighbors     18.835       0.747225

Models with RMSE closest to 1:
Model               MSE    R-Squared
-------------  --------  -----------
Random Forest   9.11007     0.877739
XGBoost         9.46656     0.872954
Decision Tree  11.062       0.851542


## Stack the models to obtain better results

In [8]:
# Define the base models
base_models = [
    ('random_forest', RandomForestRegressor()),
    ('xgboost', XGBRegressor())
]

# Create a stacking regressor with the specified base models and a final estimator (Linear Regression)
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())

# Define a more limited parameter grid for tuning
param_grid = {
    'random_forest__n_estimators': [100, 150],
    'random_forest__max_depth': [10, 20],
    'xgboost__n_estimators': [100, 150],
    'xgboost__max_depth': [3, 5],
    'xgboost__learning_rate': [0.1, 0.01],
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(stacking_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [9]:
# Get the best combination based on the mean_test_score
best_combination_index = grid_search.best_index_
best_combination_params = grid_search.cv_results_['params'][best_combination_index]
best_combination_mean_test_score = grid_search.cv_results_['mean_test_score'][best_combination_index]

# Set the best parameters to the stacking regressor
stacking_regressor.set_params(**best_combination_params)

# Fit the stacking regressor with the best parameters on the entire training set
stacking_regressor.fit(X_train, y_train)

# Predict on the test set with the best stacking regressor
y_pred_stacking = stacking_regressor.predict(X_test)

# Calculate final metrics
mse_stacking = mean_squared_error(y_test, y_pred_stacking)
r2_stacking = r2_score(y_test, y_pred_stacking)

# Print the best combination and its metrics
print("\nBest Combination:")
print(f"Parameters: {best_combination_params}")
print(f"Mean Test Score (neg MSE): {best_combination_mean_test_score}")
print("Final Metrics for the Best Combination:")
print("MSE:", mse_stacking)
print("R^2 Score:", r2_stacking)



Best Combination:
Parameters: {'random_forest__max_depth': 10, 'random_forest__n_estimators': 100, 'xgboost__learning_rate': 0.1, 'xgboost__max_depth': 3, 'xgboost__n_estimators': 150}
Mean Test Score (neg MSE): -12.029578812790536
Final Metrics for the Best Combination:
MSE: 8.180223868211112
R^2 Score: 0.8902176348215126


In [10]:

# Choose the best combination of hyperparameters
best_params = grid_search.best_params_

# Set the best hyperparameters to the stacking regressor
stacking_regressor.set_params(**best_params)

# Fit the stacking regressor on the entire training set
stacking_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_stacking = stacking_regressor.predict(X_test)

# Compare predictions with original values
predictions_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_stacking})
print(predictions_comparison)


     Actual  Predicted
173    23.6  23.509431
274    32.4  31.466268
491    13.6  16.090778
72     22.8  24.013067
452    16.1  17.649051
..      ...        ...
441    17.1  12.985337
23     14.5  14.072877
225    50.0  44.787992
433    14.3  16.138062
447    12.6  17.041544

[152 rows x 2 columns]


## Pickling The Model file For Deployment

In [11]:
# Choose the best combination of hyperparameters
best_params = grid_search.best_params_

# Set the best hyperparameters to the stacking regressor
stacking_regressor.set_params(**best_params)

# Fit the stacking regressor on the entire training set
stacking_regressor.fit(X_train, y_train)

# Save the model with the best parameters
pickle.dump(stacking_regressor, open('newregmodel.pkl', 'wb'))


In [12]:
# Load the StackingRegressor model from the existing pickled file
loaded_stacking_regressor = pickle.load(open('newregmodel.pkl', 'rb'))

# Make predictions on the validation set
y_pred_validation = loaded_stacking_regressor.predict(X_test)

# Save the new model with a different pickle name
new_pickle_name = 'new_regmodel.pkl'
pickle.dump(loaded_stacking_regressor, open(new_pickle_name, 'wb'))

# Print the predictions (y_pred_validation) or use them as needed
print("Predictions on Validation Set:", y_pred_validation)


Predictions on Validation Set: [23.61008418 31.40600403 16.12440082 24.0085797  17.74377837 22.59960023
 18.12846321 14.26959303 21.07169744 21.05692898 20.26895876 18.82043829
  6.95340707 21.74563915 19.87329582 24.76990748 19.38474298  9.22871425
 45.81703446 16.68204578 24.30918815 25.23497579 13.73895901 20.38745291
 15.20656098 16.21576852 22.10390681 14.10237274 20.14234513 21.39679662
 19.70023667 23.59272484 19.94155624 20.42028844 14.67366169 16.16786281
 33.44231584 19.41215791 21.55262573 23.82779009 17.27797716 29.74951169
 45.06207988 20.37826897 22.56943334 14.37617034 16.48219331 23.59257649
 18.02731713 27.66486473 20.93262989 37.60125776 16.24447421 25.69307823
 48.56398461 21.78983353 15.77513832 32.18733706 21.87610577 18.06240602
 22.03627446 34.40883896 31.39096524 19.03734975 23.66217428 18.62410789
 14.06750468 23.80282493 28.39618806 14.67132833 21.38504846 25.69103464
 11.58059292 21.45501104 22.4638325   4.95203812 20.87689786 45.06187296
 10.88775919 12.6257