In [10]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Read the preprocessed regression dataset from the working directory
df = pd.read_csv('preprocessed_dataregression.csv')

# List every column in the frame; note this still includes the 'SalePrice'
# target, so it is a list of all columns rather than of predictors only
feature_names = list(df.columns)
print("Feature Names:", feature_names)

# Separate the target column from the predictors.
# NOTE(review): only 'SalePrice' is removed, so every other column (including
# any row-identifier column such as 'Id') is kept as a predictor -- confirm
# that this is intended.
y = df['SalePrice']
X = df.drop('SalePrice', axis=1)

# Hold out 20% of all rows as the final test set (fixed seed for repeatability)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Carve the validation set out of the remaining rows: 20% of the remaining
# 80%, which gives roughly a 64% / 16% / 20% train / validation / test split
(X_train, X_val,
 y_train, y_val) = train_test_split(
    X_train_val, y_train_val, random_state=42, test_size=0.2
)

# Hyperparameter search spaces, one per estimator, keyed by the estimator's
# class name. Each inner mapping is {constructor argument: candidate values}.
param_grids = {
    'RandomForestRegressor': dict(
        n_estimators=[50, 100, 150],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
    ),
    'GradientBoostingRegressor': dict(
        n_estimators=[50, 100, 150],
        learning_rate=[0.01, 0.1, 0.5],
        max_depth=[3, 5, 10],
    ),
    # NOTE(review): `algorithm` selects the neighbor-search backend; it likely
    # affects speed rather than predictions -- confirm it is worth searching.
    'KNeighborsRegressor': dict(
        n_neighbors=[3, 5, 7],
        weights=['uniform', 'distance'],
        algorithm=['auto', 'ball_tree', 'kd_tree'],
    ),
}

# Initialize models as (name, estimator) pairs; each name is the key used to
# look up that estimator's search space in `param_grids`.
# The two tree ensembles are stochastic, so their seed is pinned (42, the same
# value used for the data splits) to make the tuned models and the reported
# RMSE values reproducible across kernel restarts. KNeighborsRegressor takes
# no random_state.
models = [
    ('RandomForestRegressor', RandomForestRegressor(random_state=42)),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=42)),
    ('KNeighborsRegressor', KNeighborsRegressor())
]

Feature Names: ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'Alley_Grvl', 'Alley_Pave', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'Utilities_AllPub', 'Utilities_NoSeWa', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Gtl', 'LandSlope_Mod', 'LandSlope_Sev', '

In [11]:
# Step 2: Model Training and Hyperparameter Tuning
# For every candidate estimator, run a 5-fold cross-validated grid search on
# the training split only (validation and test rows are never seen here),
# selecting by negative mean squared error.
best_models = {}
for model_name, model in models:
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='neg_mean_squared_error',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Keep the refit winner for this estimator type; it carries the selected
    # hyperparameters with it.
    best_models[model_name] = grid_search.best_estimator_


In [12]:
# Step 3: Model Evaluation
# Score each tuned estimator on the validation split and report its RMSE
# (square root of the mean squared error).
for model_name, best_model_for_type in best_models.items():
    y_val_pred = best_model_for_type.predict(X_val)
    val_rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
    print(f"Validation RMSE for {model_name}: {val_rmse}")


Validation RMSE for RandomForestRegressor: 0.15026932076494762
Validation RMSE for GradientBoostingRegressor: 0.13313709298021686
Validation RMSE for KNeighborsRegressor: 0.23818226898415248


In [13]:
# Step 4: Model Training on Combined Data
# Stack the training rows followed by the validation rows so the final fit
# uses all non-test data.
X_train_val_combined = pd.concat([X_train, X_val], axis=0)
y_train_val_combined = pd.concat([y_train, y_val], axis=0)

# Refit every tuned estimator on the combined data.
# Note: this refits the objects stored in `best_models` in place, so scoring
# them on X_val after this point would evaluate on rows they were trained on.
for best_model_for_type in best_models.values():
    best_model_for_type.fit(X_train_val_combined, y_train_val_combined)


In [14]:
# Step 5: Model Evaluation on Test Set
# Report the RMSE of each estimator on the held-out test split.
for model_name, best_model_for_type in best_models.items():
    y_test_pred = best_model_for_type.predict(X_test)
    test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
    print(f"Test RMSE for {model_name}: {test_rmse}")

Test RMSE for RandomForestRegressor: 0.14905587039190546
Test RMSE for GradientBoostingRegressor: 0.1382299377840184
Test RMSE for KNeighborsRegressor: 0.2562878557125928


In [11]:
# Save the trained Gradient Boosting Regressor model to disk.
# `joblib` is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
# joblib.dump returns the list of file names it wrote, which is what this
# cell displays.
joblib.dump(best_models['GradientBoostingRegressor'], 'gradient_boosting_regressor_model.pkl')

['gradient_boosting_regressor_model.pkl']