In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the cleaned dataset; the relative path is resolved against the
# current working directory.
csv_path = 'cleaned_canada.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preprocess an independent (deep) copy so that column drops and other edits
# made to `df_preprocess` do not modify the frame bound to `df`.
df_preprocess = df.copy(deep=True)

Data Preprocessing

In [None]:
# Drop features with too many missing values
missing_thresh = 0.6
# A column is dropped when more than `missing_thresh` of its rows are null.
drop_cols = [col for col in df_preprocess.columns if df_preprocess[col].isnull().mean() > missing_thresh]
df_preprocess = df_preprocess.drop(columns=drop_cols)
# 'Acreage' is always removed. It may already be gone, either because it
# exceeded the missing-value threshold above or because this cell was re-run,
# so an absent column is tolerated instead of raising KeyError.
df_preprocess = df_preprocess.drop(columns=['Acreage'], errors='ignore')

In [None]:
# Keep only rows whose Price is at most 2,500,000 and copy them, so that
# columns added later do not write into a view of `df_preprocess`.
price_cap = 2500000
within_cap = df_preprocess['Price'].le(price_cap)
df_preprocess_L = df_preprocess.loc[within_cap].copy()

def categorize_province(province):
    """Return the coarse grouping label for a province code.

    'ON', 'QC' and 'BC' map to 'High Population'; 'AB', 'SK' and 'MB' map to
    'Resource Rich'. Every other value, including missing values and codes
    that differ in case, maps to 'Small Population'.
    """
    groups = (
        ('High Population', ('ON', 'QC', 'BC')),
        ('Resource Rich', ('AB', 'SK', 'MB')),
    )
    for label, members in groups:
        if province in members:
            return label
    # Fallback for anything not listed above.
    return 'Small Population'

# Derive the grouping label for every row from its Province value.
area_labels = df_preprocess_L['Province'].map(categorize_province)
df_preprocess_L['Area'] = area_labels

In [None]:
# Remove the raw location columns from the modelling frame.
location_cols = ['Latitude', 'Longitude', 'Province']
df_preprocess_L = df_preprocess_L.drop(columns=location_cols)

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
# Column groups are captured before encoding, so the dummy columns created
# below are never passed to the scaler. 'Price' (the target) is excluded
# from scaling.
num_cols = df_preprocess_L.drop(columns=['Price']).select_dtypes(include=["float64", "int"]).columns.tolist()
cat_cols = df_preprocess_L.select_dtypes(include=["object"]).columns.tolist()

# NOTE(review): the imputer and scaler are fitted on every row, i.e. before
# any train/test split happens, so test rows influence the fitted statistics.
# Consider moving these steps into a Pipeline fitted on training data only.
imputer = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()

# Both estimators reject zero-column input. The guards keep this cell
# runnable when a column group is empty, which is the case for `cat_cols`
# whenever the cell is executed a second time (the object columns have
# already been replaced by dummies).
if cat_cols:
    # Fill missing categorical values with each column's most frequent value
    df_preprocess_L[cat_cols] = imputer.fit_transform(df_preprocess_L[cat_cols])
    # One-hot encoding
    df_preprocess_L = pd.get_dummies(df_preprocess_L, columns=cat_cols, dummy_na=False)

if num_cols:
    # Standard scaling
    df_preprocess_L[num_cols] = scaler.fit_transform(df_preprocess_L[num_cols])

Models (All features)

In [None]:
# One frame per Area group, selected through the one-hot indicator columns.
# Rows are filtered only, so each subset still carries all three Area_*
# columns.
area_flags = ['Area_High Population', 'Area_Resource Rich', 'Area_Small Population']
df_seperated = [
    df_preprocess_L.loc[df_preprocess_L[flag].eq(1)].copy()
    for flag in area_flags
]
df_preprocess_H, df_preprocess_R, df_preprocess_S = df_seperated

In [None]:
# linear model
from sklearn.linear_model import LinearRegression, Ridge
# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model_lr = LinearRegression()
    model_lr.fit(X_train, y_train)
    y_pred_lr = model_lr.predict(X_test)
    mse_lr = mean_squared_error(y_test, y_pred_lr)
    rmse_lr = np.sqrt(mse_lr)
    r2_lr = r2_score(y_test, y_pred_lr)
    # MSE divided by the range of the test targets.
    # NOTE(review): this is not unit-free (MSE is in squared price units);
    # RMSE / range is the usual normalised form. Confirm which was intended.
    norm_mse_lr = mse_lr / (max(y_test)-min(y_test))
    print(f"Linear Regression - MSE: {mse_lr}, RMSE: {rmse_lr}, R2: {r2_lr}, NormalizedMSE: {norm_mse_lr}")


Linear Regression - MSE: 85211238864.75107, RMSE: 291909.6416097815, R2: 0.7055210476827358, NormalizedMSE: 34849.797089996755
Linear Regression - MSE: 28747439126.10254, RMSE: 169550.69780482337, R2: 0.7321129882290086, NormalizedMSE: 11733.648622898996
Linear Regression - MSE: 50053367245.06465, RMSE: 223726.09871238683, R2: 0.46543294233735044, NormalizedMSE: 20463.355374106563


In [None]:
# Ridge
from sklearn.linear_model import Ridge
# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # L2-regularised linear regression with regularisation strength alpha=1.0.
    model_ridge = Ridge(alpha=1.0)
    model_ridge.fit(X_train, y_train)
    y_pred_ridge = model_ridge.predict(X_test)
    mse_ridge = mean_squared_error(y_test, y_pred_ridge)
    rmse_ridge = np.sqrt(mse_ridge)
    r2_ridge = r2_score(y_test, y_pred_ridge)
    print(f"Ridge Regression - MSE: {mse_ridge}, RMSE: {rmse_ridge}, R2: {r2_ridge}")

Ridge Regression - MSE: 85504837572.71577, RMSE: 292412.1023020692, R2: 0.7045064087562865
Ridge Regression - MSE: 28413228076.30286, RMSE: 168562.23799031283, R2: 0.735227380402828
Ridge Regression - MSE: 47455169390.93618, RMSE: 217842.0744276371, R2: 0.49318154465029107


In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Default (unpruned) tree; random_state is fixed so repeated runs agree.
    model_dt = DecisionTreeRegressor(random_state=42)
    model_dt.fit(X_train, y_train)
    y_pred_dt = model_dt.predict(X_test)
    mse_dt = mean_squared_error(y_test, y_pred_dt)
    rmse_dt = np.sqrt(mse_dt)
    r2_dt = r2_score(y_test, y_pred_dt)
    print(f"Decision Tree - MSE: {mse_dt}, RMSE: {rmse_dt}, R2: {r2_dt}")

Decision Tree - MSE: 106130382984.83803, RMSE: 325776.5844637058, R2: 0.6332272079741649
Decision Tree - MSE: 29359094003.932255, RMSE: 171344.95616717832, R2: 0.726413197136724
Decision Tree - MSE: 60323208860.87674, RMSE: 245607.83550383066, R2: 0.3557516298225083


In [None]:
# boosted tree
from sklearn.ensemble import GradientBoostingRegressor
# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Library-default boosting settings; only the seed is pinned.
    model_gb = GradientBoostingRegressor(random_state=42)
    model_gb.fit(X_train, y_train)
    y_pred_gb = model_gb.predict(X_test)
    mse_gb = mean_squared_error(y_test, y_pred_gb)
    rmse_gb = np.sqrt(mse_gb)
    r2_gb = r2_score(y_test, y_pred_gb)
    print(f"Gradient Boosting - MSE: {mse_gb}, RMSE: {rmse_gb}, R2: {r2_gb}")

Gradient Boosting - MSE: 82694836580.77115, RMSE: 287567.0992668861, R2: 0.7142174064972269
Gradient Boosting - MSE: 30060021685.566357, RMSE: 173378.26185991816, R2: 0.7198815049996656
Gradient Boosting - MSE: 43183030110.91613, RMSE: 207805.26968995787, R2: 0.5388077442556838


In [None]:
# Random forest
from sklearn.ensemble import RandomForestRegressor
# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 500 trees; the seed is pinned so repeated runs agree.
    model_rf = RandomForestRegressor(n_estimators=500, random_state=42)
    model_rf.fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)

    mse_rf = mean_squared_error(y_test, y_pred_rf)
    rmse_rf = np.sqrt(mse_rf)
    r2_rf = r2_score(y_test, y_pred_rf)

    print(f"Random Forest - MSE: {mse_rf}, RMSE: {rmse_rf}, R2: {r2_rf}")


Random Forest - MSE: 63768002117.929695, RMSE: 252523.27044834837, R2: 0.7796260833050631
Random Forest - MSE: 19118491275.74569, RMSE: 138269.63251468376, R2: 0.8218416786635128
Random Forest - MSE: 43195334124.8937, RMSE: 207834.87225413762, R2: 0.5386763381003851


Feature Selection

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# Feature Selection (SelectKBest with f_regression), then every model is
# refitted on the selected columns only.
# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Select top 550 features. The selector is fitted on the training rows
    # only and then applied unchanged to the test rows.
    selector = SelectKBest(score_func=f_regression, k=550)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Linear Regression with selected features
    model_lr_f = LinearRegression()
    model_lr_f.fit(X_train_selected, y_train)
    y_pred_lr_f = model_lr_f.predict(X_test_selected)
    mse_lr_f = mean_squared_error(y_test, y_pred_lr_f)
    rmse_lr_f = np.sqrt(mse_lr_f)
    r2_lr_f = r2_score(y_test, y_pred_lr_f)
    print(f"Linear Regression (with Feature Selection) - MSE: {mse_lr_f}, RMSE: {rmse_lr_f}, R2: {r2_lr_f}")

    # Ridge Regression with selected features
    model_ridge_f = Ridge(alpha=1.0)
    model_ridge_f.fit(X_train_selected, y_train)
    y_pred_ridge_f = model_ridge_f.predict(X_test_selected)
    mse_ridge_f = mean_squared_error(y_test, y_pred_ridge_f)
    rmse_ridge_f = np.sqrt(mse_ridge_f)
    r2_ridge_f = r2_score(y_test, y_pred_ridge_f)
    print(f"Ridge Regression (with Feature Selection) - MSE: {mse_ridge_f}, RMSE: {rmse_ridge_f}, R2: {r2_ridge_f}")

    # Decision Tree with selected features
    model_dt_f = DecisionTreeRegressor(random_state=42)
    model_dt_f.fit(X_train_selected, y_train)
    y_pred_dt_f = model_dt_f.predict(X_test_selected)
    mse_dt_f = mean_squared_error(y_test, y_pred_dt_f)
    rmse_dt_f = np.sqrt(mse_dt_f)
    r2_dt_f = r2_score(y_test, y_pred_dt_f)
    print(f"Decision Tree (with Feature Selection) - MSE: {mse_dt_f}, RMSE: {rmse_dt_f}, R2: {r2_dt_f}")

    # Random Forest with selected features. It must be trained and evaluated
    # on the *_selected matrices like the other models; using the full
    # X_train / X_test here merely reproduces the all-features result.
    model_rf_f = RandomForestRegressor(n_estimators=500,random_state=42)
    model_rf_f.fit(X_train_selected, y_train)
    y_pred_rf_f = model_rf_f.predict(X_test_selected)
    mse_rf_f = mean_squared_error(y_test, y_pred_rf_f)
    rmse_rf_f = np.sqrt(mse_rf_f)
    r2_rf_f = r2_score(y_test, y_pred_rf_f)
    print(f"Random Forest (with Feature Selection) - MSE: {mse_rf_f}, RMSE: {rmse_rf_f}, R2: {r2_rf_f}")

    # Gradient boosting with selected features
    model_gb_f = GradientBoostingRegressor(random_state=42)
    model_gb_f.fit(X_train_selected, y_train)
    y_pred_gb_f = model_gb_f.predict(X_test_selected)
    mse_gb_f = mean_squared_error(y_test, y_pred_gb_f)
    rmse_gb_f = np.sqrt(mse_gb_f)
    r2_gb_f = r2_score(y_test, y_pred_gb_f)
    print(f"Gradient Boosting (with Feature Selection) - MSE: {mse_gb_f}, RMSE: {rmse_gb_f}, R2: {r2_gb_f}")


Linear Regression (with Feature Selection) - MSE: 85141935874.85558, RMSE: 291790.9112272957, R2: 0.7057605497968791
Ridge Regression (with Feature Selection) - MSE: 85482968455.69069, RMSE: 292374.7055675143, R2: 0.7045819855787274
Decision Tree (with Feature Selection) - MSE: 108780282858.55693, RMSE: 329818.5605125293, R2: 0.6240694988626125
Random Forest (with Feature Selection) - MSE: 63768002117.929695, MSE: 252523.27044834837, R2: 0.7796260833050631
Gradient Boosting (with Feature Selection) - MSE: 82719466719.57874, RMSE: 287609.9211077023, R2: 0.7141322879428188
Linear Regression (with Feature Selection) - MSE: 28661108267.063, RMSE: 169295.91922743738, R2: 0.7329174743521125
Ridge Regression (with Feature Selection) - MSE: 28569276735.256866, RMSE: 169024.4856086149, R2: 0.7337732192598236
Decision Tree (with Feature Selection) - MSE: 29824963054.125847, RMSE: 172699.05342567991, R2: 0.7220719315657107
Random Forest (with Feature Selection) - MSE: 19118491275.74569, MSE: 1382

Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV

# Linear regression has nothing to tune here, so its grid is empty and the
# grid search evaluates a single candidate (the default model).
param_grid_lr = {}

# Candidate regularisation strengths and solvers for Ridge regression
param_grid_ridge = {'alpha': [0.1, 1.0, 0.5, 0.6, 0.7, 0.8, 0.9],
                    'solver': ['svd','auto']
                    }
# Display name -> (estimator, parameter grid) for every model searched below.
models = {
    'Linear Regression': (LinearRegression(), param_grid_lr),
    'Ridge Regression': (Ridge(), param_grid_ridge)
}

# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Selection
    # NOTE(review): the selector is fitted on the whole training split before
    # cross-validation, so each fold's validation rows took part in choosing
    # the features. Wrapping selector + model in a Pipeline would avoid that.
    selector = SelectKBest(score_func=f_regression, k=550)  # Select top 550 features
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    for model_name, (model, param_grid) in models.items():
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')  # Use 5-fold cross-validation
        grid_search.fit(X_train_selected, y_train)

        # Score the best candidate found by the search on the held-out test rows.
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_selected)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} (with Feature Selection) - Best Hyperparameters: {grid_search.best_params_}")
        print(f"{model_name} (with Feature Selection) - MSE: {mse}, RMSE: {rmse}, R2: {r2}")


Linear Regression (with Feature Selection) - Best Hyperparameters: {}
Linear Regression (with Feature Selection) - MSE: 85141935874.85558, RMSE: 291790.9112272957, R2: 0.7057605497968791
Ridge Regression (with Feature Selection) - Best Hyperparameters: {'alpha': 0.7, 'solver': 'svd'}
Ridge Regression (with Feature Selection) - MSE: 85328514935.97731, RMSE: 292110.44989177864, R2: 0.7051157568426225
Linear Regression (with Feature Selection) - Best Hyperparameters: {}
Linear Regression (with Feature Selection) - MSE: 28661108267.063, RMSE: 169295.91922743738, R2: 0.7329174743521125
Ridge Regression (with Feature Selection) - Best Hyperparameters: {'alpha': 0.5, 'solver': 'svd'}
Ridge Regression (with Feature Selection) - MSE: 28487202269.60788, RMSE: 168781.52229911863, R2: 0.7345380415888305
Linear Regression (with Feature Selection) - Best Hyperparameters: {}
Linear Regression (with Feature Selection) - MSE: 51148554315.06116, RMSE: 226160.4614318364, R2: 0.45373640798169357
Ridge Reg

In [None]:
# Hyperparameter grid for RandomForestRegressor: forest size and tree depth
# (max_depth=None leaves depth unrestricted).
param_grid_rf = {
    'n_estimators': [100, 500],
    'max_depth': [None, 10, 20]
}
# Display name -> (estimator, parameter grid) for every model searched below.
models = {
    'Random Forest': (RandomForestRegressor(random_state=42), param_grid_rf)
}
# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Selection
    # NOTE(review): the selector is fitted on the whole training split before
    # cross-validation, so each fold's validation rows took part in choosing
    # the features. Wrapping selector + model in a Pipeline would avoid that.
    selector = SelectKBest(score_func=f_regression, k=550)  # Select top 550 features
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    for model_name, (model, param_grid) in models.items():
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')  # Use 5-fold cross-validation
        grid_search.fit(X_train_selected, y_train)

        # Score the best candidate found by the search on the held-out test rows.
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_selected)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} (with Feature Selection) - Best Hyperparameters: {grid_search.best_params_}")
        print(f"{model_name} (with Feature Selection) - MSE: {mse}, RMSE: {rmse}, R2: {r2}")


Random Forest (with Feature Selection) - Best Hyperparameters: {'max_depth': None, 'n_estimators': 500}
Random Forest (with Feature Selection) - MSE: 63872107269.46831, RMSE: 252729.3162050424, R2: 0.7792663094493564
Random Forest (with Feature Selection) - Best Hyperparameters: {'max_depth': None, 'n_estimators': 500}
Random Forest (with Feature Selection) - MSE: 19359089381.15603, RMSE: 139136.9447025341, R2: 0.819599631738449
Random Forest (with Feature Selection) - Best Hyperparameters: {'max_depth': 20, 'n_estimators': 500}
Random Forest (with Feature Selection) - MSE: 44754260478.16957, RMSE: 211552.02782807252, R2: 0.5220270951093331


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for GradientBoostingRegressor (2 x 2 x 2 x 2 = 16
# candidates, each cross-validated 5 times per subset).
param_grid_gb = {
    'n_estimators': [100, 500],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5]
}

# Display name -> (estimator, parameter grid) for every model searched below.
models = {
    'Gradient Boosting': (GradientBoostingRegressor(random_state=42), param_grid_gb)
}

# The loop variable is deliberately not called `df`: that name holds the raw
# frame read from the CSV, and rebinding it here would make a later re-run of
# the preprocessing cells start from the last regional subset instead.
for subset_df in df_seperated:
    X = subset_df.drop(columns=['Price'])
    y = subset_df['Price']
    # 80/20 hold-out split; the fixed seed keeps it reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Selection
    # NOTE(review): the selector is fitted on the whole training split before
    # cross-validation, so each fold's validation rows took part in choosing
    # the features. Wrapping selector + model in a Pipeline would avoid that.
    selector = SelectKBest(score_func=f_regression, k=550)  # Select top 550 features
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    for model_name, (model, param_grid) in models.items():
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')  # Use 5-fold cross-validation
        grid_search.fit(X_train_selected, y_train)

        # Score the best candidate found by the search on the held-out test rows.
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_selected)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} (with Feature Selection) - Best Hyperparameters: {grid_search.best_params_}")
        print(f"{model_name} (with Feature Selection) - MSE: {mse}, RMSE: {rmse}, R2: {r2}")


Gradient Boosting (with Feature Selection) - Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 500}
Gradient Boosting (with Feature Selection) - MSE: 61525742515.03478, RMSE: 248043.83184234754, R2: 0.7873750406900374
Gradient Boosting (with Feature Selection) - Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 500}
Gradient Boosting (with Feature Selection) - MSE: 23773932089.01486, RMSE: 154187.97647357223, R2: 0.7784593056294222
Gradient Boosting (with Feature Selection) - Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 500}
Gradient Boosting (with Feature Selection) - MSE: 41892354569.22119, RMSE: 204676.21886584966, R2: 0.5525920841452052


Meta Regressor

In [None]:
# Meta regressor for the first subset

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression

# Hold out 20% of the high-population subset for the final evaluation.
X = df_preprocess_H.drop('Price', axis=1)
y = df_preprocess_H.loc[:, 'Price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep the 550 best-scoring features according to f_regression; the scores
# are computed from the training rows only.
selector = SelectKBest(score_func=f_regression, k=550)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Base learners. The Ridge, gradient-boosting and random-forest settings
# mirror the grid-search results recorded for this subset earlier in the
# notebook.
estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge(alpha=0.7, solver='svd')),
    (
        'gb',
        GradientBoostingRegressor(
            learning_rate=0.1,
            max_depth=5,
            min_samples_split=5,
            n_estimators=500,
            random_state=42,
        ),
    ),
    ('rf', RandomForestRegressor(max_depth=None, n_estimators=500, random_state=42)),
]

# A plain linear regression is the meta-learner that combines the base
# learners' predictions (5-fold internal cross-validation).
final_estimator = LinearRegression()
meta_model = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
)

# Fit the stack and score it on the held-out rows.
meta_model.fit(X_train_selected, y_train)
y_pred = meta_model.predict(X_test_selected)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Meta-Model - MSE: {mse}, RMSE: {rmse}, R2: {r2}")

Meta-Model - MSE: 57564890690.09752, RMSE: 239926.84445492446, R2: 0.8010632291406568


In [None]:
# Meta regressor for the second subset

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression

# Hold out 20% of the resource-rich subset for the final evaluation.
X = df_preprocess_R.drop('Price', axis=1)
y = df_preprocess_R.loc[:, 'Price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep the 550 best-scoring features according to f_regression; the scores
# are computed from the training rows only.
selector = SelectKBest(score_func=f_regression, k=550)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Base learners. The Ridge, gradient-boosting and random-forest settings
# mirror the grid-search results recorded for this subset earlier in the
# notebook.
estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge(alpha=0.5, solver='svd')),
    (
        'gb',
        GradientBoostingRegressor(
            learning_rate=0.1,
            max_depth=5,
            min_samples_split=2,
            n_estimators=500,
            random_state=42,
        ),
    ),
    ('rf', RandomForestRegressor(max_depth=None, n_estimators=500, random_state=42)),
]

# A plain linear regression is the meta-learner that combines the base
# learners' predictions (5-fold internal cross-validation).
final_estimator = LinearRegression()
meta_model = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
)

# Fit the stack and score it on the held-out rows.
meta_model.fit(X_train_selected, y_train)
y_pred = meta_model.predict(X_test_selected)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Meta-Model - MSE: {mse}, RMSE: {rmse}, R2: {r2}")

Meta-Model - MSE: 18224556339.427265, RMSE: 134998.35680269322, R2: 0.8301719357607646


In [None]:
# Meta regressor for the third subset

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression

# Hold out 20% of the small-population subset for the final evaluation.
X = df_preprocess_S.drop('Price', axis=1)
y = df_preprocess_S.loc[:, 'Price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep the 550 best-scoring features according to f_regression; the scores
# are computed from the training rows only.
selector = SelectKBest(score_func=f_regression, k=550)
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Base learners. The gradient-boosting and random-forest settings mirror the
# grid-search results recorded for this subset earlier in the notebook.
# NOTE(review): the Ridge result for this subset is cut off in the saved
# output, so alpha=0.8 / solver='auto' could not be cross-checked. Confirm.
estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge(alpha=0.8, solver='auto')),
    (
        'gb',
        GradientBoostingRegressor(
            learning_rate=0.1,
            max_depth=5,
            min_samples_split=5,
            n_estimators=500,
            random_state=42,
        ),
    ),
    ('rf', RandomForestRegressor(max_depth=20, n_estimators=500, random_state=42)),
]

# A plain linear regression is the meta-learner that combines the base
# learners' predictions (5-fold internal cross-validation).
final_estimator = LinearRegression()
meta_model = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
)

# Fit the stack and score it on the held-out rows.
meta_model.fit(X_train_selected, y_train)
y_pred = meta_model.predict(X_test_selected)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Meta-Model - MSE: {mse}, RMSE: {rmse}, R2: {r2}")

Meta-Model - MSE: 41601216395.49256, RMSE: 203963.76245669858, R2: 0.5557014229463573
