In [1]:
import numpy as np
import pandas as pd

In [15]:
df = pd.read_csv('cleaned_canada.csv')
df_preprocess = df.copy()

In [16]:
# Drop features with too many missing value
missing_thresh = 0.6
drop_cols = [col for col in df_preprocess.columns if df_preprocess[col].isnull().mean() > missing_thresh]
df_preprocess.drop(columns=drop_cols, inplace=True)
df_preprocess.drop(columns=['Acreage'], inplace=True)

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
df_25M = df_preprocess[df_preprocess['Price'] > 2500000].copy()
df_25M.drop(columns=['Latitude', 'Longitude'], inplace=True)
num_cols = df_25M.drop(columns=['Price']).select_dtypes(include=["float64", "int"]).columns.tolist()
cat_cols = df_25M.select_dtypes(include=["object"]).columns.tolist()
# deal with missing values
imputer = SimpleImputer(strategy='most_frequent')
df_25M[cat_cols] = imputer.fit_transform(df_25M[cat_cols])
# OneHot Coding
df_25M = pd.get_dummies(df_25M, columns=cat_cols, dummy_na=False)
# Standard scaling
scaler = StandardScaler()
df_25M[num_cols] = scaler.fit_transform(df_25M[num_cols])

In [18]:
from sklearn.model_selection import train_test_split
X_25M = df_25M.drop(columns=['Price'])
y_25M = df_25M['Price']
X_train_25M, X_test_25M, y_train_25M, y_test_25M = train_test_split(X_25M, y_25M, test_size=0.2, random_state=42)

In [19]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
model_lr_25M = LinearRegression()
model_lr_25M.fit(X_train_25M, y_train_25M)
y_pred_lr_25M = model_lr_25M.predict(X_test_25M)
mse_lr_25M = mean_squared_error(y_test_25M, y_pred_lr_25M)
rmse_lr_25M = np.sqrt(mse_lr_25M)
r2_lr_25M = r2_score(y_test_25M, y_pred_lr_25M)
norm_mse_lr_25M = mse_lr_25M / (max(y_test_25M) - min(y_test_25M))

print(f"Linear Regression (>2.5M) - MSE: {mse_lr_25M}, RMSE: {rmse_lr_25M}, R2: {r2_lr_25M}, NormalizedMSE: {norm_mse_lr_25M}")

Linear Regression (>2.5M) - MSE: 5900633253583.602, RMSE: 2429121.90998797, R2: 0.1793921035324575, NormalizedMSE: 223840.8574894709


In [20]:
# Ridge Regression
from sklearn.linear_model import Ridge
model_ridge_25M = Ridge(alpha=0.1)
model_ridge_25M.fit(X_train_25M, y_train_25M)
y_pred_ridge_25M = model_ridge_25M.predict(X_test_25M)
mse_ridge_25M = mean_squared_error(y_test_25M, y_pred_ridge_25M)
rmse_ridge_25M = np.sqrt(mse_ridge_25M)
r2_ridge_25M = r2_score(y_test_25M, y_pred_ridge_25M)
norm_mse_ridge_25M = mse_ridge_25M / (max(y_test_25M) - min(y_test_25M))

print(f"Ridge Regression (>2.5M) - MSE: {mse_ridge_25M}, RMSE: {rmse_ridge_25M}, R2: {r2_ridge_25M}, NormalizedMSE: {norm_mse_ridge_25M}")

Ridge Regression (>2.5M) - MSE: 5842983102444.309, RMSE: 2417226.3242080393, R2: 0.1874095767805568, NormalizedMSE: 221653.89573285272


In [None]:
# gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
model_gb_25M = GradientBoostingRegressor(random_state=42)
model_gb_25M.fit(X_train_25M, y_train_25M)
y_pred_gb_25M = model_gb_25M.predict(X_test_25M)
mse_gb_25M = mean_squared_error(y_test_25M, y_pred_gb_25M)
rmse_gb_25M = np.sqrt(mse_gb_25M)
r2_gb_25M = r2_score(y_test_25M, y_pred_gb_25M)
norm_mse_gb_25M = mse_gb_25M / (max(y_test_25M) - min(y_test_25M))

print(f"Gradient Boosting (>2.5M) - MSE: {mse_gb_25M}, RMSE: {rmse_gb_25M}, R2: {r2_gb_25M}, NormalizedMSE: {norm_mse_gb_25M}")

In [None]:
# Random forest
from sklearn.ensemble import RandomForestRegressor
model_rf_25M = RandomForestRegressor(n_estimators=500, random_state=42)
model_rf_25M.fit(X_train_25M, y_train_25M)
y_pred_rf_25M = model_rf_25M.predict(X_test_25M)

mse_rf_25M = mean_squared_error(y_test_25M, y_pred_rf_25M)
rmse_rf_25M = np.sqrt(mse_rf_25M)
r2_rf_25M = r2_score(y_test_25M, y_pred_rf_25M)
norm_mse_rf_25M = mse_rf_25M / (max(y_test_25M) - min(y_test_25M))

print(f"Random Forest (>2.5M) - MSE: {mse_rf_25M}, RMSE: {rmse_rf_25M}, R2: {r2_rf_25M}, NormalizedMSE: {norm_mse_rf_25M}")

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
model_dt_25M = DecisionTreeRegressor(random_state=42)
model_dt_25M.fit(X_train_25M, y_train_25M)
y_pred_dt_25M = model_dt_25M.predict(X_test_25M)
mse_dt_25M = mean_squared_error(y_test_25M, y_pred_dt_25M)
rmse_dt_25M = np.sqrt(mse_dt_25M)
r2_dt_25M = r2_score(y_test_25M, y_pred_dt_25M)
norm_mse_dt_25M = mse_dt_25M / (max(y_test_25M) - min(y_test_25M))

print(f"Decision Tree (>2.5M) - MSE: {mse_dt_25M}, RMSE: {rmse_dt_25M}, R2: {r2_dt_25M}, NormalizedMSE: {norm_mse_dt_25M}")

In [None]:
# hyperparameter grid for Ridge Regression
from sklearn.model_selection import GridSearchCV
param_grid_ridge = {'alpha': [0.1, 1.0, 0.5, 0.6, 0.7, 0.8, 0.9],
                    'solver': ['svd','auto']
                    }
models = {
    'Ridge Regression': (Ridge(), param_grid_ridge)
}
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')  # Use 5-fold cross-validation
    grid_search.fit(X_train_25M, y_train_25M)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_25M)
    mse = mean_squared_error(y_test_25M, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_25M, y_pred)
    print(f"{model_name} (with Feature Selection) - Best Hyperparameters: {grid_search.best_params_}")
    print(f"{model_name} (with Feature Selection) - MSE: {mse}, RMSE: {rmse}, R2: {r2}")

In [None]:
# Hyper parameter tunning for rf
param_grid_rf = {
    'n_estimators': [100, 500],
    'max_depth': [None, 10, 20]
}
models = {
    'Random Forest': (RandomForestRegressor(random_state=42), param_grid_rf)
}
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')  # Use 5-fold cross-validation
    grid_search.fit(X_train_25M, y_train_25M)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_25M)
    mse = mean_squared_error(y_test_25M, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_25M, y_pred)
    print(f"{model_name} (with Feature Selection) - Best Hyperparameters: {grid_search.best_params_}")
    print(f"{model_name} (with Feature Selection) - MSE: {mse}, RMSE: {rmse}, R2: {r2}")

In [None]:
# Hyper parameter tunning for gradient boosting
param_grid_gb = {
    'n_estimators': [100, 500],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5]
}

models = {
    'Gradient Boosting': (GradientBoostingRegressor(random_state=42), param_grid_gb)
}
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')  # Use 5-fold cross-validation
    grid_search.fit(X_train_25M, y_train_25M)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_25M)
    mse = mean_squared_error(y_test_25M, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_25M, y_pred)
    print(f"{model_name} (with Feature Selection) - Best Hyperparameters: {grid_search.best_params_}")
    print(f"{model_name} (with Feature Selection) - MSE: {mse}, RMSE: {rmse}, R2: {r2}")

In [None]:
# Meta regressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
estimators = [
    ('lr', LinearRegression()),
    ('ridge', Ridge(alpha = 1.0, solver = 'auto')),
    ('gb', GradientBoostingRegressor(learning_rate = 0.1, max_depth = 3, min_samples_split = 2, n_estimators = 100, random_state=42)),
    ('rf', RandomForestRegressor(max_depth=10, n_estimators=100, random_state=42))
]

# Define the meta-learner
final_estimator = LinearRegression()

# Create the stacking regressor
meta_model = StackingRegressor(estimators=estimators, final_estimator=final_estimator, cv=5)
meta_model.fit(X_train_25M, y_train_25M)
y_pred_25M = meta_model.predict(X_test_25M)
mse = mean_squared_error(y_test_25M, y_pred_25M)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_25M, y_pred_25M)

print(f"Meta-Model - MSE: {mse}, RMSE: {rmse}, R2: {r2}")