In [1]:
# Import Dependencies
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
import pandas as pd

import pickle

In [2]:
# Read the cleaned St. Paul property listings into a DataFrame and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../Resources/cleaned_st_paul_properties.csv")
df.head(n=5)

Unnamed: 0,list_date,list_price,sold_date,sold_price,beds,baths,sqft,lot_sqft,city,street,zip,latitude,longitude
0,1/23/2025,500000.0,2/28/2025,505003,3.0,2.0,1636.0,5009.0,Saint Paul,2098 Pinehurst Ave,55116,44.918628,-93.189055
1,9/3/2024,189900.0,12/11/2024,189900,2.0,1.0,832.0,4792.0,Saint Paul,536 Edmund Ave,55103,44.958192,-93.122838
2,12/20/2024,275000.0,1/10/2025,266800,3.0,1.0,1297.0,6534.0,Saint Paul,967 California Ave W,55117,44.991249,-93.140914
3,12/1/2024,324900.0,1/31/2025,325000,3.0,1.5,2185.0,6055.0,Saint Paul,627-629 Minnehaha Ave E,55130,44.963264,-93.074138
4,11/19/2024,214900.0,1/9/2025,210000,2.0,1.0,926.0,5001.0,Saint Paul,1631 Woodbridge St,55117,44.990397,-93.108765


In [3]:
# Engineered ratio features added to help raise the model's R²:
#   beds_baths_ratio  - bedrooms per (bathrooms + 1); the +1 presumably
#                       keeps the denominator non-zero for 0-bath listings
#   lot_to_sqft_ratio - lot area relative to living area
df['beds_baths_ratio'] = df['beds'] / (df['baths'] + 1)

# A sqft of 0 would make this ratio +/-inf. dropna() does not treat inf as
# missing, so such a row would survive the later missing-value filter and
# break model fitting. Mask zero sqft to NaN so the ratio becomes NaN and
# the row is dropped together with the other incomplete rows.
df['lot_to_sqft_ratio'] = df['lot_sqft'] / df['sqft'].where(df['sqft'] != 0)

In [4]:
# Column to predict, and the raw + engineered columns used as model inputs
target = 'sold_price'
features = [
    'beds',
    'baths',
    'lot_sqft',
    'sqft',
    'beds_baths_ratio',
    'lot_to_sqft_ratio',
]

# Keep only the modelling columns and discard every row that has a
# missing value in any of them
df_model = df[[*features, target]].dropna()

# Feature matrix and target vector
X = df_model[features]
y = df_model[target]

In [5]:
# Partition the feature columns by dtype so that each group can be given
# its own preprocessing steps
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object', 'category']).columns)

# Numeric columns: fill gaps with the column median, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its matching transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [7]:
# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# End-to-end pipeline: preprocessing followed by a seeded
# Gradient Boosting regressor
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# Cross Validate Performance

In [10]:
# Hyperparameter search space for the pipeline's 'regressor' step
# (keys follow scikit-learn's <step>__<parameter> naming)
param_grid = dict(
    regressor__n_estimators=[200, 300, 400],
    regressor__learning_rate=[0.03, 0.05, 0.07],
    regressor__max_depth=[4, 5, 6],
    regressor__subsample=[0.8, 0.9, 1.0],
)

# Exhaustive 5-fold cross-validated search, scored by R², on all CPU cores
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pipeline with the winning parameter combination
best_model = grid_search.best_estimator_

# Score the winning pipeline on the held-out test split
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"✅ Best R² Score: {r2:.4f}")
print("🏗️ Best Parameters:", grid_search.best_params_)

✅ Best R² Score: 0.8284
🏗️ Best Parameters: {'regressor__learning_rate': 0.07, 'regressor__max_depth': 6, 'regressor__n_estimators': 200, 'regressor__subsample': 0.8}


# Detect and Prevent Overfitting

In [11]:
# Overfitting check: compare R² on the training split with R² on the
# held-out test split for the tuned model.
#
# The grid search has already been fitted on this exact train/test split
# (same test_size and random_state), and every estimator is seeded, so
# re-splitting and re-running the full search here would only repeat the
# same expensive fits and reproduce the same model. Reuse the fitted search.
tuned_model = grid_search.best_estimator_

# Evaluate on training
y_train_pred = tuned_model.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)

# Evaluate on test
y_test_pred = tuned_model.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)

# A training score well above the test score indicates overfitting
print(f"R² on training data: {r2_train:.3f}")
print(f"R² on test data:     {r2_test:.3f}")

R² on training data: 0.985
R² on test data:     0.828


In [14]:
# Persist the tuned pipeline (preprocessing + regressor) with pickle.
# Whoever loads it must supply the same feature columns used for training,
# including the engineered ratio columns.
with open("sold_price_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

print("Model saved as 'sold_price_model.pkl'")

Model saved as 'sold_price_model.pkl'


# Results and Analysis:
We ended up using a Gradient Boosting Regressor because it met the requirement of an R² greater than 0.80 and was a lot easier to optimize.

- After optimizing the Gradient Boosting Regressor, the test R² is 0.8284, which is greater than the required 0.80.
- We tested whether the model was overfitting, and it was slightly: R² is 0.985 on the training data versus 0.828 on the test data.
- We saved the model so that the website can use it.