In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load and clean data
df = pd.read_csv("Resources/st_paul_sold_properties .csv")
df.head()

Unnamed: 0,list_date,list_price,sold_date,sold_price,beds,baths,sqft,lot_sqft,city,street,zip,latitude,longitude
0,2025-01-31,209900.0,2025-03-31,209900,2.0,1.5,904.0,4704.0,Saint Paul,868 Algonquin Ave,55119,44.967562,-93.017718
1,2024-11-15,374900.0,2024-12-13,391000,3.0,1.5,1534.0,4966.0,Saint Paul,1246 Bayard Ave,55116,44.922167,-93.152861
2,2024-12-20,275000.0,2025-01-10,266800,3.0,1.0,1297.0,6534.0,Saint Paul,967 California Ave W,55117,44.991249,-93.140914
3,2025-01-23,500000.0,2025-02-28,505003,3.0,2.0,1636.0,5009.0,Saint Paul,2098 Pinehurst Ave,55116,44.918628,-93.189055
4,2025-03-18,425000.0,2025-04-08,475000,4.0,1.0,1750.0,7362.0,Saint Paul,2129 Sargent Ave,55105,44.935472,-93.190323


In [7]:
# Convert dates
df['list_date'] = pd.to_datetime(df['list_date'], errors='coerce')
df['sold_date'] = pd.to_datetime(df['sold_date'], errors='coerce')

# Drop rows with missing or invalid dates
df = df.dropna(subset=['list_date', 'sold_date'])

# Calculate days_on_market
df['days_on_market'] = (df['sold_date'] - df['list_date']).dt.days
df = df[df['days_on_market'] >= 0]
# Calculate price per sq ft, bed/bath ratio
df['price_per_sqft'] = df['list_price'] / df['sqft']
df['beds_baths_ratio'] = df['beds'] / (df['baths'] + 1)
df['month_listed'] = pd.to_datetime(df['list_date'], errors='coerce').dt.month
df['year_listed'] = pd.to_datetime(df['list_date'], errors='coerce').dt.year

# Drop rows with missing target
df = df.dropna(subset=['days_on_market'])

# Drop unneeded columns
df_model = df.drop(columns=['list_date', 'sold_date', 'city', 'street', 'latitude', 'longitude'], errors='ignore')

# Drop rows with any remaining NaNs
df_model = df_model.dropna()

In [8]:
# Features and target
X = df_model.drop(columns='days_on_market')
y = df_model['days_on_market']

# Apply log1p transformation to target
y_log = np.log1p(y)

In [9]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [10]:
# Define and train models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42)
}

results = {}

for name, model in models.items():
    # Pipeline: impute → scale → model
    pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    
    # Train model
    pipe.fit(X_train, y_train)
    y_pred_log = pipe.predict(X_test)
    
    # Inverse transform predictions
    y_pred = np.expm1(y_pred_log)
    y_true = np.expm1(y_test)

    # Evaluate
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    results[name] = {'MAE': mae, 'R2': r2}

In [11]:
# Show results
results_df = pd.DataFrame(results).T.sort_values(by='MAE')
print("\n📈 Model Performance on days_on_market (log1p-transformed target):")
display(results_df)


📈 Model Performance on days_on_market (log1p-transformed target):


Unnamed: 0,MAE,R2
Random Forest,17.034757,0.652485
Ridge,21.040163,0.519246
Linear Regression,21.068461,0.517035
Lasso,28.682996,-0.067797


# Results
- Random Forest is clearly the best model here, with the lowest MAE (mean absolute error) of ~17.
- The highest R² score of ~0.65, meaning it explains about 65% of the variance in the log-transformed days_on_market.
- Ridge and Linear Regression perform decently, with R² around 0.52, but they are noticeably less accurate (MAE ~21).

# Optimizitation of Random Forest to get a R above 0.80

In [12]:
# Import Dependencies for Optimization
from sklearn.model_selection import GridSearchCV

In [13]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Preprocessing pipeline
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(random_state=42))
])

param_dist = {
    'model__n_estimators': randint(100, 500),
    'model__max_depth': [10, 20, 30, None],
    'model__min_samples_split': randint(2, 10),
    'model__min_samples_leaf': randint(1, 5)
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)


In [16]:
print(f"Best Parameters: {random_search.best_params_}")

Best Parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 3, 'model__n_estimators': 443}


In [17]:
y_pred = random_search.best_estimator_.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"Optimized Model R²: {r2:.4f}")


Optimized Model R²: 0.4826


In [15]:
# # Evaluate best Model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"Best Parameters: {grid_search.best_params_}")
# print(f"MAE: {mae:.2f}")
# print(f"R²: {r2:.4f}")