In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [2]:
# Load data
file = "all_listings_2024-02-05.csv"

file_path = Path("../data/sqft/" + file)
price_sqft_df = pd.read_csv(file_path, index_col="mls_id", header=0)


price_sqft_df.head()

Unnamed: 0_level_0,url,price,address,beds,dens,baths,sqft,property_type,street,neighbourhood,city,date_scraped
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C7326020,https://torontocondoteam.ca/5106-1-bloor-st-e-...,1699000,5106 - 1 Bloor St E,2,1,3,1000-1199,condo_apartment,Bloor St E,Church-Yonge Corridor,Toronto,2024-02-05
C7362210,https://torontocondoteam.ca/1604-181-huron-st-...,743000,1604 - 181 Huron St,1,0,1,0-499,condo_apartment,Huron St,Kensington-Chinatown,Toronto,2024-02-05
C7251458,https://torontocondoteam.ca/5109-14-york-st-c7...,698888,5109 - 14 York St,1,0,1,500-599,condo_apartment,York St,Waterfront Communities C1,Toronto,2024-02-05
C6792974,https://torontocondoteam.ca/3404-77-harbour-sq...,669900,3404 - 77 Harbour Sq,1,1,1,600-699,condo_apartment,Harbour Sq,Waterfront Communities C1,Toronto,2024-02-05
C6802076,https://torontocondoteam.ca/906-51-trolley-cre...,575000,906 - 51 Trolley Cres,1,0,1,500-599,condo_apartment,Trolley Cres,Moss Park,Toronto,2024-02-05


In [3]:
# Extract max sqft
price_sqft_df['sqft'] = price_sqft_df['sqft'].str.extract(r'(\d+)-(\d+)').astype(float).apply(lambda x: np.nanmax(x), axis=1)

# Convert NaN to 0 
price_sqft_df['sqft'] = price_sqft_df['sqft'].fillna(0)

# Convert sqft to integer 
price_sqft_df['sqft'] = price_sqft_df['sqft'].astype(int)

price_sqft_df.head()

  price_sqft_df['sqft'] = price_sqft_df['sqft'].str.extract(r'(\d+)-(\d+)').astype(float).apply(lambda x: np.nanmax(x), axis=1)


Unnamed: 0_level_0,url,price,address,beds,dens,baths,sqft,property_type,street,neighbourhood,city,date_scraped
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C7326020,https://torontocondoteam.ca/5106-1-bloor-st-e-...,1699000,5106 - 1 Bloor St E,2,1,3,1199,condo_apartment,Bloor St E,Church-Yonge Corridor,Toronto,2024-02-05
C7362210,https://torontocondoteam.ca/1604-181-huron-st-...,743000,1604 - 181 Huron St,1,0,1,499,condo_apartment,Huron St,Kensington-Chinatown,Toronto,2024-02-05
C7251458,https://torontocondoteam.ca/5109-14-york-st-c7...,698888,5109 - 14 York St,1,0,1,599,condo_apartment,York St,Waterfront Communities C1,Toronto,2024-02-05
C6792974,https://torontocondoteam.ca/3404-77-harbour-sq...,669900,3404 - 77 Harbour Sq,1,1,1,699,condo_apartment,Harbour Sq,Waterfront Communities C1,Toronto,2024-02-05
C6802076,https://torontocondoteam.ca/906-51-trolley-cre...,575000,906 - 51 Trolley Cres,1,0,1,599,condo_apartment,Trolley Cres,Moss Park,Toronto,2024-02-05


In [4]:
price_sqft_df.count()

url              2543
price            2543
address          2543
beds             2543
dens             2543
baths            2543
sqft             2543
property_type    2543
street           2543
neighbourhood    2543
city             2543
date_scraped     2543
dtype: int64

In [5]:
# Create price per sqft
def create_price_per_sqft(listings_df):
    listings_df['price_per_sqft'] = listings_df['price'] / listings_df['sqft']
    listings_df['price_per_sqft'] = listings_df['price_per_sqft'].round(0)
    return listings_df  

listings_df = create_price_per_sqft(price_sqft_df)
listings_df.head()

Unnamed: 0_level_0,url,price,address,beds,dens,baths,sqft,property_type,street,neighbourhood,city,date_scraped,price_per_sqft
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C7326020,https://torontocondoteam.ca/5106-1-bloor-st-e-...,1699000,5106 - 1 Bloor St E,2,1,3,1199,condo_apartment,Bloor St E,Church-Yonge Corridor,Toronto,2024-02-05,1417.0
C7362210,https://torontocondoteam.ca/1604-181-huron-st-...,743000,1604 - 181 Huron St,1,0,1,499,condo_apartment,Huron St,Kensington-Chinatown,Toronto,2024-02-05,1489.0
C7251458,https://torontocondoteam.ca/5109-14-york-st-c7...,698888,5109 - 14 York St,1,0,1,599,condo_apartment,York St,Waterfront Communities C1,Toronto,2024-02-05,1167.0
C6792974,https://torontocondoteam.ca/3404-77-harbour-sq...,669900,3404 - 77 Harbour Sq,1,1,1,699,condo_apartment,Harbour Sq,Waterfront Communities C1,Toronto,2024-02-05,958.0
C6802076,https://torontocondoteam.ca/906-51-trolley-cre...,575000,906 - 51 Trolley Cres,1,0,1,599,condo_apartment,Trolley Cres,Moss Park,Toronto,2024-02-05,960.0


In [6]:
# Remove luxury listings with more than 5 bathrooms or more than 4 beds 
listings_df = listings_df[(listings_df['baths'] != 0) & (listings_df['baths'] <= 5) & (listings_df['beds'] <= 4) & (listings_df['beds'] != 0) & (listings_df['dens'] <= 2)]


In [7]:
# Function to remove outliers (IQR)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers for each property type

for i in range(4):
    cleaned_df = pd.DataFrame()
    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'price')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])
    listings_df = cleaned_df

In [8]:
listings_df.count()

url               1849
price             1849
address           1849
beds              1849
dens              1849
baths             1849
sqft              1849
property_type     1849
street            1849
neighbourhood     1849
city              1849
date_scraped      1849
price_per_sqft    1849
dtype: int64

In [9]:
listings_df['neighbourhood'].value_counts()

Waterfront Communities C1        187
Church-Yonge Corridor             93
Mimico                            84
Willowdale East                   67
Niagara                           61
                                ... 
Trinity-Bellwoods                  1
Blake-Jones                        1
Thistletown-Beaumonde Heights      1
Danforth                           1
Rustic                             1
Name: neighbourhood, Length: 134, dtype: int64

In [10]:
# Prepare data for training
cat_columns = ["neighbourhood", "property_type"]
num_columns = ["baths", "beds", "dens", "price_per_sqft", "sqft"]
y = listings_df["price"]

In [11]:
listings_df_prepared = pd.get_dummies(listings_df[cat_columns], dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Willowridge-Martingrove-Richview,neighbourhood_Woburn,neighbourhood_Woodbine Corridor,neighbourhood_Woodbine-Lumsden,neighbourhood_Wychwood,neighbourhood_Yonge-Eglinton,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C7362210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
C7251458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
C6792974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
C6802076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
W7358808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
listings_df_prepared[num_columns] = listings_df[num_columns]
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Yonge-Eglinton,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,baths,beds,dens,price_per_sqft,sqft
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C7362210,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,1489.0,499
C7251458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,1167.0,599
C6792974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,958.0,699
C6802076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,960.0,599
W7358808,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,2,2,0,876.0,799


In [13]:
X = listings_df_prepared
y = listings_df["price"]

In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [20]:
# Remove infinite values
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_train = X_train.dropna()

# Remove corresponding rows from y_train
y_train = y_train.loc[X_train.index]

In [21]:
# Create a pipeline with StandardScaler and RandomForestRegressor
rf_pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=500, random_state=1))

In [22]:
# Scale features 
X_train_scaled = StandardScaler().fit_transform(X_train)

In [23]:
# Train the model and evaluate it using cross-validation
cv_scores = cross_val_score(rf_pipeline, X_train_scaled, y_train, cv=5, scoring='neg_mean_absolute_error')

print("Cross-Validation MAE Scores:", -cv_scores)
print("Mean CV MAE:", -cv_scores.mean())

Cross-Validation MAE Scores: [ 7929.42226532 10913.05833099 13674.74699652 13829.57580855
 13675.36264067]
Mean CV MAE: 12004.433208410284


In [25]:
# Remove infinite values from X_test
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.dropna()

# Remove corresponding rows from y_test
y_test = y_test.loc[X_test.index]

In [28]:
# Fit the StandardScaler on the training data
rf_pipeline.fit(X_train, y_train)

# Scale features for the test set using the fitted StandardScaler
X_test_scaled = rf_pipeline.named_steps['standardscaler'].transform(X_test)

# Now, you can make predictions on the scaled test set
predictions_test = rf_pipeline.predict(X_test_scaled)



In [29]:
# Evaluate the model on the test set
rf_mae_test = mean_absolute_error(y_test, predictions_test)
print("Random Forest Test MAE:", rf_mae_test)

Random Forest Test MAE: 460484.6829831326


In [30]:
# Make predictions on the training set
predictions_train = rf_pipeline.predict(X_train)

In [31]:
test_data = {"Actual": y_test,
        "Predicted": predictions_test}
test_df = pd.DataFrame(test_data)

test_df["ratio"] = abs (test_df["Actual"] - test_df["Predicted"]) / test_df["Actual"]
test_df["Difference"] = abs(test_df["Actual"] - test_df["Predicted"])


train_data = {"Actual": y_train,
        "Predicted": predictions_train}
train_df = pd.DataFrame(train_data)

train_df["ratio"] = abs (train_df["Actual"] - train_df["Predicted"]) / train_df["Actual"]
train_df["Difference"] = abs(train_df["Actual"] - train_df["Predicted"])

comparison_df = pd.concat([test_df, train_df], axis = 0)

og_comparison_df = pd.concat([comparison_df, listings_df], axis = 1)

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,price,address,beds,dens,baths,sqft,property_type,street,neighbourhood,city,date_scraped,price_per_sqft
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
C7018306,768000.0,270057.6,0.648363,497942.4,https://torontocondoteam.ca/3201-426-universit...,768000,3201 - 426 University Ave,1,1,1,799,condo_apartment,University Ave,University,Toronto,2024-02-05,961.0
C7362182,938000.0,269937.4,0.71222,668062.6,https://torontocondoteam.ca/1804-83-redpath-av...,938000,1804 - 83 Redpath Ave,2,0,2,999,condo_apartment,Redpath Ave,Mount Pleasant West,Toronto,2024-02-05,939.0
C7002864,649900.0,269937.4,0.584648,379962.6,https://torontocondoteam.ca/lph06-16-harrison-...,649900,Lph06 - 16 Harrison Garden Blvd,1,0,1,599,condo_apartment,Lph06 - 16 Harrison Garden Blvd,Willowdale East,Toronto,2024-02-05,1085.0
W7034258,949000.0,269937.4,0.715556,679062.6,https://torontocondoteam.ca/2110-1-aberfoyle-c...,949000,2110 - 1 Aberfoyle Cres,2,0,2,1399,condo_apartment,Aberfoyle Cres,Islington-City Centre West,Toronto,2024-02-05,678.0
C7233376,728000.0,270460.4,0.628488,457539.6,https://torontocondoteam.ca/4205-85-queens-wha...,728000,4205 - 85 Queens Wharf Rd,1,1,1,699,condo_apartment,Queens Wharf Rd,Waterfront Communities C1,Toronto,2024-02-05,1041.0


In [32]:
sorted_comparison = og_comparison_df.sort_values(by="ratio", ascending=False)
sorted_comparison.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,price,address,beds,dens,baths,sqft,property_type,street,neighbourhood,city,date_scraped,price_per_sqft
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
E7386944,1299000.0,269937.4,0.792196,1029062.6,https://torontocondoteam.ca/301-35-boardwalk-d...,1299000,301 - 35 Boardwalk Dr,2,0,2,1399,condo_apartment,Boardwalk Dr,The Beaches,Toronto,2024-02-05,929.0
E7211614,1299000.0,269937.4,0.792196,1029062.6,https://torontocondoteam.ca/104-14-dewhurst-bl...,1299000,104 - 14 Dewhurst Blvd,2,0,2,1199,condo_apartment,Dewhurst Blvd,Danforth,Toronto,2024-02-05,1083.0
C7389668,1249900.0,269937.4,0.784033,979962.6,https://torontocondoteam.ca/707-200-sudbury-st...,1249900,707 - 200 Sudbury St,2,0,2,899,condo_apartment,Sudbury St,Little Portugal,Toronto,2024-02-05,1390.0
W7214952,1249999.0,270057.6,0.783954,979941.4,https://torontocondoteam.ca/2507-2121-lake-sho...,1249999,2507 - 2121 Lake Shore Blvd W,2,1,2,1199,condo_apartment,Lake Shore Blvd W,Mimico,Toronto,2024-02-05,1043.0
C6787140,1289000.0,281405.398,0.781687,1007594.602,https://torontocondoteam.ca/303-188-spadina-av...,1289000,303 - 188 Spadina Ave,3,0,2,1799,condo_apartment,Spadina Ave,Kensington-Chinatown,Toronto,2024-02-05,717.0


In [33]:
# Neighbourhood ratios (margin of error)
neigh_comparison = og_comparison_df.groupby("neighbourhood")

neigh_comparison["ratio"].mean().sort_values(ascending=False)

neighbourhood
Danforth                         0.792196
Humberlea-Pelmo Park W4          0.741192
Humber Summit                    0.550029
Guildwood                        0.535620
Trinity-Bellwoods                0.532980
                                   ...   
Lambton Baby Point                    NaN
Lawrence Park South                   NaN
Oakridge                              NaN
Runnymede-Bloor West Village          NaN
Thistletown-Beaumonde Heights         NaN
Name: ratio, Length: 134, dtype: float64

In [37]:
sorted_comparison.loc[sorted_comparison["neighbourhood"] == "Danforth"]

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,price,address,beds,dens,baths,sqft,property_type,street,neighbourhood,city,date_scraped,price_per_sqft
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
E7211614,1299000.0,269937.4,0.792196,1029062.6,https://torontocondoteam.ca/104-14-dewhurst-bl...,1299000,104 - 14 Dewhurst Blvd,2,0,2,1199,condo_apartment,Dewhurst Blvd,Danforth,Toronto,2024-02-05,1083.0


In [38]:
sorted_comparison[["price", "beds", "dens"]].max()

price    1300000
beds           4
dens           2
dtype: int64