In [1]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [2]:
# Load data
file = "listings_with_locations.csv"

file_path = Path("data/" + file)
listings_df = pd.read_csv(file_path, index_col="mls_id", header=0)


listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034


In [3]:
# Remove luxury listings with more than 5 bathrooms or more than 4 beds 
listings_df = listings_df[(listings_df['baths'] != 0) & (listings_df['baths'] <= 5) & (listings_df['beds'] <= 4) & (listings_df['beds'] != 0) & (listings_df['dens'] <= 2)]

# Remove rows with missing latitude values
listings_df = listings_df.dropna(subset=["latitude"])

In [4]:
# Function to remove outliers (IQR)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers for each property type

for i in range(4):
    cleaned_df = pd.DataFrame()
    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'price')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])
    listings_df = cleaned_df

In [5]:
listings_df.count()

url              3658
address          3658
price            3658
baths            3658
beds             3658
dens             3658
street           3658
neighbourhood    3658
city             3658
property_type    3658
date_scraped     3658
latitude         3658
longitude        3658
dtype: int64

In [6]:
# Feature engineering
listings_df['rel_latitude'] = listings_df['latitude'] - 43
listings_df['rel_longitude'] = listings_df['longitude'] + 79

listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034


In [7]:
listings_df.nunique()

url              3658
address          3654
price            1088
baths               5
beds                4
dens                3
street           1141
neighbourhood     143
city                1
property_type       4
date_scraped        1
latitude         2078
longitude        2079
rel_latitude     2078
rel_longitude    2079
dtype: int64

In [8]:
listings_df['neighbourhood'].value_counts()

Waterfront Communities C1            332
Church-Yonge Corridor                189
Mimico                               153
Islington-City Centre West           118
Willowdale East                      115
                                    ... 
Lambton Baby Point                     3
Pleasant View                          2
Bridle Path-Sunnybrook-York Mills      2
Forest Hill South                      1
Playter Estates-Danforth               1
Name: neighbourhood, Length: 143, dtype: int64

In [9]:
# Prepare data for training
cat_columns = ["neighbourhood", "property_type"]
num_columns = ["baths", "beds", "dens", "rel_latitude", "rel_longitude"]
y = listings_df["price"]

In [10]:
listings_df_prepared = pd.get_dummies(listings_df[cat_columns], dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Woodbine-Lumsden,neighbourhood_Wychwood,neighbourhood_Yonge-Eglinton,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
listings_df_prepared[num_columns] = listings_df[num_columns]
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome,baths,beds,dens,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.687469,-0.301861
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.650343,-0.387806
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.622946,-0.481658
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,2,0,0.658361,-0.351159
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.665267,-0.341034


In [12]:
X = listings_df_prepared
y = listings_df["price"]

In [13]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# Create a pipeline with StandardScaler and RandomForestRegressor
rf_pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=500, random_state=1))

In [15]:
# Train the model and evaluate it using cross-validation
cv_scores = cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

print("Cross-Validation MAE Scores:", -cv_scores)
print("Mean CV MAE:", -cv_scores.mean())

Cross-Validation MAE Scores: [105309.29443354 103889.95290712 102980.0790088  113227.96274505
 105141.73815739 108332.26017879 121621.64615524 117378.02015854
 128127.18344849 120376.79284492]
Mean CV MAE: 112638.49300378762


In [16]:
# Fit the model on the training set
rf_pipeline.fit(X_train, y_train)

# Make predictions on the test set
predictions_test = rf_pipeline.predict(X_test)

In [17]:
# Evaluate the model on the test set
rf_mae_test = mean_absolute_error(y_test, predictions_test)
print("Random Forest Test MAE:", rf_mae_test)

Random Forest Test MAE: 107851.1773061482


In [18]:
# Make predictions on the training set
predictions_train = rf_pipeline.predict(X_train)

In [19]:
test_data = {"Actual": y_test,
        "Predicted": predictions_test}
test_df = pd.DataFrame(test_data)

test_df["ratio"] = abs (test_df["Actual"] - test_df["Predicted"]) / test_df["Actual"]
test_df["Difference"] = abs(test_df["Actual"] - test_df["Predicted"])


train_data = {"Actual": y_train,
        "Predicted": predictions_train}
train_df = pd.DataFrame(train_data)

train_df["ratio"] = abs (train_df["Actual"] - train_df["Predicted"]) / train_df["Actual"]
train_df["Difference"] = abs(train_df["Actual"] - train_df["Predicted"])

comparison_df = pd.concat([test_df, train_df], axis = 0)

og_comparison_df = pd.concat([comparison_df, listings_df], axis = 1)

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
E7392872,525000,504187.7,0.039643,20812.315333,https://toronto.listing.ca/3311-kingston-rd-14...,3311 Kingston Rd 1409,525000,1,2,0,Kingston Rd,Scarborough Village,Toronto,condo_apartment,2024-01-30,43.73332,-79.222392,0.73332,-0.222392
C7403516,649000,679001.4,0.046227,30001.355733,https://toronto.listing.ca/260-seneca-hill-dr-...,260 Seneca Hill Dr 912,649000,1,2,0,Seneca Hill Dr,Don Valley Village,Toronto,condo_apartment,2024-01-30,43.790149,-79.357271,0.790149,-0.357271
W7398652,1235000,1334841.0,0.080843,99841.086,https://toronto.listing.ca/80-amesbury-dr.W739...,80 Amesbury Dr,1235000,3,3,1,Amesbury Dr,Brookhaven-Amesbury,Toronto,detached_home,2024-01-30,43.705257,-79.482238,0.705257,-0.482238
E7345710,1198800,1098083.0,0.084015,100717.432,https://toronto.listing.ca/15-stonehenge-cres....,15 Stonehenge Cres,1198800,2,3,2,Stonehenge Cres,Woburn,Toronto,detached_home,2024-01-30,43.770728,-79.226503,0.770728,-0.226503
E7266274,674900,594227.2,0.119533,80672.775905,https://toronto.listing.ca/4727-sheppard-ave-1...,4727 Sheppard Ave 1501,674900,2,2,1,Sheppard Ave,Agincourt South-Malvern West,Toronto,condo_apartment,2024-01-30,43.791598,-79.248969,0.791598,-0.248969


In [20]:
sorted_comparison = og_comparison_df.sort_values(by="ratio", ascending=False)
sorted_comparison.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
W7301712,1149000,2453916.0,1.135697,1304916.0,https://toronto.listing.ca/4-monaco-crt.W73017...,4 Monaco Crt,1149000,5,4,1,Monaco Crt,Black Creek,Toronto,detached_home,2024-01-30,43.765226,-79.528315,0.765226,-0.528315
W7250092,439900,813121.1,0.848423,373221.1,https://toronto.listing.ca/320-dixon-rd-115.W7...,320 Dixon Rd 115,439900,2,3,0,Dixon Rd,Kingsview Village-The Westway,Toronto,condo_apartment,2024-01-30,43.696811,-79.552502,0.696811,-0.552502
W7374132,949999,1749320.0,0.841391,799320.6,https://toronto.listing.ca/81-woolenscote-circ...,81 Woolenscote Circ,949999,4,3,2,Woolenscote Circ,West Humber-Clairville,Toronto,detached_home,2024-01-30,43.732734,-79.602049,0.732734,-0.602049
W7380994,1325000,2415641.0,0.823125,1090641.0,https://toronto.listing.ca/27-craydon-ave.W738...,27 Craydon Ave,1325000,5,3,0,Craydon Ave,Mount Dennis,Toronto,detached_home,2024-01-30,43.690797,-79.497534,0.690797,-0.497534
E8028440,974888,1744762.0,0.789705,769873.5,https://toronto.listing.ca/20-murmouth-rd.E802...,20 Murmouth Rd,974888,2,3,0,Murmouth Rd,Tam O'Shanter-Sullivan,Toronto,detached_home,2024-01-30,43.777314,-79.294474,0.777314,-0.294474


In [21]:
# Neighbourhood ratios (margin of error)
neigh_comparison = og_comparison_df.groupby("neighbourhood")

neigh_comparison["ratio"].mean().sort_values(ascending=False)

neighbourhood
Corso Italia-Davenport      0.244090
Mount Dennis                0.171992
Black Creek                 0.162954
Playter Estates-Danforth    0.161109
Pleasant View               0.160248
                              ...   
Bendale                     0.032421
Dorset Park                 0.031512
Kensington-Chinatown        0.031254
Bayview Woods-Steeles       0.030776
Ionview                     0.015568
Name: ratio, Length: 143, dtype: float64

In [22]:
sorted_comparison.loc[sorted_comparison["neighbourhood"] == "Corso Italia-Davenport"]

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
W8028100,995000,1382025.696,0.388971,387025.696,https://toronto.listing.ca/1240-lansdowne-ave....,1240 Lansdowne Ave,995000,3,4,0,Lansdowne Ave,Corso Italia-Davenport,Toronto,freehold _townhome,2024-01-30,43.673685,-79.449759,0.673685,-0.449759
W8031028,849000,1056897.146,0.244873,207897.146,https://toronto.listing.ca/146-ascot-ave.W8031...,146 Ascot Ave,849000,1,2,0,Ascot Ave,Corso Italia-Davenport,Toronto,detached_home,2024-01-30,43.678885,-79.450059,0.678885,-0.450059
W7390820,1899000,1712086.728,0.098427,186913.272,https://toronto.listing.ca/127-greenlaw-ave.W7...,127 Greenlaw Ave,1899000,3,3,1,Greenlaw Ave,Corso Italia-Davenport,Toronto,detached_home,2024-01-30,43.675711,-79.447539,0.675711,-0.447539


In [23]:
sorted_comparison[["price", "beds", "dens"]].max()

price    3275000
beds           4
dens           2
dtype: int64