In [527]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

In [528]:
# Load data
file = "listings_with_locations.csv"

file_path = Path("../data/" + file)
listings_df = pd.read_csv(file_path, index_col="mls_id", header=0)


listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034


In [529]:
# Remove luxury listings with more than 5 bathrooms or more than 4 beds 
listings_df = listings_df[(listings_df['baths'] != 0) & (listings_df['baths'] <= 5) & (listings_df['beds'] <= 4) & (listings_df['beds'] != 0) & (listings_df['dens'] <= 2)  & (listings_df['price'] <= 2000000)]

# Remove rows with missing latitude values
listings_df = listings_df.dropna(subset=["latitude"])

In [530]:
# Function to remove outliers (IQR)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers for each property type

for i in range(4):
    cleaned_df = pd.DataFrame()
    for property_type in listings_df['neighbourhood'].unique():
        subset_df = listings_df[listings_df['neighbourhood'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'price')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])
    listings_df = cleaned_df

    cleaned_df = pd.DataFrame()
    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'price')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])
    listings_df = cleaned_df

In [531]:
listings_df = listings_df[listings_df.groupby('neighbourhood').city.transform('count')>10].copy() 

In [532]:
listings_df.count()

url              2862
address          2862
price            2862
baths            2862
beds             2862
dens             2862
street           2862
neighbourhood    2862
city             2862
property_type    2862
date_scraped     2862
latitude         2862
longitude        2862
dtype: int64

In [533]:
# Feature engineering
listings_df['rel_latitude'] = listings_df['latitude'] - 43
listings_df['rel_longitude'] = listings_df['longitude'] + 79

listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
C8030612,https://toronto.listing.ca/35-mercer-st-2810.C...,35 Mercer St 2810,1039000,2,2,0,Mercer St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.645471,-79.391166,0.645471,-0.391166
C8030426,https://toronto.listing.ca/55-mercer-st-710.C8...,55 Mercer St 710,999000,2,1,1,Mercer St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.645397,-79.391362,0.645397,-0.391362
C8030300,https://toronto.listing.ca/35-mercer-st-3711.C...,35 Mercer St 3711,1037000,2,2,0,Mercer St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.645471,-79.391166,0.645471,-0.391166
C8029936,https://toronto.listing.ca/608-richmond-st-100...,608 Richmond St 1001,665000,1,1,1,Richmond St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.646841,-79.402888,0.646841,-0.402888


In [534]:
listings_df.nunique()

url              2862
address          2859
price             825
baths               5
beds                4
dens                3
street            699
neighbourhood      80
city                1
property_type       4
date_scraped        1
latitude         1402
longitude        1403
rel_latitude     1402
rel_longitude    1403
dtype: int64

In [535]:
listings_df['neighbourhood'].value_counts()

neighbourhood
Waterfront Communities C1     327
Church-Yonge Corridor         187
Mimico                        149
Islington-City Centre West    101
Bay Street Corridor           100
                             ... 
Edenbridge-Humber Valley       12
Humewood-Cedarvale             12
Greenwood-Coxwell              12
Yonge-Eglinton                 11
North St. James Town           11
Name: count, Length: 80, dtype: int64

In [536]:
# Prepare data for training
cat_columns = ["neighbourhood", "property_type"]
num_columns = ["baths", "beds", "dens", "rel_latitude", "rel_longitude"]
y = listings_df["price"]

In [537]:
listings_df_prepared = pd.get_dummies(listings_df[cat_columns], dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bendale,neighbourhood_Birchcliffe-Cliffside,neighbourhood_Black Creek,neighbourhood_Brookhaven-Amesbury,...,neighbourhood_Willowdale East,neighbourhood_Willowdale West,neighbourhood_Woburn,neighbourhood_Yonge-Eglinton,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8030612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8030426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8030300,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8029936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [538]:
listings_df_prepared[num_columns] = listings_df[num_columns]
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bendale,neighbourhood_Birchcliffe-Cliffside,neighbourhood_Black Creek,neighbourhood_Brookhaven-Amesbury,...,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome,baths,beds,dens,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.650343,-0.387806
C8030612,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,2,0,0.645471,-0.391166
C8030426,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,1,1,0.645397,-0.391362
C8030300,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,2,0,0.645471,-0.391166
C8029936,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.646841,-0.402888


In [539]:
X = listings_df_prepared
y = listings_df["price"]

In [540]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [541]:
# Create a pipeline with StandardScaler and RandomForestRegressor
rf_pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=500, random_state=1))

In [542]:
# Train the model and evaluate it using cross-validation
cv_scores = cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

print("Cross-Validation MAE Scores:", -cv_scores)
print("Mean CV MAE:", -cv_scores.mean())

Cross-Validation MAE Scores: [77892.86281319 82105.46965541 74215.74633477 79599.92711099
 75516.80283487]
Mean CV MAE: 77866.16174984432


In [543]:
# Fit the model on the training set
rf_pipeline.fit(X_train, y_train)

# Make predictions on the test set
predictions_test = rf_pipeline.predict(X_test)

In [544]:
# Evaluate the model on the test set
rf_mae_test = mean_absolute_error(y_test, predictions_test)
print("Random Forest Test MAE:", rf_mae_test)

Random Forest Test MAE: 75975.01441537018


In [545]:
# Make predictions on the training set
predictions_train = rf_pipeline.predict(X_train)

In [546]:
test_data = {"Actual": y_test,
        "Predicted": predictions_test}
test_df = pd.DataFrame(test_data)

test_df["ratio"] = abs (test_df["Actual"] - test_df["Predicted"]) / test_df["Actual"]
test_df["Difference"] = abs(test_df["Actual"] - test_df["Predicted"])


train_data = {"Actual": y_train,
        "Predicted": predictions_train}
train_df = pd.DataFrame(train_data)

train_df["ratio"] = abs (train_df["Actual"] - train_df["Predicted"]) / train_df["Actual"]
train_df["Difference"] = abs(train_df["Actual"] - train_df["Predicted"])

comparison_df = pd.concat([test_df, train_df], axis = 0)

og_comparison_df = pd.concat([comparison_df, listings_df], axis = 1)

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
W8018028,625000,596032.4,0.046348,28967.571429,https://toronto.listing.ca/2-fieldway-rd-701.W...,2 Fieldway Rd 701,625000,1,1,1,Fieldway Rd,Islington-City Centre West,Toronto,condo_apartment,2024-01-30,43.643148,-79.527023,0.643148,-0.527023
C7240366,1049000,947731.5,0.096538,101268.530476,https://toronto.listing.ca/15-holmes-ave-504.C...,15 Holmes Ave 504,1049000,2,3,0,Holmes Ave,Willowdale East,Toronto,condo_apartment,2024-01-30,43.777961,-79.414241,0.777961,-0.414241
W7254918,589900,561435.6,0.048253,28464.438,https://toronto.listing.ca/25-neighbourhood-la...,25 Neighbourhood Lane 811,589900,1,1,1,Neighbourhood Lane,Stonegate-Queensway,Toronto,condo_apartment,2024-01-30,43.638218,-79.489363,0.638218,-0.489363
W7294830,1395000,1334703.0,0.043223,60296.562,https://toronto.listing.ca/121-mulock-ave.W729...,121 Mulock Ave,1395000,4,3,1,Mulock Ave,Junction Area,Toronto,freehold _townhome,2024-01-30,43.670984,-79.465552,0.670984,-0.465552
C7358908,619900,697321.9,0.124894,77421.947095,https://toronto.listing.ca/330-adelaide-st-703...,330 Adelaide St 703,619900,1,1,1,Adelaide St,Moss Park,Toronto,condo_apartment,2024-01-30,43.647552,-79.392448,0.647552,-0.392448


In [547]:
sorted_comparison = og_comparison_df.sort_values(by="ratio", ascending=False)
sorted_comparison.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
E7387572,699000,1106244.0,0.582609,407243.656,https://toronto.listing.ca/113-mcintosh-st.E73...,113 Mcintosh St,699000,1,2,0,Mcintosh St,Birchcliffe-Cliffside,Toronto,detached_home,2024-01-30,43.706633,-79.252523,0.706633,-0.252523
C7272710,399000,619770.5,0.55331,220770.509293,https://toronto.listing.ca/14-york-st-409.C727...,14 York St 409,399000,1,1,0,York St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.642037,-79.381615,0.642037,-0.381615
E8017516,575000,841658.6,0.463754,266658.631238,https://toronto.listing.ca/485-logan-ave-101.E...,485 Logan Ave 101,575000,2,1,0,Logan Ave,South Riverdale,Toronto,condo_townhome,2024-01-30,43.666422,-79.344712,0.666422,-0.344712
C8029808,548000,795442.9,0.451538,247442.866533,https://toronto.listing.ca/1646-bathurst-st-8....,1646 Bathurst St 8,548000,1,2,0,Bathurst St,Humewood-Cedarvale,Toronto,condo_apartment,2024-01-30,43.693708,-79.422769,0.693708,-0.422769
C7313484,499900,719414.9,0.439118,219514.891981,https://toronto.listing.ca/263-wellington-st-5...,263 Wellington St 505,499900,1,1,1,Wellington St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.644702,-79.390959,0.644702,-0.390959


In [551]:
# Neighbourhood ratios (margin of error)
neigh_comparison = og_comparison_df.groupby("neighbourhood")

neigh_comparison["ratio"].mean().sort_values(ascending=False)

neighbourhood
O'Connor-Parkview          0.129935
Weston-Pellam Park         0.126361
Rouge E11                  0.123152
Cliffcrest                 0.122571
Oakwood Village            0.103221
                             ...   
York University Heights    0.029027
Bendale                    0.026135
Kensington-Chinatown       0.025106
Regent Park                0.020291
Yonge-Eglinton             0.019773
Name: ratio, Length: 80, dtype: float64

In [549]:
sorted_comparison.loc[sorted_comparison["neighbourhood"] == "Corso Italia-Davenport"]

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


In [550]:
sorted_comparison[["price", "beds", "dens"]].max()

price    1999000
beds           4
dens           2
dtype: int64