In [87]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [88]:
file = "listings_with_locations.csv"

file_path = Path("data/" + file)
listings_df = pd.read_csv(file_path, index_col="mls_id", header=0)


listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034


In [89]:
listings_df = listings_df[(listings_df['baths'] != 0)]
listings_df = listings_df[(listings_df['baths'] <= 5)]

listings_df = listings_df.dropna(subset=["latitude"])

In [90]:
# Function to remove outliers (IQR)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers for each property type

for i in range(0,4):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'price')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df


for i in range(0,1):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'beds')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df


for i in range(0,1):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'dens')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df




In [91]:
listings_df.count()

url              3853
address          3853
price            3853
baths            3853
beds             3853
dens             3853
street           3853
neighbourhood    3853
city             3853
property_type    3853
date_scraped     3853
latitude         3853
longitude        3853
dtype: int64

In [92]:
listings_df['rel_latitude'] = listings_df['latitude'] - 43
listings_df['rel_longitude'] = listings_df['longitude'] + 79

listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034


In [93]:
# #Choosing 30 as a cutoff value
# neighbourhoods_to_replace = []
# for value, count in listings_df['neighbourhood'].value_counts().items():
#   if count < 30:
#     neighbourhoods_to_replace.append(value)

# # Replace in dataframe
# for app in neighbourhoods_to_replace:
#     listings_df['neighbourhood'] = listings_df['neighbourhood'].replace(app,"Other")

# # Check to make sure binning was successful
# #listings_df['neighbourhood'].value_counts()

In [94]:
listings_df.nunique()

url              3853
address          3848
price            1150
baths               5
beds                6
dens                6
street           1213
neighbourhood     143
city                1
property_type       4
date_scraped        1
latitude         2198
longitude        2199
rel_latitude     2198
rel_longitude    2199
dtype: int64

In [95]:
cat_columns = ["neighbourhood", "property_type"]
num_columns = ["baths", "beds", "dens", "rel_latitude", "rel_longitude"]
y = listings_df["price"]

In [96]:
listings_df_prepared = pd.get_dummies(listings_df[cat_columns], dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Woodbine-Lumsden,neighbourhood_Wychwood,neighbourhood_Yonge-Eglinton,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [97]:
listings_df_prepared[num_columns] = listings_df[num_columns]
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome,baths,beds,dens,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.687469,-0.301861
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.650343,-0.387806
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.622946,-0.481658
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,2,0,0.658361,-0.351159
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.665267,-0.341034


In [98]:
X = listings_df_prepared

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [100]:
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [101]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

rf_model = rf_model.fit(X_train_scaled, y_train)

In [102]:
predictions_train = rf_model.predict(X_train_scaled)

In [103]:
predictions_test = rf_model.predict(X_test_scaled)

In [104]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [105]:

rf_mse_train = mean_squared_error(y_train, predictions_train)
rf_mae_train = mean_absolute_error(y_train, predictions_train)

rf_mse_test = mean_squared_error(y_test, predictions_test)
rf_mae_test = mean_absolute_error(y_test, predictions_test)

In [106]:
from math import sqrt

In [107]:
print("Random Forest Train mse = ",rf_mse_train," & mae = ",rf_mae_train," & rmse = ", sqrt(rf_mse_train))

Random Forest Train mse =  1303224237.9103496  & mae =  9896.183108341987  & rmse =  36100.19720043576


In [108]:
print("Random Forest Test mse = ",rf_mse_test," & mae = ",rf_mae_test," & rmse = ", sqrt(rf_mse_test))

Random Forest Test mse =  85196689307.87137  & mae =  155688.0580912863  & rmse =  291884.71920926485


In [109]:
test_data = {"Actual": y_test,
        "Predicted": predictions_test}
test_df = pd.DataFrame(test_data)

test_df["ratio"] = abs (test_df["Actual"] - test_df["Predicted"]) / test_df["Actual"]
test_df["Difference"] = abs(test_df["Actual"] - test_df["Predicted"])


train_data = {"Actual": y_train,
        "Predicted": predictions_train}
train_df = pd.DataFrame(train_data)

train_df["ratio"] = abs (train_df["Actual"] - train_df["Predicted"]) / train_df["Actual"]
train_df["Difference"] = abs(train_df["Actual"] - train_df["Predicted"])

comparison_df = pd.concat([test_df, train_df], axis = 0)

og_comparison_df = pd.concat([comparison_df, listings_df], axis = 1)

#og_comparison_df.head()

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C7379048,849900,877000,0.031886,27100,https://toronto.listing.ca/871-sheppard-ave-10...,871 Sheppard Ave 107,849900,3,2,0,Sheppard Ave,Clanton Park,Toronto,condo_townhome,2024-01-30,43.82406,-79.108054,0.82406,-0.108054
W8031016,529900,359900,0.320815,170000,https://toronto.listing.ca/830-lawrence-ave-26...,830 Lawrence Ave 2609,529900,1,1,0,Lawrence Ave,Glenfield-Jane Heights,Toronto,condo_apartment,2024-01-30,43.714723,-79.454414,0.714723,-0.454414
E7383362,1199000,1080000,0.099249,119000,https://toronto.listing.ca/24-good-rd.E7383362...,24 Good Rd,1199000,4,3,0,Good Rd,Highland Creek,Toronto,detached_home,2024-01-30,43.794732,-79.184112,0.794732,-0.184112
C6691040,529900,465000,0.122476,64900,https://toronto.listing.ca/715-don-mills-rd-26...,715 Don Mills Rd 2606,529900,1,2,1,Don Mills Rd,Flemingdon Park,Toronto,condo_apartment,2024-01-30,43.710166,-79.334241,0.710166,-0.334241
W7265218,721600,588888,0.183914,132712,https://toronto.listing.ca/859-the-queensway-n...,859 The Queensway N/A 212,721600,1,1,1,The Queensway N/A,Stonegate-Queensway,Toronto,condo_apartment,2024-01-30,43.624217,-79.510284,0.624217,-0.510284


In [110]:
sorted_comparison = og_comparison_df.sort_values(by="ratio", ascending=False)
sorted_comparison.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
E8020546,949000,2500000,1.634352,1551000,https://toronto.listing.ca/914-broadview-ave.E...,914 Broadview Ave,949000,2,3,0,Broadview Ave,Playter Estates-Danforth,Toronto,freehold _townhome,2024-01-30,43.68056,-79.358018,0.68056,-0.358018
W8021980,1199000,2799000,1.334445,1600000,https://toronto.listing.ca/7-lorahill-rd.W8021...,7 Lorahill Rd,1199000,2,3,1,Lorahill Rd,Stonegate-Queensway,Toronto,detached_home,2024-01-30,43.638038,-79.50874,0.638038,-0.50874
C7280080,1559000,3500000,1.245029,1941000,https://toronto.listing.ca/227-ava-rd.C7280080...,227 Ava Rd,1559000,2,3,2,Ava Rd,Humewood-Cedarvale,Toronto,detached_home,2024-01-30,43.69545,-79.435762,0.69545,-0.435762
W7026116,598800,1199900,1.003841,601100,https://toronto.listing.ca/630-rogers-rd-10.W7...,630 Rogers Rd 10,598800,2,2,0,Rogers Rd,Keelesdale-Eglinton West,Toronto,condo_townhome,2024-01-30,43.682079,-79.474374,0.682079,-0.474374
W7406190,1499900,2999900,1.000067,1500000,https://toronto.listing.ca/6-bonnyview-dr.W740...,6 Bonnyview Dr,1499900,2,3,1,Bonnyview Dr,Stonegate-Queensway,Toronto,detached_home,2024-01-30,43.629187,-79.492364,0.629187,-0.492364


In [111]:
neigh_comparison = og_comparison_df.groupby("neighbourhood")

neigh_comparison["ratio"].mean().sort_values(ascending=False)

neighbourhood
Playter Estates-Danforth         0.817176
Keelesdale-Eglinton West         0.337283
Pleasant View                    0.250417
Weston-Pellam Park               0.217829
Humewood-Cedarvale               0.163358
                                   ...   
Thistletown-Beaumonde Heights    0.000000
Woodbine Corridor                0.000000
Forest Hill South                0.000000
Wychwood                         0.000000
Rustic                           0.000000
Name: ratio, Length: 143, dtype: float64

In [112]:
sorted_comparison.loc[sorted_comparison["neighbourhood"] == "Banbury-Don Mills"]

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C7345696,849000,599000,0.294464,250000,https://toronto.listing.ca/18-concorde-pl-528....,18 Concorde Pl 528,849000,2,2,0,Concorde Pl,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.731426,-79.328426,0.731426,-0.328426
C7399662,829999,999000,0.203616,169001,https://toronto.listing.ca/181-wynford-dr-2409...,181 Wynford Dr 2409,829999,2,3,0,Wynford Dr,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.725089,-79.326078,0.725089,-0.326078
C7386200,2999990,3588000,0.196004,588010,https://toronto.listing.ca/4-chipstead-rd.C738...,4 Chipstead Rd,2999990,5,3,1,Chipstead Rd,Banbury-Don Mills,Toronto,detached_home,2024-01-30,43.748589,-79.372903,0.748589,-0.372903
C8022108,739000,869000,0.175913,130000,https://toronto.listing.ca/205-the-don-way-527...,205 The Don Way 527,739000,4,2,0,The Don Way,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.737737,-79.345743,0.737737,-0.345743
C8028886,749000,621888,0.169709,127112,https://toronto.listing.ca/181-wynford-dr-711....,181 Wynford Dr 711,749000,2,2,0,Wynford Dr,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.725089,-79.326078,0.725089,-0.326078
C8026230,539000,608000,0.128015,69000,https://toronto.listing.ca/120-dallimore-circ-...,120 Dallimore Circ 124,539000,1,1,1,Dallimore Circ,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.730057,-79.332332,0.730057,-0.332332
C7362550,699000,780000,0.11588,81000,https://toronto.listing.ca/5-concorde-pl-3102....,5 Concorde Pl 3102,699000,2,2,1,Concorde Pl,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.729936,-79.327304,0.729936,-0.327304
C8031008,580000,524900,0.095,55100,https://toronto.listing.ca/181-wynford-dr-1201...,181 Wynford Dr 1201,580000,1,1,0,Wynford Dr,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.725089,-79.326078,0.725089,-0.326078
C7200462,689900,634900,0.079722,55000,https://toronto.listing.ca/99-the-donway-west-...,99 The Donway West N/A 525,689900,2,1,1,The Donway West N/A,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.738936,-79.347056,0.738936,-0.347056
C7298294,598900,642900,0.073468,44000,https://toronto.listing.ca/20-oneill-rd-233.C7...,20 O'neill Rd 233,598900,1,1,0,O'neill Rd,Banbury-Don Mills,Toronto,condo_apartment,2024-01-30,43.732172,-79.344321,0.732172,-0.344321


In [113]:
sorted_comparison[["price", "beds", "dens"]].max()

price    3599000
beds           5
dens           5
dtype: int64