In [153]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [154]:
file = "listings_with_locations.csv"

file_path = Path("data/" + file)
listings_df = pd.read_csv(file_path, index_col="mls_id", header=0)


listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034


In [155]:
listings_df = listings_df[(listings_df['baths'] != 0)]

listings_df = listings_df.dropna(subset=["latitude"])

In [156]:
listings_df['rel_latitude'] = listings_df['latitude'] - 43
listings_df['rel_longitude'] = listings_df['longitude'] + 79

listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034


In [157]:
# #Choosing 500 as a cutoff value
# neighbourhoods_to_replace = []
# for value, count in listings_df['neighbourhood'].value_counts().items():
#   if count < 20:
#     neighbourhoods_to_replace.append(value)

# # Replace in dataframe
# for app in neighbourhoods_to_replace:
#     listings_df['neighbourhood'] = listings_df['neighbourhood'].replace(app,"Other")

# # Check to make sure binning was successful
# #listings_df['neighbourhood'].value_counts()

In [158]:
cat_columns = ["neighbourhood", "property_type"]
num_columns = ["baths", "beds", "dens", "rel_latitude", "rel_longitude"]
y = listings_df["price"]

In [159]:
listings_df_prepared = pd.get_dummies(listings_df[cat_columns], dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Woodbine-Lumsden,neighbourhood_Wychwood,neighbourhood_Yonge-Eglinton,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [160]:
listings_df_prepared[num_columns] = listings_df[num_columns]
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome,baths,beds,dens,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.687469,-0.301861
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.650343,-0.387806
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.622946,-0.481658
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,2,0,0.658361,-0.351159
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.665267,-0.341034


In [161]:
X = listings_df_prepared

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [163]:
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [164]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

rf_model = rf_model.fit(X_train_scaled, y_train)

In [165]:
predictions_train = rf_model.predict(X_train_scaled)

In [166]:
predictions_test = rf_model.predict(X_test_scaled)

In [167]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [168]:

rf_mse_train = mean_squared_error(y_train, predictions_train)
rf_mae_train = mean_absolute_error(y_train, predictions_train)

rf_mse_test = mean_squared_error(y_test, predictions_test)
rf_mae_test = mean_absolute_error(y_test, predictions_test)

In [169]:
from math import sqrt

In [170]:
print("Random Forest Train mse = ",rf_mse_train," & mae = ",rf_mae_train," & rmse = ", sqrt(rf_mse_train))

Random Forest Train mse =  16249864466.641062  & mae =  16554.669302456932  & rmse =  127474.9562331404


In [171]:
print("Random Forest Test mse = ",rf_mse_test," & mae = ",rf_mae_test," & rmse = ", sqrt(rf_mse_test))

Random Forest Test mse =  2534276591987.4507  & mae =  446409.4149026249  & rmse =  1591941.1396114652


In [172]:
test_data = {"Actual": y_test,
        "Predicted": predictions_test}
test_df = pd.DataFrame(test_data)

test_df["ratio"] = abs (test_df["Actual"] - test_df["Predicted"]) / test_df["Actual"]
test_df["Difference"] = abs(test_df["Actual"] - test_df["Predicted"])


train_data = {"Actual": y_train,
        "Predicted": predictions_train}
train_df = pd.DataFrame(train_data)

train_df["ratio"] = abs (train_df["Actual"] - train_df["Predicted"]) / train_df["Actual"]
train_df["Difference"] = abs(train_df["Actual"] - train_df["Predicted"])

comparison_df = pd.concat([test_df, train_df], axis = 0)

og_comparison_df = pd.concat([comparison_df, listings_df], axis = 1)

#og_comparison_df.head()

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C7385816,1348000,5500000,3.080119,4152000,https://toronto.listing.ca/70-hargrave-lane-7....,70 Hargrave Lane 7,1348000,3,3,0,Hargrave Lane,Bridle Path-Sunnybrook-York Mills,Toronto,condo_townhome,2024-01-30,43.722366,-79.379318,0.722366,-0.379318
C7384564,825000,1275000,0.545455,450000,https://toronto.listing.ca/120-bayview-ave-n10...,120 Bayview Ave N1009,825000,2,2,0,Bayview Ave,Waterfront Communities C8,Toronto,condo_apartment,2024-01-30,43.72584,-79.38058,0.72584,-0.38058
C7375126,995888,938000,0.058127,57888,https://toronto.listing.ca/887-bay-st-1106.C73...,887 Bay St 1106,995888,2,2,0,Bay St,Bay Street Corridor,Toronto,condo_apartment,2024-01-30,43.662983,-79.386292,0.662983,-0.386292
C7267212,649000,570000,0.121726,79000,https://toronto.listing.ca/33-helendale-ave-21...,33 Helendale Ave 2101,649000,1,2,0,Helendale Ave,Yonge-Eglinton,Toronto,condo_apartment,2024-01-30,43.708715,-79.399383,0.708715,-0.399383
E7300824,499500,425000,0.149149,74500,https://toronto.listing.ca/20-gilder-dr-905.E7...,20 Gilder Dr 905,499500,1,2,0,Gilder Dr,Eglinton East,Toronto,condo_apartment,2024-01-30,43.736498,-79.257993,0.736498,-0.257993


In [178]:
sorted_comparison = og_comparison_df.sort_values(by="ratio", ascending=False)
sorted_comparison.head(20)

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C7402374,1144800,6645000,4.804507,5500200,https://toronto.listing.ca/250-lawrence-ave-90...,250 Lawrence Ave 907,1144800,2,2,0,Lawrence Ave,Lawrence Park North,Toronto,condo_apartment,2024-01-30,43.7227,-79.414192,0.7227,-0.414192
C8021488,999000,5500000,4.505506,4501000,https://toronto.listing.ca/70-hargrave-lane-12...,70 Hargrave Lane 12,999000,3,3,1,Hargrave Lane,Bridle Path-Sunnybrook-York Mills,Toronto,condo_townhome,2024-01-30,43.722366,-79.379318,0.722366,-0.379318
C7063270,2125000,9500000,3.470588,7375000,https://toronto.listing.ca/300-front-st-4404.C...,300 Front St 4404,2125000,3,2,1,Front St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.643985,-79.389342,0.643985,-0.389342
C7385816,1348000,5500000,3.080119,4152000,https://toronto.listing.ca/70-hargrave-lane-7....,70 Hargrave Lane 7,1348000,3,3,0,Hargrave Lane,Bridle Path-Sunnybrook-York Mills,Toronto,condo_townhome,2024-01-30,43.722366,-79.379318,0.722366,-0.379318
C5939643,7500000,28000000,2.733333,20500000,https://toronto.listing.ca/19-royal-oak-dr.C59...,19 Royal Oak Dr,7500000,9,7,0,Royal Oak Dr,Bridle Path-Sunnybrook-York Mills,Toronto,detached_home,2024-01-30,43.72797,-79.361178,0.72797,-0.361178
C8022942,2395000,8325000,2.475992,5930000,https://toronto.listing.ca/111-st-clair-ave-81...,111 St Clair Ave 819,2395000,4,3,0,St Clair Ave,Yonge-St. Clair,Toronto,condo_apartment,2024-01-30,43.686798,-79.399072,0.686798,-0.399072
C7259238,1685000,5590000,2.317507,3905000,https://toronto.listing.ca/102-bloor-st-ph6.C7...,102 Bloor St Ph6,1685000,3,2,0,Bloor St,Annex,Toronto,condo_apartment,2024-01-30,43.669402,-79.391644,0.669402,-0.391644
C7045680,1499000,4595000,2.065377,3096000,https://toronto.listing.ca/1001-bay-st-ph02.C7...,1001 Bay St Ph02,1499000,3,2,1,Bay St,Bay Street Corridor,Toronto,condo_apartment,2024-01-30,43.66536,-79.387202,0.66536,-0.387202
C7275918,5399000,16500000,2.056122,11101000,https://toronto.listing.ca/111-roxborough-dr.C...,111 Roxborough Dr,5399000,4,4,1,Roxborough Dr,Rosedale-Moore Park,Toronto,detached_home,2024-01-30,43.682277,-79.376451,0.682277,-0.376451
C7302848,7880000,23500000,1.982234,15620000,https://toronto.listing.ca/311-bay-st-4801.C73...,311 Bay St 4801,7880000,5,3,0,Bay St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.649755,-79.380323,0.649755,-0.380323


In [173]:
neigh_comparison = og_comparison_df.groupby("neighbourhood")

neigh_comparison["ratio"].mean().sort_values(ascending=False)

neighbourhood
Lawrence Park North                  0.542845
Bridle Path-Sunnybrook-York Mills    0.534552
Yonge-St. Clair                      0.271191
Woodbine Corridor                    0.166578
Lawrence Park South                  0.163018
                                       ...   
Keelesdale-Eglinton West             0.000000
Blake-Jones                          0.000000
Rexdale-Kipling                      0.000000
Playter Estates-Danforth             0.000000
Pleasant View                        0.000000
Name: ratio, Length: 143, dtype: float64

In [174]:
annex_df = og_comparison_df.loc[og_comparison_df['neighbourhood']=="Annex", :]

annex_df["price"].max()

9999000