In [369]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [370]:
file = "listings_with_locations.csv"

file_path = Path("data/" + file)
listings_df = pd.read_csv(file_path, index_col="mls_id", header=0)


listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034


In [371]:
listings_df = listings_df[(listings_df['baths'] != 0)]
listings_df = listings_df[(listings_df['baths'] <= 5)]

listings_df = listings_df.dropna(subset=["latitude"])

In [372]:
# Function to remove outliers (IQR)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers for each property type

for i in range(0,4):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'price')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df


for i in range(0,1):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'beds')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df


for i in range(0,1):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'dens')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df



In [373]:
listings_df.count()

url              3853
address          3853
price            3853
baths            3853
beds             3853
dens             3853
street           3853
neighbourhood    3853
city             3853
property_type    3853
date_scraped     3853
latitude         3853
longitude        3853
dtype: int64

In [374]:
listings_df['rel_latitude'] = listings_df['latitude'] - 43
listings_df['rel_longitude'] = listings_df['longitude'] + 79

listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034


In [375]:
# #Choosing 20 as a cutoff value
# neighbourhoods_to_replace = []
# for value, count in listings_df['neighbourhood'].value_counts().items():
#   if count < 20:
#     neighbourhoods_to_replace.append(value)

# # Replace in dataframe
# for app in neighbourhoods_to_replace:
#     listings_df['neighbourhood'] = listings_df['neighbourhood'].replace(app,"Other")

# # Check to make sure binning was successful
# #listings_df['neighbourhood'].value_counts()

In [376]:
cat_columns = ["neighbourhood", "property_type"]
num_columns = ["baths", "beds", "dens", "rel_latitude", "rel_longitude"]
y = listings_df["price"]

In [377]:
listings_df_prepared = pd.get_dummies(listings_df[cat_columns], dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Woodbine-Lumsden,neighbourhood_Wychwood,neighbourhood_Yonge-Eglinton,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [378]:
listings_df_prepared[num_columns] = listings_df[num_columns]
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome,baths,beds,dens
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,1
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,1
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,2,2,0
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


In [379]:
X = listings_df_prepared

In [380]:
# Create a model with scikit-learn
model = LinearRegression()

# Fit the data into the model
model.fit(X, y)

In [381]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [3.86386975e+12 3.86386984e+12 3.86386988e+12 3.86387023e+12
 3.86387008e+12 3.86386995e+12 3.86387014e+12 3.86387012e+12
 3.86387003e+12 3.86387033e+12 3.86386998e+12 3.86386987e+12
 3.86386994e+12 3.86386961e+12 3.86387004e+12 3.86386976e+12
 3.86387002e+12 3.86387012e+12 3.86386978e+12 3.86387010e+12
 3.86386959e+12 3.86387034e+12 3.86386950e+12 3.86387009e+12
 3.86386969e+12 3.86386992e+12 3.86386981e+12 3.86386970e+12
 3.86386980e+12 3.86386984e+12 3.86387005e+12 3.86386997e+12
 3.86386974e+12 3.86386996e+12 3.86386977e+12 3.86387001e+12
 3.86386998e+12 3.86387000e+12 3.86387009e+12 3.86386980e+12
 3.86386970e+12 3.86386995e+12 3.86386984e+12 3.86386991e+12
 3.86386984e+12 3.86387029e+12 3.86387018e+12 3.86386978e+12
 3.86386981e+12 3.86386981e+12 3.86387000e+12 3.86387014e+12
 3.86387008e+12 3.86386957e+12 3.86386984e+12 3.86386990e+12
 3.86386974e+12 3.86386964e+12 3.86386980e+12 3.86386974e+12
 3.86387014e+12 3.86386988e+12 3.86387001e+12 3.86387001e+12
 3.863869

In [382]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: -1.4715416261653837e+17


In [383]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = -1.4715416261653837e+17 + 3863869754077.867X


In [384]:
predicted_y_values = model.predict(X)

In [385]:
# Create a copy of the original data
df_listings_predicted = listings_df.copy()

# Add a column with the predicted salary values
df_listings_predicted["salary_predicted"] = predicted_y_values

# Display sample data
df_listings_predicted.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude,salary_predicted
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861,525152.0
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806,653632.0
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658,593152.0
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159,865888.0
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034,586624.0


In [386]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [387]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
mae = mean_absolute_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The mean average error is {mae}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.8075145038027786.
The r2 is 0.8075145038027786.
The mean squared error is 51833272728.08954.
The mean average error is 144920.67246301583.
The root mean squared error is 227669.2177877579.
The standard deviation is 518925.8554787994.


In [388]:
test_data = {"Actual": y,
        "Predicted": predicted_y_values}
test_df = pd.DataFrame(test_data)

test_df["ratio"] = abs (test_df["Actual"] - test_df["Predicted"]) / test_df["Actual"]
test_df["Difference"] = abs(test_df["Actual"] - test_df["Predicted"])


og_comparison_df = pd.concat([test_df, listings_df], axis = 1)

#og_comparison_df.head()

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
E8018446,619900,525152.0,0.152844,94748.0,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861
C7266728,529000,653632.0,0.235599,124632.0,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
W7239426,624900,593152.0,0.050805,31748.0,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658
E8030950,899900,865888.0,0.037795,34012.0,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159
E8030860,599900,586624.0,0.02213,13276.0,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034


In [389]:
sorted_comparison = og_comparison_df.sort_values(by="ratio", ascending=False)
sorted_comparison.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C7053662,6980,2436704.0,348.097994,2429724.0,https://toronto.listing.ca/65-ames-circ.C70536...,65 Ames Circ,6980,5,5,2,Ames Circ,Banbury-Don Mills,Toronto,detached_home,2024-01-30,43.75125,-79.369315,0.75125,-0.369315
C7403144,419900,1154304.0,1.748997,734404.0,https://toronto.listing.ca/3555-bathurst-st-60...,3555 Bathurst St 601,419900,2,2,0,Bathurst St,Bedford Park-Nortown,Toronto,condo_apartment,2024-01-30,43.730588,-79.432202,0.730588,-0.432202
W7337020,944000,2322080.0,1.459831,1378080.0,https://toronto.listing.ca/189-hay-ave.W733702...,189 Hay Ave,944000,5,4,1,Hay Ave,Mimico,Toronto,detached_home,2024-01-30,43.617208,-79.508384,0.617208,-0.508384
C7338886,278901,680064.0,1.438371,401163.0,https://toronto.listing.ca/45-industrial-st-20...,45 Industrial St 205,278901,1,0,0,Industrial St,Leaside,Toronto,condo_apartment,2024-01-30,43.708822,-79.354625,0.708822,-0.354625
C8020038,389000,827808.0,1.128041,438808.0,https://toronto.listing.ca/2550-bathurst-st-10...,2550 Bathurst St 109,389000,1,1,0,Bathurst St,Forest Hill North,Toronto,condo_apartment,2024-01-30,43.706646,-79.427185,0.706646,-0.427185


In [390]:
neigh_comparison = og_comparison_df.groupby("neighbourhood")

neigh_comparison["ratio"].mean().sort_values(ascending=False)

neighbourhood
Banbury-Don Mills           8.484332
West Humber-Clairville      0.387880
Forest Hill North           0.384157
Bedford Park-Nortown        0.358968
Playter Estates-Danforth    0.344892
                              ...   
Rustic                      0.079265
Greenwood-Coxwell           0.068563
Roncesvalles                0.067923
Ionview                     0.002218
Forest Hill South           0.000007
Name: ratio, Length: 143, dtype: float64

In [391]:
listings_df[["price", "beds", "dens", "baths"]].max()

price    3599000
beds           5
dens           5
baths          5
dtype: int64