In [26]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import pickle

In [27]:
file = "listings_with_locations.csv"

file_path = Path("../data/" + file)
listings_df = pd.read_csv(file_path, index_col="mls_id", header=0)


listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034


In [28]:
listings_df = listings_df[(listings_df['baths'] != 0)]
listings_df = listings_df[(listings_df['baths'] <= 5)]

listings_df = listings_df.dropna(subset=["latitude"])

In [29]:
# Function to remove outliers (IQR)
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers for each property type

for i in range(0,4):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'price')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df


for i in range(0,1):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'beds')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df


for i in range(0,1):
    cleaned_df = pd.DataFrame()

    for property_type in listings_df['property_type'].unique():
        subset_df = listings_df[listings_df['property_type'] == property_type]
        cleaned_subset = remove_outliers(subset_df, 'dens')
        cleaned_df = pd.concat([cleaned_df, cleaned_subset])

    listings_df = cleaned_df



In [30]:
listings_df.count()

url              3853
address          3853
price            3853
baths            3853
beds             3853
dens             3853
street           3853
neighbourhood    3853
city             3853
property_type    3853
date_scraped     3853
latitude         3853
longitude        3853
dtype: int64

In [31]:
listings_df['rel_latitude'] = listings_df['latitude'] - 43
listings_df['rel_longitude'] = listings_df['longitude'] + 79

listings_df.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034


In [32]:
# #Choosing 20 as a cutoff value
# neighbourhoods_to_replace = []
# for value, count in listings_df['neighbourhood'].value_counts().items():
#   if count < 20:
#     neighbourhoods_to_replace.append(value)

# # Replace in dataframe
# for app in neighbourhoods_to_replace:
#     listings_df['neighbourhood'] = listings_df['neighbourhood'].replace(app,"Other")

# # Check to make sure binning was successful
# #listings_df['neighbourhood'].value_counts()

In [33]:
cat_columns = ["neighbourhood", "property_type"]
num_columns = ["baths", "beds", "dens"]
y = listings_df["price"]

In [34]:
listings_df_prepared = pd.get_dummies(listings_df[cat_columns], dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Woodbine-Lumsden,neighbourhood_Wychwood,neighbourhood_Yonge-Eglinton,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [35]:
listings_df_prepared[num_columns] = listings_df[num_columns]
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt North,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bathurst Manor,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bayview Woods-Steeles,neighbourhood_Bedford Park-Nortown,...,neighbourhood_Yonge-St. Clair,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold _townhome,baths,beds,dens
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E8018446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,1
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
W7239426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,1
E8030950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,2,2,0
E8030860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


In [36]:
X = listings_df_prepared

In [37]:
scaler = StandardScaler()

housingScaler = scaler.fit(X)


pkl_scaler = "housingScaler.pkl"  

with open(pkl_scaler, 'wb') as file:  
    pickle.dump(housingScaler, file)

In [38]:

X_scaled = housingScaler.transform(X)


In [39]:
# Create a model with scikit-learn
model = LinearRegression()

# Fit the data into the model
model.fit(X_scaled, y)

In [40]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [-1.07763259e+15 -2.17437499e+15 -1.34546001e+15 -1.83949302e+15
 -1.74003885e+15 -8.94107725e+14 -2.81525468e+15 -2.40404823e+15
 -7.62795267e+14 -1.11065349e+15 -8.08961342e+14 -1.37192614e+15
 -1.80040227e+15 -9.71744548e+14 -4.67418413e+14 -8.08961342e+14
 -3.81695097e+14 -4.67418413e+14 -1.26264784e+15 -1.42334297e+15
 -8.08961342e+14 -9.71744548e+14 -8.08961342e+14 -3.81264196e+15
 -9.71744548e+14 -1.80040227e+15 -1.23377857e+15 -5.39658194e+14
 -6.60771865e+14 -4.67418413e+14 -7.62795267e+14 -1.31844837e+15
 -1.34546001e+15 -1.23377857e+15 -2.17437499e+15 -8.08961342e+14
 -8.94107725e+14 -7.13622417e+14 -1.07763259e+15 -1.07763259e+15
 -1.04354922e+15 -1.29085699e+15 -1.20420169e+15 -1.26264784e+15
 -1.39787774e+15 -7.62795267e+14 -2.69934232e+14 -1.20420169e+15
 -9.33743520e+14 -1.11065349e+15 -1.82005779e+15 -9.33743520e+14
 -1.87772989e+15 -8.94107725e+14 -1.07763259e+15 -1.11065349e+15
 -1.14270404e+15 -8.08961342e+14 -9.33743520e+14 -9.71744548e+14
 -1.008295

In [41]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: 917820.2637635693


In [42]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = 917820.2637635693 + -1077632585846491.6X


In [43]:
predicted_y_values = model.predict(X_scaled)

In [44]:
# Create a copy of the original data
df_listings_predicted = listings_df.copy()

# Add a column with the predicted salary values
df_listings_predicted["salary_predicted"] = predicted_y_values

# Display sample data
df_listings_predicted.head()

Unnamed: 0_level_0,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude,salary_predicted
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
E8018446,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861,530250.86786
C7266728,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806,654889.453158
W7239426,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658,586570.86786
E8030950,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159,863059.756376
E8030860,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034,585257.453158


In [45]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [46]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
mae = mean_absolute_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The mean average error is {mae}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is -1.3181137620906996e+26.
The r2 is 0.8071374423289273.
The mean squared error is 51934809366.41039.
The mean average error is 145007.49335078217.
The root mean squared error is 227892.10027205944.
The standard deviation is 518925.8554787994.




In [47]:
test_data = {"Actual": y,
        "Predicted": predicted_y_values}
test_df = pd.DataFrame(test_data)

test_df["ratio"] = abs (test_df["Actual"] - test_df["Predicted"]) / test_df["Actual"]
test_df["Difference"] = abs(test_df["Actual"] - test_df["Predicted"])


og_comparison_df = pd.concat([test_df, listings_df], axis = 1)

#og_comparison_df.head()

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
E8018446,619900,530250.86786,0.144619,89649.13214,https://toronto.listing.ca/286-main-st-911.E80...,286 Main St 911,619900,1,1,1,Main St,East End-Danforth,Toronto,condo_apartment,2024-01-30,43.687469,-79.301861,0.687469,-0.301861
C7266728,529000,654889.453158,0.237976,125889.453158,https://toronto.listing.ca/215-queen-st-606.C7...,215 Queen St 606,529000,1,1,0,Queen St,Waterfront Communities C1,Toronto,condo_apartment,2024-01-30,43.650343,-79.387806,0.650343,-0.387806
W7239426,624900,586570.86786,0.061336,38329.13214,https://toronto.listing.ca/10-park-lawn-rd-140...,10 Park Lawn Rd 1408,624900,1,1,1,Park Lawn Rd,Mimico,Toronto,condo_apartment,2024-01-30,43.622946,-79.481658,0.622946,-0.481658
E8030950,899900,863059.756376,0.040938,36840.243624,https://toronto.listing.ca/665-queen-st-402.E8...,665 Queen St 402,899900,2,2,0,Queen St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.658361,-79.351159,0.658361,-0.351159
E8030860,599900,585257.453158,0.024408,14642.546842,https://toronto.listing.ca/1190-dundas-st-925....,1190 Dundas St 925,599900,1,1,0,Dundas St,South Riverdale,Toronto,condo_apartment,2024-01-30,43.665267,-79.341034,0.665267,-0.341034


In [48]:
sorted_comparison = og_comparison_df.sort_values(by="ratio", ascending=False)
sorted_comparison.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,url,address,price,baths,beds,dens,street,neighbourhood,city,property_type,date_scraped,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
C7053662,6980,2431125.0,347.298782,2424145.0,https://toronto.listing.ca/65-ames-circ.C70536...,65 Ames Circ,6980,5,5,2,Ames Circ,Banbury-Don Mills,Toronto,detached_home,2024-01-30,43.75125,-79.369315,0.75125,-0.369315
C7403144,419900,1149780.0,1.738223,729879.8,https://toronto.listing.ca/3555-bathurst-st-60...,3555 Bathurst St 601,419900,2,2,0,Bathurst St,Bedford Park-Nortown,Toronto,condo_apartment,2024-01-30,43.730588,-79.432202,0.730588,-0.432202
C7338886,278901,683931.9,1.452239,405030.9,https://toronto.listing.ca/45-industrial-st-20...,45 Industrial St 205,278901,1,0,0,Industrial St,Leaside,Toronto,condo_apartment,2024-01-30,43.708822,-79.354625,0.708822,-0.354625
W7337020,944000,2313447.0,1.450685,1369447.0,https://toronto.listing.ca/189-hay-ave.W733702...,189 Hay Ave,944000,5,4,1,Hay Ave,Mimico,Toronto,detached_home,2024-01-30,43.617208,-79.508384,0.617208,-0.508384
E7262068,999999,2088151.0,1.088153,1088152.0,https://toronto.listing.ca/50-freeman-st.E7262...,50 Freeman St,999999,4,5,1,Freeman St,Birchcliffe-Cliffside,Toronto,detached_home,2024-01-30,43.692762,-79.266065,0.692762,-0.266065


In [49]:
neigh_comparison = og_comparison_df.groupby("neighbourhood")

neigh_comparison["ratio"].mean().sort_values(ascending=False)

neighbourhood
Banbury-Don Mills         8.465220
West Humber-Clairville    0.388870
Forest Hill North         0.368278
Bedford Park-Nortown      0.356454
Mount Dennis              0.348217
                            ...   
Broadview North           0.073783
Roncesvalles              0.068965
Greenwood-Coxwell         0.068651
Forest Hill South         0.022056
Ionview                   0.001931
Name: ratio, Length: 143, dtype: float64

In [50]:
listings_df[["price", "beds", "dens", "baths"]].max()

price    3599000
beds           5
dens           5
baths          5
dtype: int64

In [51]:

pkl_model = "housingModel.pkl"  

with open(pkl_model, 'wb') as file:  
    pickle.dump(model, file)