In [41]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
import pickle
import lzma
import psycopg2

In [42]:
def connect_to_database():
    """Open a connection to the PostgreSQL listings database.

    Settings are read from the ``PGHOST``, ``PGUSER``, ``PGPASSWORD``,
    ``PGDATABASE`` and ``PGPORT`` environment variables. When a variable is
    unset, the previous hard-coded local value is used, so existing setups
    keep working without any configuration.

    Returns:
        An open psycopg2 connection; the caller is responsible for closing it.

    Raises:
        ConnectionError: if the settings are invalid or the connection cannot
            be established. The original error is chained as the cause.
    """
    import os  # local import keeps this cell self-contained

    try:
        return psycopg2.connect(
            host=os.environ.get("PGHOST", "localhost"),
            user=os.environ.get("PGUSER", "postgres"),
            # Supply the real password through the environment, not the source.
            password=os.environ.get("PGPASSWORD", "postgres"),
            dbname=os.environ.get("PGDATABASE", "listings_db"),
            port=int(os.environ.get("PGPORT", "5432")),
        )
    except Exception as error:
        # Chain the cause so the underlying driver error stays in the traceback.
        raise ConnectionError(f"Error: Unable to connect to the Database - {str(error)}") from error

In [43]:
# Load the listing columns used for modelling, indexed by MLS id.
query = "SELECT mls_id, price, baths, beds, dens, neighbourhood, city, property_type, latitude, longitude FROM toronto_listings;"

# connect_to_database() raises on failure, so no truthiness check is needed.
connection = connect_to_database()
try:
    listings_df = pd.read_sql(query, connection, index_col="mls_id")
except Exception as error:
    print(f"Error: Unable to fetch from database - {str(error)}")
    # Re-raise: continuing would leave listings_df undefined (or stale from an
    # earlier run) for every cell below.
    raise
finally:
    # Always release the connection, whether or not the read succeeded.
    connection.close()


In [44]:
# Preview the first rows of the freshly loaded listings.
listings_df.head()

Unnamed: 0_level_0,price,baths,beds,dens,neighbourhood,city,property_type,latitude,longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
E8018446,619900,1,1,1,East End-Danforth,Toronto,condo_apartment,43.687469,-79.301861
C7266728,529000,1,1,0,Waterfront Communities C1,Toronto,condo_apartment,43.650343,-79.387806
W7239426,624900,1,1,1,Mimico,Toronto,condo_apartment,43.622946,-79.481658
E8030950,899900,2,2,0,South Riverdale,Toronto,condo_apartment,43.658361,-79.351159
E8030860,599900,1,1,0,South Riverdale,Toronto,condo_apartment,43.665267,-79.341034


In [45]:
# Keep listings with non-zero baths and beds, at most 5 baths, 4 beds and
# 2 dens, and a price no higher than 2,000,000.
has_rooms = listings_df['baths'].ne(0) & listings_df['beds'].ne(0)
within_caps = (
    listings_df['baths'].le(5)
    & listings_df['beds'].le(4)
    & listings_df['dens'].le(2)
    & listings_df['price'].le(2000000)
)
listings_df = listings_df[has_rooms & within_caps]

# Discard listings that have no latitude.
listings_df = listings_df[listings_df["latitude"].notna()]

In [46]:
# Function to remove outliers (IQR)
def remove_outliers(df, column, whisker=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``df[column]``. Values
    exactly on a fence are kept; missing values are dropped because they
    compare as outside the range.

    Args:
        df: DataFrame to filter. It is not modified.
        column: Name of the numeric column used to compute the fences.
        whisker: Multiplier applied to the IQR. Defaults to 1.5.

    Returns:
        A filtered DataFrame that keeps the original index and row order.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = whisker * (q3 - q1)
    return df[values.between(q1 - spread, q3 + spread)]

# Trim price outliers within each neighbourhood, then within each property
# type. The pair of passes is repeated four times; each pass recomputes the
# IQR fences on the rows that survived the previous one.
def _trim_price_outliers_within(df, group_column):
    """Apply ``remove_outliers`` on ``price`` separately per value of ``group_column``."""
    trimmed_groups = [
        remove_outliers(df[df[group_column] == group_value], 'price')
        for group_value in df[group_column].unique()
    ]
    # Concatenate once instead of growing a frame inside the loop. An empty
    # input has no groups and pd.concat rejects an empty list, so hand back an
    # empty frame that still carries the columns.
    return pd.concat(trimmed_groups) if trimmed_groups else df.iloc[0:0]


for _ in range(4):
    listings_df = _trim_price_outliers_within(listings_df, 'neighbourhood')
    listings_df = _trim_price_outliers_within(listings_df, 'property_type')

In [47]:
# Keep only neighbourhoods with more than 10 remaining listings
# (counted through the non-null `city` values of each group).
listings_per_neighbourhood = listings_df.groupby('neighbourhood')['city'].transform('count')
listings_df = listings_df[listings_per_neighbourhood > 10].copy()

In [48]:
# Non-null count per column after cleaning.
listings_df.count()

price            2862
baths            2862
beds             2862
dens             2862
neighbourhood    2862
city             2862
property_type    2862
latitude         2862
longitude        2862
dtype: int64

In [49]:
# Feature engineering: express the coordinates as offsets from 43 degrees
# latitude and -79 degrees longitude (e.g. 43.650343 -> 0.650343).
listings_df['rel_latitude'] = listings_df['latitude'].sub(43)
listings_df['rel_longitude'] = listings_df['longitude'].add(79)

listings_df.head()

Unnamed: 0_level_0,price,baths,beds,dens,neighbourhood,city,property_type,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C7266728,529000,1,1,0,Waterfront Communities C1,Toronto,condo_apartment,43.650343,-79.387806,0.650343,-0.387806
C8030612,1039000,2,2,0,Waterfront Communities C1,Toronto,condo_apartment,43.645471,-79.391166,0.645471,-0.391166
C8030426,999000,2,1,1,Waterfront Communities C1,Toronto,condo_apartment,43.645397,-79.391362,0.645397,-0.391362
C8030300,1037000,2,2,0,Waterfront Communities C1,Toronto,condo_apartment,43.645471,-79.391166,0.645471,-0.391166
C8029936,665000,1,1,1,Waterfront Communities C1,Toronto,condo_apartment,43.646841,-79.402888,0.646841,-0.402888


In [50]:
# Number of distinct values per column.
listings_df.nunique()

price             825
baths               5
beds                4
dens                3
neighbourhood      80
city                1
property_type       4
latitude         1402
longitude        1403
rel_latitude     1402
rel_longitude    1403
dtype: int64

In [51]:
# Listings per neighbourhood, most frequent first.
listings_df['neighbourhood'].value_counts()

neighbourhood
Waterfront Communities C1     327
Church-Yonge Corridor         187
Mimico                        149
Islington-City Centre West    101
Bay Street Corridor           100
                             ... 
Edenbridge-Humber Valley       12
Humewood-Cedarvale             12
Greenwood-Coxwell              12
Yonge-Eglinton                 11
North St. James Town           11
Name: count, Length: 80, dtype: int64

In [52]:
# Columns fed to the model, grouped by how they are encoded, plus the target.
cat_columns = [
    "neighbourhood",
    "property_type",
]
num_columns = [
    "baths",
    "beds",
    "dens",
    "rel_latitude",
    "rel_longitude",
]
y = listings_df["price"]

In [53]:
# One-hot encode the categorical columns as integer 0/1 indicator columns.
categorical_features = listings_df[cat_columns]
listings_df_prepared = pd.get_dummies(categorical_features, dtype=int)
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bendale,neighbourhood_Birchcliffe-Cliffside,neighbourhood_Black Creek,neighbourhood_Brookhaven-Amesbury,...,neighbourhood_Willowdale East,neighbourhood_Willowdale West,neighbourhood_Woburn,neighbourhood_Yonge-Eglinton,neighbourhood_York University Heights,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold_townhome
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8030612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8030426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8030300,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
C8029936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [54]:
# Append the numeric features after the indicator columns.
numeric_features = listings_df[num_columns]
listings_df_prepared[num_columns] = numeric_features
listings_df_prepared.head()

Unnamed: 0_level_0,neighbourhood_Agincourt South-Malvern West,neighbourhood_Alderwood,neighbourhood_Annex,neighbourhood_Banbury-Don Mills,neighbourhood_Bay Street Corridor,neighbourhood_Bayview Village,neighbourhood_Bendale,neighbourhood_Birchcliffe-Cliffside,neighbourhood_Black Creek,neighbourhood_Brookhaven-Amesbury,...,neighbourhood_Yorkdale-Glen Park,property_type_condo_apartment,property_type_condo_townhome,property_type_detached_home,property_type_freehold_townhome,baths,beds,dens,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C7266728,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0.650343,-0.387806
C8030612,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,2,0,0.645471,-0.391166
C8030426,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,1,1,0.645397,-0.391362
C8030300,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,2,0,0.645471,-0.391166
C8029936,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,1,0.646841,-0.402888


In [55]:
# Persist the training column layout so the exact feature order used for
# fitting can be reloaded later.
fit_columns = listings_df_prepared.columns

Path("fit_columns.pkl").write_bytes(pickle.dumps(fit_columns))

In [56]:
# Feature matrix and target used for model training.
X, y = listings_df_prepared, listings_df["price"]

In [57]:
# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [58]:
# Pipeline: standardise the features, then fit a seeded 500-tree random forest.
scaler = StandardScaler()
forest = RandomForestRegressor(n_estimators=500, random_state=1)
rf_pipeline = make_pipeline(scaler, forest)

In [59]:
# 5-fold cross-validation on the training split. The scorer returns negated
# MAE values, so the sign is flipped before printing.
cv_scores = cross_val_score(
    rf_pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)

fold_mae = -cv_scores
print("Cross-Validation MAE Scores:", fold_mae)
print("Mean CV MAE:", -cv_scores.mean())

Cross-Validation MAE Scores: [77623.68889667 81701.95736031 74154.66405997 79421.65399856
 75535.31829726]
Mean CV MAE: 77687.4565225545


In [60]:
# Fit the pipeline on the training split, then predict the held-out test rows
# (fit returns the fitted pipeline, so the two steps chain).
predictions_test = rf_pipeline.fit(X_train, y_train).predict(X_test)

In [61]:
# Persist the fitted pipeline as an LZMA-compressed pickle.
pkl_model = "housingModel_pkl.xz"

with lzma.open(pkl_model, "wb") as compressed_file:
    pickle.dump(rf_pipeline, compressed_file)

In [62]:
# Mean absolute error of the predictions on the held-out test set.
rf_mae_test = mean_absolute_error(y_true=y_test, y_pred=predictions_test)
print("Random Forest Test MAE:", rf_mae_test)

Random Forest Test MAE: 76558.3934641126


In [63]:
# Predict the training rows as well, so in-sample errors can be inspected
# alongside the test-set errors.
predictions_train = rf_pipeline.predict(X_train)

In [64]:
def _prediction_errors(actual, predicted):
    """Build a per-listing error table from actual and predicted prices.

    Args:
        actual: Series of true prices; its index becomes the table index.
        predicted: Predictions in the same row order as ``actual``.

    Returns:
        DataFrame with columns ``Actual``, ``Predicted``, ``ratio`` (absolute
        error divided by the actual price) and ``Difference`` (absolute error).
    """
    errors_df = pd.DataFrame({"Actual": actual, "Predicted": predicted})
    # Compute the absolute error once and reuse it for both derived columns.
    abs_error = (errors_df["Actual"] - errors_df["Predicted"]).abs()
    errors_df["ratio"] = abs_error / errors_df["Actual"]
    errors_df["Difference"] = abs_error
    return errors_df


test_df = _prediction_errors(y_test, predictions_test)
train_df = _prediction_errors(y_train, predictions_train)

# Stack test rows first, then training rows.
comparison_df = pd.concat([test_df, train_df], axis=0)

# Attach the listing attributes to each error row; rows are matched on the index.
og_comparison_df = pd.concat([comparison_df, listings_df], axis=1)

og_comparison_df.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,price,baths,beds,dens,neighbourhood,city,property_type,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
W8018028,625000,596297.5,0.045924,28702.504762,625000,1,1,1,Islington-City Centre West,Toronto,condo_apartment,43.643148,-79.527023,0.643148,-0.527023
C7240366,1049000,947847.7,0.096427,101152.316876,1049000,2,3,0,Willowdale East,Toronto,condo_apartment,43.777961,-79.414241,0.777961,-0.414241
W7254918,589900,561537.4,0.04808,28362.636,589900,1,1,1,Stonegate-Queensway,Toronto,condo_apartment,43.638218,-79.489363,0.638218,-0.489363
W7294830,1395000,1336915.0,0.041638,58085.056,1395000,4,3,1,Junction Area,Toronto,freehold_townhome,43.670984,-79.465552,0.670984,-0.465552
C7358908,619900,697596.5,0.125337,77696.460095,619900,1,1,1,Moss Park,Toronto,condo_apartment,43.647552,-79.392448,0.647552,-0.392448


In [65]:
# Rank listings from largest to smallest relative error.
sorted_comparison = og_comparison_df.sort_values("ratio", ascending=False)
sorted_comparison.head()

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,price,baths,beds,dens,neighbourhood,city,property_type,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E7387572,699000,1111590.0,0.590257,412589.912,699000,1,2,0,Birchcliffe-Cliffside,Toronto,detached_home,43.706633,-79.252523,0.706633,-0.252523
W7300894,849000,1339292.0,0.577493,490291.67,849000,2,3,0,New Toronto,Toronto,detached_home,43.606302,-79.507441,0.606302,-0.507441
C7272710,399000,619693.7,0.553117,220693.747111,399000,1,1,0,Waterfront Communities C1,Toronto,condo_apartment,43.642037,-79.381615,0.642037,-0.381615
C8029808,548000,796064.8,0.452673,248064.7844,548000,1,2,0,Humewood-Cedarvale,Toronto,condo_apartment,43.693708,-79.422769,0.693708,-0.422769
E8017516,575000,830512.8,0.44437,255512.780571,575000,2,1,0,South Riverdale,Toronto,condo_townhome,43.666422,-79.344712,0.666422,-0.344712


In [66]:
# Mean relative error per neighbourhood, worst first.
# NOTE(review): og_comparison_df holds both training and test rows, so these
# averages mix in-sample and held-out errors.
neigh_comparison = og_comparison_df.groupby("neighbourhood")

mean_ratio_by_neighbourhood = neigh_comparison["ratio"].mean()
mean_ratio_by_neighbourhood.sort_values(ascending=False)

neighbourhood
New Toronto                0.134948
O'Connor-Parkview          0.127962
Weston-Pellam Park         0.126589
Cliffcrest                 0.121584
Rouge E11                  0.120771
                             ...   
York University Heights    0.028236
Bendale                    0.025901
Kensington-Chinatown       0.024942
Regent Park                0.020295
Yonge-Eglinton             0.019805
Name: ratio, Length: 80, dtype: float64

In [67]:
# Inspect the error rows for a single neighbourhood.
# The recorded output contains no rows for this name.
sorted_comparison.loc[sorted_comparison["neighbourhood"] == "Corso Italia-Davenport"]

Unnamed: 0_level_0,Actual,Predicted,ratio,Difference,price,baths,beds,dens,neighbourhood,city,property_type,latitude,longitude,rel_latitude,rel_longitude
mls_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [68]:
# Largest price, beds and dens values among the compared listings.
sorted_comparison[["price", "beds", "dens"]].max()

price    1999000
beds           4
dens           2
dtype: int64