In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
from itertools import product
import math
import time
import json
import os
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

from helper_functions import utils
from IPython.display import display
import dataframe_image as dfi
import joblib # for save pipeline

# for display plot inline
%matplotlib inline
# change the style
matplotlib.style.use('ggplot')

import time
import json
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sn
# for visualizing missing values
import missingno as msno
from pathlib import Path
import os
import dataframe_image as dfi
import joblib # for save xgboost model

from helper_functions import utils
from helper_functions import preprocessing

# How enable_cagegorical works: https://xgboost.readthedocs.io/en/stable/tutorials/categorical.html
from xgboost import XGBRegressor
from IPython.display import display
import dataframe_image as dfi

from sklearn.metrics import mean_absolute_error, mean_squared_error

# for display plot inline
%matplotlib inline
# change the style
matplotlib.style.use('ggplot')


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Using device: cuda


In [2]:
dataset_f = './data/2024-01_Residential_extra.csv'
features_f = './features.json'
output_dir = "./results/"

model_location = "./models/train_2023/"

mlp_pipeline = joblib.load(os.path.join(model_location, f"pipeline.pkl"))
mlp = torch.jit.load(os.path.join(model_location, f"mlp_model.pt"))
mlp.eval()

RecursiveScriptModule(
  original_name=MLPModel
  (relu): RecursiveScriptModule(original_name=ReLU)
  (features): RecursiveScriptModule(
    original_name=Sequential
    (0): RecursiveScriptModule(original_name=Linear)
    (1): RecursiveScriptModule(original_name=ReLU)
    (2): RecursiveScriptModule(original_name=Dropout)
    (3): RecursiveScriptModule(original_name=Linear)
    (4): RecursiveScriptModule(original_name=ReLU)
    (5): RecursiveScriptModule(original_name=Dropout)
    (6): RecursiveScriptModule(original_name=Linear)
  )
  (output): RecursiveScriptModule(original_name=Linear)
)

In [3]:
def model_prediction(data_train_x, model, pipeline):
    
    # predict
    X_train_transformed = pipeline.transform(data_train_x)
    X_train_tensor = torch.tensor(X_train_transformed.toarray(), dtype=torch.float32)
    model.eval()
    model.to(device)
    model_pred = model(X_train_tensor.to(device))
    model_pred = model_pred.view(-1).cpu().detach().numpy()

    return model_pred

In [4]:
def select_features(dataset, metadata):
    numerics_int = metadata["features"]["integer"]
    numerics_float = metadata["features"]["float"]
    numerics_bool = metadata["features"]["boolean"]
    categories = metadata["features"]["category"]
    
    for num in numerics_float:
        dataset[num] = dataset[num].fillna(0).astype(float)
        
    for num in numerics_int:
        dataset[num] = dataset[num].fillna(0).round().astype('int64')
    
    for num in numerics_bool:
        dataset[num] = dataset[num].astype('bool')
    
    for category in categories:
        dataset[category] = dataset[category].astype("category")
    
    features = []
    for k,v in metadata["features"].items():
        features.extend(v)
    
        
    return dataset[features]


In [5]:

with open(features_f, "r") as f:
    metadata = json.loads(f.read())

output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(dataset_f)
df.set_index('Ml_num', inplace = True)
df = df.sort_values(by='Cd',ascending=False)

df = preprocessing.preprocessing(df)
df = preprocessing.convert_datatype(df)
df = select_features(df, metadata)

train_x = df.drop("Sp_dol", axis=1)
train_y = df["Sp_dol"]

model_pred = model_prediction(train_x, mlp, mlp_pipeline)



  df = pd.read_csv(dataset_f)


In [6]:
# # test for feature extractor
# a = mlp_pipeline.transform(train_x)
# a = torch.tensor(a.toarray(), dtype=torch.float32)
# mlp.eval()
# mlp.to(device)
# mlp.features(a.to(device))

In [7]:
metrics = utils.eval_metrics(model_pred, train_y)
print(f"mae: {metrics['mae']:.2f}")
print(f"mse: {metrics['mse']:.2f}")
print(f"rmse: {np.sqrt(metrics['mse']):.2%}")
print(f"median: {metrics['median']:.2%}")
print(f"<= 5%: {metrics['count_5']:.2%}")
print(f"<= 10%: {metrics['count_10']:.2%}")
print(f"<= 20%: {metrics['count_20']:.2%}")

mae: 55912.89
mse: 13071930798.67
rmse: 11433254.48%
median: 2.57%
<= 5%: 72.94%
<= 10%: 88.96%
<= 20%: 97.76%




In [8]:
result_res = utils.predict_result(train_x, train_y, model_pred, os.path.join(output_dir, "predict_result.csv"))

In [9]:
style_worst = utils.display_worst_prediction(train_x, train_y, model_pred, "MLP prediction")
display(style_worst)
dfi.export(style_worst, os.path.join(output_dir, 'top_worst_predictions.png'))

Unnamed: 0_level_0,Prediction,Sale Price,Different Percentage
Ml_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N7377274,2461347,500000,392.27%
S7382284,2142497,570000,275.88%
W7332734,66682,1700000,96.08%
C7390224,949823,2236000,57.52%
S6644016,123086,80000,53.86%
W5839376,3606780,2500000,44.27%
N8016912,959918,1403000,31.58%
N8007124,957473,1380000,30.62%
E7379796,1175322,901000,30.45%
E8021558,1050589,1500000,29.96%


In [10]:
style_pred_area = utils.display_predict_result(train_x, train_y, model_pred, name="MLP prediction", group_by="Area", sort_by="Homes", ascending=False)
display(style_pred_area)
dfi.export(style_pred_area, os.path.join(output_dir, 'predictions_by_area.png'))

Unnamed: 0_level_0,Median Error,Within 5% of Sales Price,Within 10% of Sales Price,Within 20% of Sales Price,Homes
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
All Areas,2.57%,72.94%,88.96%,97.76%,2908
Peel,2.43%,78.42%,93.74%,99.34%,607
York,2.51%,68.93%,83.39%,95.89%,560
Toronto,2.89%,66.21%,81.93%,95.28%,509
Durham,2.73%,71.78%,91.08%,99.17%,482
Simcoe,2.63%,74.93%,93.40%,98.15%,379
Halton,2.05%,78.95%,92.26%,99.07%,323
Dufferin,2.82%,77.08%,89.58%,100.00%,48


In [11]:
style_pred_muni = utils.display_predict_result(train_x, train_y, model_pred, name="MLP prediction", group_by="Municipality_district", sort_by="Homes", ascending=False)
display(style_pred_muni)
dfi.export(style_pred_muni, os.path.join(output_dir, 'predictions_by_municipality.png'))

Unnamed: 0_level_0,Median Error,Within 5% of Sales Price,Within 10% of Sales Price,Within 20% of Sales Price,Homes
Municipality_district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
All Areas,2.57%,72.94%,88.96%,97.76%,2908
Brampton,2.19%,80.56%,94.08%,99.15%,355
Mississauga,2.78%,73.94%,93.62%,99.47%,188
Oshawa,3.02%,70.34%,91.72%,97.93%,145
Vaughan,2.36%,76.09%,86.96%,98.55%,138
Barrie,2.60%,74.24%,96.97%,99.24%,132
Markham,4.10%,58.87%,70.16%,92.74%,124
Whitby,2.70%,69.44%,90.74%,99.07%,108
Oakville,2.71%,76.24%,93.07%,100.00%,101
Milton,1.54%,82.83%,88.89%,97.98%,99
