In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data/checkpoint path below is read from /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import numpy as np
import csv
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Model, GPT2Tokenizer, AdamW

In [None]:
# Read every data row of the 2023 residential description CSV (the first row is
# consumed as a header and discarded).
with open('/content/drive/MyDrive/2023_residential_description.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    data_2023 = list(reader)

# One text prompt per row: the first two CSV columns followed by the fixed
# "The sold price is " suffix.
prompts_2023 = []
for row in data_2023:
    prompts_2023.append(f"{row[0]} {row[1]} The sold price is ")
concat_col_2023 = np.array(prompts_2023)

In [None]:
# Load the pretrained GPT-2 tokenizer and register '[PAD]' as its pad token;
# the max-length padding used further down needs a pad token to be defined.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

class ResiDataset(Dataset):
  """Dataset that tokenizes one sentence per item and pairs it with its target.

  Args:
    sentences: indexable collection of input strings.
    target: indexable collection of targets, one per sentence; each is
      converted to a ``torch.long`` tensor.
    tokenizer: object exposing ``encode_plus`` (the notebook passes the GPT-2
      tokenizer).
    max_length: length every sequence is padded / truncated to (default 256,
      the value that was previously hard-coded).
  """

  def __init__(self, sentences, target, tokenizer, max_length=256):
    self.sentences = sentences
    self.target = target
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self, idx):
    """Return ``input_ids``, ``attention_mask`` and ``target`` for item ``idx``."""
    # padding='max_length' alone is enough; the deprecated
    # pad_to_max_length flag that used to be passed as well is dropped.
    encoding = self.tokenizer.encode_plus(self.sentences[idx], add_special_tokens=True, max_length=self.max_length,
                        padding='max_length', return_attention_mask=True, truncation=True, return_tensors='pt')
    # return_tensors='pt' yields a leading batch axis of size 1; remove only that
    # axis so a length-1 sequence is not collapsed to a scalar.
    input_ids = encoding['input_ids'].squeeze(0)
    attention_mask = encoding['attention_mask'].squeeze(0)
    target = torch.tensor(self.target[idx], dtype=torch.long)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}

In [None]:
# Load pretrained GPT-2 and swap in an input-embedding table that has one
# extra row for the added pad token.
gpt2 = GPT2Model.from_pretrained("gpt2")
new_vocab_size = 1 + gpt2.config.vocab_size ## for <pad>
new_embed = nn.Embedding(num_embeddings=new_vocab_size, embedding_dim=gpt2.config.hidden_size)
# Every row except the last receives the pretrained weights; the single extra
# row keeps whatever values nn.Embedding initialised it with.
new_embed.weight.data[:-1, :].copy_(gpt2.wte.weight.data)
gpt2.set_input_embeddings(new_embed)
class GPTModel(nn.Module):
  """GPT-2 backbone followed by a single linear classification layer.

  Args:
    num_classes: output width of the linear head.
    gpt2: backbone module; must expose ``config.hidden_size`` and return an
      object with a ``last_hidden_state`` attribute when called.
  """

  def __init__(self, num_classes, gpt2):
    super().__init__()
    self.gpt_model = gpt2
    hidden_width = gpt2.config.hidden_size
    self.fc = nn.Linear(hidden_width, num_classes)

  def forward(self, input_ids, attention_mask):
    """Average the backbone's last hidden state over dim 1, then apply the head."""
    hidden_states = self.gpt_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
    # Plain mean over dim 1; the attention mask is only forwarded to the
    # backbone and is not used to weight this average.
    sequence_mean = hidden_states.mean(dim=1)
    logits = self.fc(sequence_mean)
    return logits

In [None]:
# Rebuild the wrapper with an 801-wide output head around the widened GPT-2
# backbone, then restore the saved weights from Drive.
gpt_model = GPTModel(801, gpt2)
# map_location='cpu' lets a checkpoint that was saved from a GPU session load on
# a CPU-only runtime; load_state_dict then copies the values into the model's
# own parameters, wherever they live.
gpt_model.load_state_dict(torch.load('/content/drive/MyDrive/GPTModel.pth', map_location='cpu'))

In [None]:
# Freeze every parameter of the loaded model so no gradients are tracked for it.
for frozen_param in gpt_model.parameters():
  frozen_param.requires_grad_(False)

In [None]:
def get_gpt_feature(model, data):
  """Return the mean-pooled last hidden state of the GPT backbone for one text.

  Args:
    model: either the classifier wrapper (its ``gpt_model`` attribute is used
      as the backbone) or the backbone itself. Previously this argument was
      ignored and the notebook-level ``gpt_model`` was always used.
    data: the text to encode with the notebook-level ``tokenizer``.

  Returns:
    Tensor of shape ``(1, hidden_size)``: the last hidden state averaged over
    the sequence axis.
  """
  # padding='max_length' already pads to 256; the deprecated pad_to_max_length
  # flag is no longer passed.
  encoding = tokenizer.encode_plus(data, add_special_tokens=True, max_length=256, padding='max_length',
                      return_attention_mask=True, truncation=True, return_tensors='pt')
  # Keep the (1, seq_len) batch axis produced by return_tensors='pt'. Squeezing
  # it away (as before) hands the backbone 1-D tensors, after which dim=1 of
  # the output is no longer guaranteed to be the sequence axis.
  input_ids = encoding['input_ids']
  attention_mask = encoding['attention_mask']
  backbone = getattr(model, 'gpt_model', model)
  gpt_out = backbone(input_ids=input_ids, attention_mask=attention_mask)
  return gpt_out.last_hidden_state.mean(dim=1)

## MLP

In [None]:
import time
import json
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sn
# for visualizing missing values
import missingno as msno
# from IPython.display import display
# import dataframe_image as dfi

# for display plot inline
# %matplotlib inline
# Switch matplotlib to its 'ggplot' style sheet for the plots in this notebook.
matplotlib.style.use('ggplot')

from sklearn.metrics import mean_absolute_error, mean_squared_error

import argparse
import joblib

In [None]:
def eval_metrics(model_pred, target):
    """Summarise prediction error against the true targets.

    Args:
        model_pred: predicted values (pandas Series or numpy array).
        target: true values, same length as ``model_pred``.

    Returns:
        dict with keys
        ``mae``, ``mse``, ``rmse``: absolute / squared error summaries;
        ``median``: median of ``|pred - target| / target``;
        ``count_5``, ``count_10``, ``count_20``: fraction of predictions whose
        relative error is at most 5%, 10% and 20%.
    """
    # scikit-learn's signature is (y_true, y_pred); both metrics are symmetric,
    # so the values are unchanged. The `squared=True` keyword is dropped: it
    # was the default and has been deprecated in newer scikit-learn releases.
    mae = mean_absolute_error(target, model_pred)
    mse = mean_squared_error(target, model_pred)

    # Wrap in a Series so .median()/.count() also work when both inputs are
    # plain numpy arrays (an existing Series passes through unchanged).
    perc = pd.Series(np.abs(model_pred - target) / target)
    median = perc.median()

    n_pred = np.size(model_pred)

    def share_within(threshold):
        # Fraction of predictions with relative error <= threshold (0 if empty).
        return perc[perc <= threshold].count() / n_pred if n_pred else 0

    return {
        "mae": mae,
        "mse": mse,
        "rmse": np.sqrt(mse),
        "median": median,
        "count_5": share_within(0.05),
        "count_10": share_within(0.10),
        "count_20": share_within(0.20)
    }

def split_data(data, valid_perc=None, test_perc=None):
    """
    Split a DataFrame positionally into test / validation / training parts and
    separate the "Sp_dol" target column from the features.

    The dataset is assumed to be sorted by sold date descending: the first
    ``test_perc`` share of rows becomes the test set, the next ``valid_perc``
    share the validation set, and everything after that the training set.

    Args:
        data: DataFrame containing a "Sp_dol" column.
        valid_perc: fraction of rows for validation (falsy = no validation set).
        test_perc: fraction of rows for testing (falsy = no test set).

    Returns:
        dict with "train_x"/"train_y", plus "validate_x"/"validate_y" when
        ``valid_perc`` is given and "test_x"/"test_y" when ``test_perc`` is given.
    """

    valid_share = valid_perc if valid_perc else 0
    test_share = test_perc if test_perc else 0

    # Positional boundaries. Plain .iloc slicing replaces np.split, which goes
    # through the deprecated DataFrame.swapaxes, and also covers the case of a
    # test set without a validation set (that used to crash on None.drop).
    test_end = int(test_share * len(data))
    valid_end = int((valid_share + test_share) * len(data))

    data_test = data.iloc[:test_end] if test_perc else None
    data_validate = data.iloc[test_end:valid_end] if valid_perc else None
    data_train = data.iloc[valid_end:]

    print("     all data:", data.shape)
    print(   "train data:", data_train.shape)

    result = {
        "train_x": data_train.drop("Sp_dol", axis=1),
        "train_y": data_train["Sp_dol"]
    }

    if valid_perc:
        print("validate data:", data_validate.shape)
        result.update({
            "validate_x": data_validate.drop("Sp_dol", axis=1),
            "validate_y": data_validate["Sp_dol"],
        })

    if test_perc:
        print("    test data:", data_test.shape)
        result.update({
            "test_x": data_test.drop("Sp_dol", axis=1),
            "test_y": data_test["Sp_dol"],
        })

    return result


def predict_result(data, target, predict, save_path=None):
    """
    Build a per-listing table comparing predictions with actual values.

    example usage:
    result_res = predict_result(split_data_res["validate_x"], split_data_res["validate_y"],
                                model_res.predict(split_data_res["validate_x"]), "2023_res_validate_result.csv")

    Args:
        data: features the predictions were made from; only its index is used
            (for the "Ml_num" column). Objects without ``.index`` get 0..n-1.
        target: actual values.
        predict: predicted values, aligned with ``target``.
        save_path: optional CSV path the table is also written to.

    Returns:
        DataFrame with columns "Ml_num", "predict", "actual", "diff"
        (predict - actual) and "diff_perc" (|diff| / actual, 4 decimals).
    """

    try:
        listing_ids = data.index
    except AttributeError:
        # e.g. a numpy array: fall back to positional ids. Only the missing
        # attribute is handled; other errors are no longer swallowed.
        listing_ids = list(range(data.shape[0]))

    output = pd.DataFrame({
        'Ml_num': listing_ids,
        'predict': predict,
        "actual": target,
        "diff": predict - target,
        "diff_perc": round(np.abs(predict - target)/target, 4),
    })

    if save_path:
        output.to_csv(save_path, index=True)

    return output


def display_worst_prediction(data, target, predict, name="", topk=10):

    # pred = my_model.predict(d_x)
    # pred_y = d_y
    d_x = data
    pred = predict
    pred_y = target

    try:
        d_x_index = d_x.index
    except:
        d_x_index = list(range(d_x.shape[0]))

    pred = pd.DataFrame({"pred":pred, "Ml_num":d_x_index, "pred_y":pred_y})
    pred.set_index("Ml_num", inplace=True)

    pred["diff"] = np.abs(pred["pred"]-pred["pred_y"]) / pred["pred_y"]

    res = pred[["pred", "pred_y", "diff"]].sort_values(by=["diff"],ascending=False) \
            .head(topk).rename(columns={"pred": "Prediction", "pred_y": "Sale Price", "diff":"Different Percentage"}) \
            .style.format({"Prediction":"{:,.0f}", "Sale Price":"{:,.0f}", "Different Percentage":"{:,.2%}"}) \
            .set_table_styles([{
                 'selector': 'caption',
                 'props': 'font-weight:bold;font-size:1.25em;'
             }], overwrite=False) \
            .set_caption(f"Top {topk} Worst Predict Result of Listings" + ("" if not name else f"<br>{name}"))
            # .set_caption(f"Worst Predict Result of Listings<br>({data_name} set)"))


    # print("Total predicted:", pred.shape)
    # print("Total predicted with difference > 0.5:", pred[pred["diff"]>0.5].shape)

    return res


def display_predict_result(data, target, predict, name="", group_by="Area", sort_by="Homes", ascending=False):


    # pred = my_model.predict(d_x)
    # pred_y = d_y
    d_x = data
    pred = predict
    pred_y = target

    # group_by = "Area" #"Area" # "Municipality_district" # "S_r"
    # sort_by = "Homes" # "Municipality_district" # "Homes"
    # ascending = False

    try:
        d_x_index = d_x.index
    except:
        d_x_index = list(range(d_x.shape[0]))

    pred = pd.DataFrame({"pred":pred, "Ml_num":d_x_index, "pred_y":pred_y})
    pred = pd.concat([pred, d_x[[group_by]]], axis=1, join='inner')
    pred.set_index("Ml_num", inplace=True)


    def calculation(x):
        perc = np.abs(x["pred"] - x["pred_y"])/x["pred_y"]
        median = perc.median()
        count_5 = perc[perc <= 0.05].count() / x["pred"].size if x["pred"].size else 0
        count_10 = perc[perc <= 0.10].count() / x["pred"].size if x["pred"].size else 0
        count_20 = perc[perc <= 0.20].count() / x["pred"].size if x["pred"].size else 0

        # print(x.shape) <= (15, 3)
        # print(x.size) <= 45
        # print(x.size.astype(int)) <= 45

        res = {'Median Error': median, 'Within 5% of Sales Price': count_5, 'Within 10% of Sales Price': count_10, 'Within 20% of Sales Price': count_20, "Homes":x["pred"].size}
        return pd.Series(res, index=res.keys())


    # TODO: why groupby contains empty dataframe, a workaround right now is prevent zero division in calculation()
    result = pred.groupby([group_by]).apply(calculation)
    result.loc["All Areas"] = calculation(pred)

    result = result.sort_values(by=sort_by, ascending=ascending)
    result_style = result.style.format({'Median Error': "{:.2%}",'Within 5% of Sales Price': "{:.2%}",'Within 10% of Sales Price': "{:.2%}",'Within 20% of Sales Price': "{:.2%}",'Homes': "{:,.0f}"}) \
                    .set_table_styles([{
                                 'selector': 'caption',
                                 'props': 'font-weight:bold;font-size:1.25em;'
                             }], overwrite=False) \
                    .set_caption(name)
    # display(result_style)
    # result[['Median Error','Within 5% of Sales Price','Within 10% of Sales Price','Within 20% of Sales Price']] = result[['Median Error','Within 5% of Sales Price','Within 10% of Sales Price','Within 20% of Sales Price']].applymap('{:.2%}'.format)
    # result['Homes'] = result['Homes'].apply('{:,.0f}'.format)
    # result.dfi.export(f"2023_res_validate_result.png")

    return result_style

In [None]:
def handle_lot_size(x):
    """
    Derive a single lot-size number from one row's "Front_ft", "Depth" and
    "Lotsz_code" fields.

    When one dimension is 0 the other is used on its own; otherwise the two
    are multiplied. A falsy result (e.g. 0) is returned as is, without looking
    at the unit. "Metres" values are multiplied by 3.28084, "Acres" values by
    43560; "Feet" and any other unit code leave the value unchanged.
    """
    # NOTE(review): the same factor is applied whether the value is a single
    # length or a front*depth product - confirm that this is intended.
    front = x["Front_ft"]
    depth = x["Depth"]
    size = depth if front == 0 else (front if depth == 0 else front * depth)

    if not size:
        return size

    # convert all units to Feet
    unit = x["Lotsz_code"]
    for unit_name, factor in (("Metres", 3.28084), ("Acres", 43560)):
        if unit == unit_name:
            return size * factor
    return size


def preprocessing(data):
    """
    Drop rows without a sold price and add the derived ``custom_*`` columns.

    Steps:
      * rows whose "Sp_dol" is missing are removed;
      * "Den_fr" / "Fpl_num" become booleans (True only for 'Y') in
        "custom_den_fr" / "custom_fpl_num";
      * "custom_tour_url" is True when "Tour_url" is a non-blank string;
      * a 0 in "Taxes", "Tv", "Lat" or "Lng" is replaced by a missing value;
      * "custom_lot_size" is computed per row by ``handle_lot_size``.

    The input frame is left untouched; a new DataFrame is returned.
    """

    # Explicit copy: the columns below are assigned on the dropna() result, and
    # without it pandas can emit SettingWithCopyWarning for those assignments.
    data = data.dropna(subset=['Sp_dol']).copy()

    # convert fields to boolean
    #   "Den_fr": "Family Room - not NA: 47287/50026 - 94.52% - ['N', 'Y']",
    data['custom_den_fr'] = data['Den_fr'].apply(lambda x: True if x=='Y' else False)
    data['Taxes'] = data['Taxes'].apply(lambda x: None if x==0 else x)
    data['Tv'] = data['Tv'].apply(lambda x: None if x==0 else x)
    data['custom_tour_url'] = data['Tour_url'].apply(lambda x: True if type(x) == str and x.strip() else False)
    data['custom_fpl_num'] = data['Fpl_num'].apply(lambda x: True if x=='Y' else False)

    data['Lat'] = data['Lat'].apply(lambda x: None if x==0 else x)
    data['Lng'] = data['Lng'].apply(lambda x: None if x==0 else x)

    # new fields for special handle fields
    data["custom_lot_size"] = data.apply(handle_lot_size, axis=1)

    return data


def convert_datatype(data):
    """
    Cast a hand-picked set of columns to their working dtypes, in place, and
    return the same DataFrame.

    The idea is to sort every feature into a datatype bucket and let the user
    choose among them later; for now only the features we may care about are
    listed.
    """

    integer_columns = (
        "Photo_count",
        "Bath_tot", "Br", "Br_plus", "Rms", "Rooms_plus", "Kit_plus", "Num_kit",
        "Gar_spaces", "Park_spcs",
        # "Lp_dol",
        "Sp_dol", # target
    )

    float_columns = (
        "Lat", "Lng",
        # "Taxes",
        "custom_lot_size",
    )

    # "Input_date" makes it worse, should we shuffle the data? Right now it's sorted by Cd (sold date)
    date_columns = ("Input_date",)

    # Listed for reference only: nothing below converts these columns.
    bool_columns = ("custom_den_fr", "custom_tour_url", "custom_fpl_num")

    category_columns = (
        "Comp_pts", # unique: 4
        "Constr1_out", "Constr2_out", # todo: they represent the same thing, (e.g. A,B = B,A) # unique: 14
        "Bsmt1_out", "Bsmt2_out", # todo: they represent the same thing, (e.g. A,B = B,A) # unique: 14
        "Yr_built", # ['0-5', '100+', '16-30', '31-50', '51-99', ...] unique: 7
        "Acres", # unique: 9
        "Sqft", # unique: 9
        "Style", # unique: 17
        "Type_own1_out", # unique: 17
        # "Spec_des1_out", # unique 6, though almost all are Unknown (46070 over 50026)
        "Area", # unique 7 (we restricted the records to 7 areas)
        "Municipality_district", # unique 86 (within these 7 areas)
        "Community", # unique 579 (within these 7 areas)
    ) # worse: "Sewer", "Heating",

    # TODO: handle: Zip?, Input_date?

    # (columns, converter) pairs, applied in this order. Missing numbers become
    # 0; integers are rounded before the cast; dates written as "YYYY-MM-DD"
    # text lose their dashes and become one integer (missing dates become 0).
    conversions = (
        (float_columns, lambda col: col.fillna(0).astype(float)),
        (integer_columns, lambda col: col.fillna(0).round().astype('int64')),
        (category_columns, lambda col: col.astype("category")),
        (date_columns, lambda col: col.str.replace("-","").fillna(0).astype(int)),
    )

    for column_names, convert in conversions:
        for column_name in column_names:
            data[column_name] = convert(data[column_name])

    return data


def export_data(data, config):
    """
    Write the selected feature columns to CSV together with a JSON copy of ``config``.

    Args:
        data: DataFrame holding at least every column named in ``config["features"]``.
        config: dict with
            "features": mapping of group name -> list of column names to export;
            "save_path": destination ".csv" path;
            "testset_percentage" (optional): leading fraction of rows written to
            a separate "<save_path>_test.csv" file; the rest goes to "save_path".

    Side effects:
        Writes the CSV file(s) and "<save_path>.json" (the config as metadata),
        and prints the exported shapes.
    """

    feature_lis = [column for columns in config["features"].values() for column in columns]
    data = data[feature_lis]

    test_share = config.get("testset_percentage")
    if test_share:
        # Positional .iloc slicing replaces np.split, which routes DataFrames
        # through the deprecated DataFrame.swapaxes.
        boundary = int(test_share * len(data))
        data_test, data_train = data.iloc[:boundary], data.iloc[boundary:]
        data_train.to_csv(config["save_path"])
        data_test.to_csv(config["save_path"].replace(".csv", "_test.csv"))
        print(f"Training size (without index): {data_train.shape}")
        print(f"Test size (without index): {data_test.shape}")
    else:
        data.to_csv(config["save_path"])
        print(f"Training size (without index): {data.shape}")

    # save metadata
    with open(config["save_path"].replace(".csv", ".json"), "w", encoding="utf-8") as f:
        f.write(json.dumps(config, indent=2, ensure_ascii=False))


def select_features(dataset, metadata):
    """
    Cast the columns named in ``metadata["features"]`` to their declared dtypes
    (modifying ``dataset`` in place) and return only those columns.

    ``metadata["features"]`` must provide the groups "integer", "float",
    "boolean" and "category"; the returned frame contains the columns of every
    group in the mapping, in the mapping's own order.
    """
    groups = metadata["features"]
    int_columns = groups["integer"]
    float_columns = groups["float"]
    bool_columns = groups["boolean"]
    category_columns = groups["category"]

    # Conversion order: floats, integers, booleans, categories. Missing numeric
    # values become 0 and integers are rounded before the cast.
    casting_plan = [
        (float_columns, lambda col: col.fillna(0).astype(float)),
        (int_columns, lambda col: col.fillna(0).round().astype('int64')),
        (bool_columns, lambda col: col.astype('bool')),
        (category_columns, lambda col: col.astype("category")),
    ]
    for column_names, cast in casting_plan:
        for column_name in column_names:
            dataset[column_name] = cast(dataset[column_name])

    selected = [column_name for column_names in groups.values() for column_name in column_names]
    return dataset[selected]

In [None]:
# Feature metadata (column names per dtype group) consumed by select_features.
with open('/content/drive/MyDrive/features.json', 'r') as f:
  features = json.load(f)

# Saved TorchScript MLP and its fitted preprocessing pipeline.
# NOTE(review): joblib.load unpickles the file - only load pipelines from a trusted source.
mlp_model = torch.jit.load("/content/drive/MyDrive/mlp_model.pt")
mlp_pipeline = joblib.load("/content/drive/MyDrive/pipeline.pkl")

# Listings indexed by 'Ml_num' and ordered by 'Cd' descending, then run through
# the same preprocessing -> dtype conversion -> feature selection chain.
df = (
    pd.read_csv("/content/drive/MyDrive/2023_Residential_extra.csv")
    .set_index('Ml_num')
    .sort_values(by='Cd', ascending=False)
)
df = select_features(convert_datatype(preprocessing(df)), features)

# Split off the sold price as the target.
train_y = df["Sp_dol"]
train_x = df.drop(columns="Sp_dol")


In [None]:
def get_mlp_feature(model, train_x, pipeline, device=None):
  """Run the fitted pipeline on ``train_x`` and return ``model.features`` of the result.

  Args:
    model: module exposing a ``features`` callable; it is put in eval mode and
      moved to ``device``.
    train_x: raw feature row(s) accepted by ``pipeline.transform``.
    pipeline: fitted transformer; its output may be sparse or dense.
    device: torch device to run on. When omitted, a notebook-level ``device``
      variable is used if one exists (the previous behaviour); otherwise CUDA
      is used when available, else CPU.

  Returns:
    Whatever ``model.features`` returns for the float32 input tensor.
  """
  if device is None:
    # Keep honouring a global `device` if the notebook defined one, but do not
    # fail with NameError when it did not.
    device = globals().get("device")
  if device is None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  transformed = pipeline.transform(train_x) # train_x here is one data row
  if hasattr(transformed, "toarray"):
    # Sparse matrices must be densified before building a tensor; dense
    # pipeline output is used as is.
    transformed = transformed.toarray()
  mlp_input = torch.tensor(transformed, dtype=torch.float32)

  model.eval()
  model.to(device)
  return model.features(mlp_input.to(device))

In [None]:
class FinalMLP(nn.Module):
  """Two-layer MLP over the concatenated GPT, MLP and CNN feature vectors.

  Args:
    concat_feature_len: total width of the three feature vectors once
      concatenated (input size of the first layer).
    num_hidden: accepted for interface compatibility; currently unused - the
      layer widths are fixed at 256 and 64.
  """

  def __init__(self, concat_feature_len, num_hidden):
    # nn.Module must be initialised before sub-modules are assigned.
    super().__init__()
    self.fc1 = nn.Linear(concat_feature_len, 256)
    self.fc2 = nn.Linear(256, 64)

  def forward(self, gpt_feature, mlp_feature, cnn_feature):
    """Concatenate the three features on the last axis and apply fc1 -> ReLU -> fc2."""
    # torch.cat keeps the tensors on their device and inside the autograd graph
    # (np.concatenate did neither). The last axis is the feature axis for both
    # 1-D vectors and (batch, features) inputs.
    concat_feature = torch.cat((gpt_feature, mlp_feature, cnn_feature), dim=-1)
    h1 = self.fc1(concat_feature)
    z1 = torch.relu(h1)
    return self.fc2(z1)