In [34]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

import torch

from torchvision import transforms
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import get_scheduler
from sklearn.metrics import PredictionErrorDisplay


from jre_utils.datapath import model_ready_data_paths, model_output_data_paths
from jre_utils.data import JapanRETimeSeriesDataset, PadAndMask, ToNumpy, ToTensor
from jre_utils.models import TimeSeriesTransformerModel
from jre_utils.metrics import MSELossWeighted
from jre_utils.engine import (
    evaluate,
    train,
    evaluate_weighted,
    train_weighted,
    EarlyStopper,
)


# Notebook-wide settings: silence every warning category and let pandas
# render all columns of wide frames instead of eliding the middle ones.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [35]:
# Short metric keys -> column names in the prepared data. Every base metric
# also has a "_smoothed" variant whose key and column carry the same suffix.
_base_metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}
metrics = {
    **_base_metrics,
    **{
        f"{key}_smoothed": f"{column}_smoothed"
        for key, column in _base_metrics.items()
    },
}

# Column groups used for grouping and display.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# The raw metric and its smoothed counterpart used throughout the notebook.
metric_key_unsmoothed = "median"
metric_key = f"{metric_key_unsmoothed}_smoothed"
metric_unsmoothed = metrics[metric_key_unsmoothed]
metric = metrics[metric_key]

# Derived column names: percent change of the metric, and its per-year
# normalised version (used as the prediction target further down).
metric_pct_chg = f"{metric}_pct_chg"
normalized_metric_pct_chg = f"{metric_pct_chg}_normalized_yearly"

In [36]:
# --- Experiment configuration ------------------------------------------------
start_year = 2006
eval_start_year = 2020  # evaluation years: 2020, 2021, 2022
eval_end_year = 2022
shuffle_seed = 42  # fixes the within-year row order so reruns give the same frames

dataset_key = "transactions"
years_ahead = 2  # forecast horizon; part of the dataset name and passed as `shift` below
dataset_name = f"sequence_{dataset_key}_{metric_key}_{years_ahead}"
output_dataset_name = f"{dataset_name}_{eval_start_year}"
model_ready_data_path = model_ready_data_paths[dataset_name]
model_output_data_path = model_output_data_paths[output_dataset_name]

# --- Load and order the data -------------------------------------------------
df = pd.read_csv(model_ready_data_path)
df = df[df["year"] <= eval_end_year]
# Shuffle with a fixed seed, then sort by year with a stable sort so rows are
# chronological across years and reproducibly shuffled within each year.
df = df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
df = df.sort_values(by=["year"], kind="stable").reset_index(drop=True)

# Sample weight derived from the transaction count (used as `weight_column`
# by the datasets below); computed vectorised rather than row by row.
df["count_scaled"] = 1 + np.log10(df["count"])

# Chronological split: [start_year, eval_start_year) trains, the rest evaluates.
# `df` itself keeps the earlier years as well.
train_df = df[(df["year"] >= start_year) & (df["year"] < eval_start_year)].reset_index(drop=True)
eval_df = df[df["year"] >= eval_start_year].reset_index(drop=True)

In [37]:
# Training-period history of the metric, its percent change and the
# normalised percent change for area_code 13101.
train_df.loc[
    train_df["area_code"] == 13101,
    ["year", metric, metric_pct_chg, normalized_metric_pct_chg],
]

Unnamed: 0,year,unit_price_median_smoothed,unit_price_median_smoothed_pct_chg,unit_price_median_smoothed_pct_chg_normalized_yearly
2,2007,2204983.0,0.227171,2.619291
28,2008,2197454.0,0.139858,1.066988
766,2009,2069647.0,-0.061378,-0.060949
2029,2010,1825045.0,-0.169473,-0.456003
2358,2011,1620211.0,-0.217156,-0.780465
5087,2012,1684720.0,-0.076888,-0.332538
5602,2013,1947510.0,0.20201,1.11335
7155,2014,2130575.0,0.264646,1.446387
8667,2015,2257352.0,0.159097,0.843063
10020,2016,2645833.0,0.24184,1.197871


In [38]:
# Same columns for area_code 13101 over the evaluation years.
eval_df.loc[
    eval_df["area_code"] == 13101,
    ["year", metric, metric_pct_chg, normalized_metric_pct_chg],
]

Unnamed: 0,year,unit_price_median_smoothed,unit_price_median_smoothed_pct_chg,unit_price_median_smoothed_pct_chg_normalized_yearly
1147,2020,3849920.0,0.257623,1.115681
2191,2021,4164547.0,0.225359,0.829393
3158,2022,4462737.0,0.159177,0.495757


In [39]:
# Manual sanity check: rebuild by hand the input window for one evaluation row
# and show it next to that row's target.
#
# The example row is selected by its identifying values rather than by a
# hard-coded position, because a row's position in `eval_df` depends on the
# shuffle applied when the data was loaded.
example_area_code = 13101
example_year = eval_end_year
example_matches = eval_df[
    (eval_df["area_code"] == example_area_code) & (eval_df["year"] == example_year)
]
assert not example_matches.empty, (
    f"No evaluation row for area_code={example_area_code}, year={example_year}"
)
idx = example_matches.index[0]
row = eval_df.loc[idx]

target = row[normalized_metric_pct_chg]
area_code, year = row["area_code"], row["year"]

# History available `years_ahead` years before the target year, keeping the
# most recent `window_length` rows.
# NOTE(review): presumably mirrors the window JapanRETimeSeriesDataset builds
# with shift=years_ahead - confirm against the dataset implementation.
window_length = 5
window = (
    df[
        (df["area_code"] == area_code)
        & (df["year"] <= year - years_ahead)
    ]
    .sort_values(by="year")
    .tail(window_length)
)
print(f"Target: {target}")
window

Target: 0.4957566755770115


Unnamed: 0,unit_price_median_smoothed_pct_chg,unit_price_median_smoothed,unit_price_median,year,years_since_crisis,count,total_traded_area,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,new_dwellings,existing_dwellings,net_migration_ratio,new_dwellings_ratio,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,area_code,area,new_dwellings_ratio_normalized_yearly,log_new_dwellings_ratio,log_new_dwellings_ratio_normalized_yearly,unit_price_median_smoothed_log,unit_price_median_smoothed_log_normalized_yearly,count_log,count_log_normalized_yearly,total_traded_area_log,total_traded_area_log_normalized_yearly,population_log,population_log_normalized_yearly,taxpayer_count_log,taxpayer_count_log_normalized_yearly,taxable_income_log,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log,total_tax_log_normalized_yearly,new_dwellings_log,new_dwellings_log_normalized_yearly,existing_dwellings_log,existing_dwellings_log_normalized_yearly,unit_price_median_smoothed_pct_chg_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,unit_price_median_smoothed_normalized_yearly,count_normalized_yearly,total_traded_area_normalized_yearly,population_normalized_yearly,taxpayer_count_normalized_yearly,taxable_income_normalized_yearly,taxable_income_per_taxpayer_normalized_yearly,new_dwellings_normalized_yearly,existing_dwellings_normalized_yearly,total_tax_normalized_yearly,migrations_is_available_normalized_yearly,taxable_income_is_available_normalized_yearly,dwellings_is_available_normalized_yearly,total_tax_is_available_normalized_yearly,count_scaled
10020,0.24184,2645833.0,2645833.0,2016,8,58.0,9205.0,60870.0,34324.0,314359478.0,9158.591015,0.123044,0.079495,18441614.0,0.057706,1545.0,38740.0,0.014473,0.039881,1,1,1,1,13101,Tokyo-to Chiyoda-ku,3.645084,-0.399231,2.260922,7.422562,3.774222,2.763428,0.523342,4.964024,0.038018,5.784403,0.380994,5.535598,0.579343,9.497427,1.416775,4.961829,7.516626,8.265799,1.001951,4.188928,0.953829,5.58816,0.043625,1.197871,1.242587,3.332668,2.801374,2.891238,16.231302,-0.112194,-0.27092,-0.141833,-0.066958,0.498455,10.691254,0.441195,-0.015471,0.120489,0.0,0.06613,0.836356,0.026948,2.763428
10886,0.264897,2855318.0,3937500.0,2017,9,48.0,7930.0,61751.0,35326.0,333664476.0,9445.294571,0.061411,0.031304,19084096.0,0.034839,1415.0,40285.0,0.017522,0.035125,1,1,1,1,13101,Tokyo-to Chiyoda-ku,3.521004,-0.454387,2.18768,7.455655,3.798386,2.681241,0.388403,4.899273,-0.096657,5.790644,0.394623,5.548094,0.593036,9.52331,1.445119,4.975216,7.684541,8.280672,1.018779,4.150756,0.915407,5.605143,0.06987,1.325756,0.600536,1.082507,0.66526,3.529738,16.441401,-0.162684,-0.309082,-0.137364,-0.061498,0.531391,11.0239,0.386022,-0.00716,0.131257,0.0,0.06613,0.836356,0.026948,2.681241
12596,0.157014,3061267.0,3061189.0,2018,10,50.0,8190.0,62833.0,36299.0,362690825.0,9991.758037,0.086993,0.057856,19816187.0,0.038361,978.0,41700.0,0.032212,0.023453,1,1,1,1,13101,Tokyo-to Chiyoda-ku,1.962634,-0.629797,1.385822,7.485901,3.808761,2.69897,0.430708,4.913284,-0.034414,5.798188,0.410692,5.559895,0.607849,9.559537,1.490338,4.999642,7.906475,8.29702,1.035597,3.990339,0.622948,5.620136,0.092997,0.792784,0.767911,1.390629,1.050529,6.184275,17.107678,-0.147566,-0.301729,-0.13197,-0.055703,0.583629,11.587289,0.154911,-0.000265,0.122456,0.0,0.06613,0.835103,0.0,2.69897
14219,0.190282,3398634.0,3369318.0,2019,0,48.0,8015.0,64857.0,38175.0,412894018.0,10815.822344,0.138419,0.082474,21648748.0,0.092478,787.0,42487.0,0.028108,0.018523,1,1,1,1,13101,Tokyo-to Chiyoda-ku,1.609337,-0.732281,1.152682,7.531304,3.852696,2.681241,0.411331,4.903904,-0.052764,5.811957,0.438657,5.581779,0.64164,9.615839,1.569186,5.03406,8.218735,8.335433,1.092918,3.895975,0.506015,5.628256,0.099687,0.999511,2.562201,3.308448,2.268437,5.265364,18.15421,-0.152671,-0.304405,-0.122051,-0.041034,0.68788,12.233268,0.099425,0.001371,0.152989,0.0,0.06613,0.835103,0.0,2.681241
15880,0.257623,3849920.0,3787942.0,2020,1,30.0,3550.0,66680.0,39873.0,400984266.0,10056.536152,-0.028845,-0.070201,20573851.0,-0.049652,1159.0,43646.0,0.014607,0.026555,1,1,1,1,13101,Tokyo-to Chiyoda-ku,3.235951,-0.575861,2.07475,7.585452,3.896217,2.477121,0.000391,4.550228,-0.789141,5.823996,0.463495,5.600679,0.672562,9.603127,1.539504,5.002448,7.945012,8.313316,1.061534,4.064083,0.882594,5.639944,0.11621,1.115681,-1.299834,-1.115713,-2.877245,3.201692,19.194516,-0.267504,-0.439973,-0.113077,-0.02704,0.645654,11.51037,0.348051,0.006594,0.134451,0.0,0.06613,0.835103,0.0,2.477121


In [40]:
# Columns consumed through their "<name>_log_normalized_yearly" variant
# (none at the moment).
log_normalize_columns = []

# Columns consumed through their "<name>_normalized_yearly" variant.
normalize_columns = [
    metric,
    metric_pct_chg,
    "count",
    "total_traded_area",
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "new_dwellings",
    "existing_dwellings",
    "new_dwellings_ratio",
    "net_migration_ratio",
    "total_tax",
    "total_tax_growth",
]

# Columns passed through under their own name. The raw percent change is
# listed here in addition to its normalised variant above.
maintain_columns = [
    metric_pct_chg,
    "years_since_crisis",
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
]

# Row identifiers; not part of the feature set.
id_columns = ["area_code", "area", "year"]

# Feature order: log-normalised, then normalised, then pass-through columns.
feature_columns = [
    *(f"{column}_log_normalized_yearly" for column in log_normalize_columns),
    *(f"{column}_normalized_yearly" for column in normalize_columns),
    *maintain_columns,
]

final_columns = [*id_columns, *feature_columns]

In [17]:
def _make_inspection_dataset(**extra_kwargs):
    """Build a dataset over the training rows with this notebook's target,
    weight column, features and forecast shift; extra keyword arguments are
    forwarded unchanged to JapanRETimeSeriesDataset."""
    return JapanRETimeSeriesDataset(
        df,
        train_df,
        metrics=[normalized_metric_pct_chg],
        weight_column="count_scaled",
        feature_columns=feature_columns,
        shift=years_ahead,
        **extra_kwargs,
    )


# Raw samples (no transform) and the padded / masked tensor version of the same.
time_series_dataset = _make_inspection_dataset()
time_series_dataset_transformed = _make_inspection_dataset(
    transform=transforms.Compose([ToNumpy(), PadAndMask(), ToTensor()]),
)

# Peek at the first four raw samples; zip with range(4) stops after index 3.
for i, sampleX in zip(range(4), time_series_dataset):
    print(
        i,
        sampleX["window"].shape,
        sampleX["target"].shape,
        sampleX["target"].values,
        sampleX["weight"].values,
    )


# The same four samples after the transform pipeline.
for i, sampleY in zip(range(4), time_series_dataset_transformed):
    print(
        i,
        sampleY["window"].shape,
        sampleY["target"].shape,
        sampleY["target"],
        sampleY["weight"],
    )

0 (0, 22) (1,) [0.81251324] [2.88081359]
1 (0, 22) (1,) [1.02185667] [3.26007139]
2 (0, 22) (1,) [-0.21269522] [3.11058971]
3 (0, 22) (1,) [-0.74802854] [2.39794001]
0 torch.Size([5, 22]) torch.Size([1]) tensor([0.8125]) tensor([2.8808])
1 torch.Size([5, 22]) torch.Size([1]) tensor([1.0219]) tensor([3.2601])
2 torch.Size([5, 22]) torch.Size([1]) tensor([-0.2127]) tensor([3.1106])
3 torch.Size([5, 22]) torch.Size([1]) tensor([-0.7480]) tensor([2.3979])


In [18]:
# Number of samples per batch, shared by the training and evaluation DataLoaders.
BATCH_SIZE = 256

In [19]:
# Training and evaluation datasets. Both draw their history from the full
# frame `df` and differ only in which rows supply the targets.
# `shift=years_ahead` is passed explicitly so these datasets use the same
# forecast horizon as the inspection datasets above and as encoded in
# `dataset_name`, instead of relying on the class default.
train_dataset = JapanRETimeSeriesDataset(
    df,
    train_df,
    feature_columns=feature_columns,
    metrics=[normalized_metric_pct_chg],
    weight_column="count_scaled",
    transform=transforms.Compose([ToNumpy(), PadAndMask(), ToTensor()]),
    shift=years_ahead,
)
eval_dataset = JapanRETimeSeriesDataset(
    df,
    eval_df,
    feature_columns=feature_columns,
    metrics=[normalized_metric_pct_chg],
    weight_column="count_scaled",
    transform=transforms.Compose([ToNumpy(), PadAndMask(), ToTensor()]),
    shift=years_ahead,
)

# NOTE(review): the training loader does not shuffle, and the rows were sorted
# by year when the data was loaded, so batches are presumably chronological.
# Confirm this is intended; shuffle=True is the usual choice for training.
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0
)

eval_dataloader = DataLoader(
    eval_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0
)

In [20]:
# Print the column dtypes and non-null counts of the full frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18826 entries, 0 to 18825
Data columns (total 44 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   unit_price_median_smoothed_pct_chg                    18826 non-null  float64
 1   unit_price_median_smoothed                            18826 non-null  float64
 2   unit_price_median                                     18826 non-null  float64
 3   year                                                  18826 non-null  int64  
 4   years_since_crisis                                    18826 non-null  int64  
 5   count                                                 18826 non-null  float64
 6   total_traded_area                                     18826 non-null  float64
 7   population                                            18826 non-null  float64
 8   taxpayer_count                                        18

# Building the Transformer Model

In [21]:
# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# To force CPU execution, override here with: device = "cpu"
print(f"Using {device} device")

Using mps device


In [22]:
# Architecture hyperparameters. The input width follows the feature list.
n_features = len(feature_columns)
d_model, d_hid = 256, 256
nlayers, nhead = 8, 8
dropout, enc_dropout = 0, 0

# Build the model and move it to the selected device in one step.
model = TimeSeriesTransformerModel(
    n_features=n_features,
    d_model=d_model,
    nhead=nhead,
    d_hid=d_hid,
    nlayers=nlayers,
    dropout=dropout,
    enc_dropout=enc_dropout,
    device=device,
).to(device)


In [23]:
# Smoke test: push one evaluation batch through the untrained model and print
# the tensor shapes and the weighted MSE, without tracking gradients.
model.eval()

mse_loss_weighted = MSELossWeighted().to(device)

with torch.no_grad():
    for batch in eval_dataloader:
        # Move the four batch tensors to the device in a fixed order.
        window, mask, target, weight = (
            batch[key].to(device) for key in ("window", "mask", "target", "weight")
        )

        outputs = model(window, mask)
        loss = mse_loss_weighted(outputs, target, weight)

        # Shapes of the batch tensors, space-separated on one line.
        print(*(batch[key].shape for key in ("window", "mask", "target", "weight")))

        print(outputs.shape)

        print("Loss:", loss.item())

        # Only the first batch is needed for this check.
        break

torch.Size([256, 5, 22]) torch.Size([256, 5]) torch.Size([256, 1]) torch.Size([256, 1])
torch.Size([256, 1])
Loss: 3.3871610164642334


In [24]:
# Optimisation hyperparameters, consumed by the AdamW / scheduler setup and the
# epoch loop below.
learning_rate = 1e-4 # passed to AdamW as lr; 3e-4 is noted as an alternative value
weight_decay = 1 # passed to AdamW as weight_decay
num_epochs = 30 # number of epochs; also sets the scheduler's total step count

In [25]:
# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# One scheduler step per training batch over the whole run: the learning rate
# follows the "linear" schedule with no warm-up ("constant" is the noted
# alternative schedule name).
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Per-epoch history, filled by the training loop.
train_losses = []
train_r2_scores = []
eval_losses = []
eval_r2_scores = []

In [26]:
# No tqdm bar is used for this run; the training helper receives None in its
# place. (The disabled alternative was tqdm(range(num_training_steps)), closed
# after the loop.)
progress_bar = None

for epoch in range(num_epochs):
    # One pass over the training data; record its loss and R^2 right away so
    # the history stays consistent even if evaluation is interrupted.
    train_loss, train_r2_score = train_weighted(
        model, train_dataloader, optimizer, lr_scheduler, progress_bar, device=device
    )
    train_losses.append(train_loss)
    train_r2_scores.append(train_r2_score)

    # Score the held-out years after every epoch.
    eval_loss, eval_r2_score = evaluate_weighted(model, eval_dataloader, device=device)
    eval_losses.append(eval_loss)
    eval_r2_scores.append(eval_r2_score)

    # Per-epoch report.
    print(f"Epoch: {epoch}")
    print(f"Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}")
    print(f"Train R^2: {train_r2_score:.4f}, Eval R^2: {eval_r2_score:.4f}")

Epoch: 0
Train Loss: 8.8429, Eval Loss: 8.5022
Train R^2: -0.0357, Eval R^2: 0.0100
Epoch: 1
Train Loss: 7.0957, Eval Loss: 12.6887
Train R^2: 0.1660, Eval R^2: -0.4529


KeyboardInterrupt: 

In [None]:
# Loss curves: one line per split, indexed by epoch.
for split_name, split_losses in (("train", train_losses), ("eval", eval_losses)):
    plt.plot(split_losses, label=split_name)
plt.title('loss over epochs')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
# R^2 curves: one line per split, indexed by epoch.
for split_name, split_scores in (("train", train_r2_scores), ("eval", eval_r2_scores)):
    plt.plot(split_scores, label=split_name)
plt.title('r2 scores over epochs')
plt.xlabel('epoch')
plt.ylabel('r2 score')
plt.legend()
plt.show()