In [69]:
import os
import warnings

import pandas as pd
import numpy as np

from tqdm import tqdm

from jre_utils.datapath import (
    DATA_DIRECTORY_PATH,
)
from jre_utils.config import asset_types

import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [109]:
def area_to_int(area):
    if area == "2,000 m^2 or greater.":
        return 2000
    elif area == "5000 m^2 or greater.":
        return 5000
    else:
        return int(area)


def map_time_units(x):
    mapping = {
        "30-60minutes": 45,
        "1H-1H30": 75,
        "1H30-2H": 105,
        "2H-": 135,
    }

    return int(x) if x.isdigit() else mapping[x]


def map_layout(x):
    if x == "na":
        return "na"

    x = x.split("+")[0]

    if x[0].isdigit() and int(x[0]) == 1:
        if x != "1K" and x != "1LDK" and x != "1DK":
            return "1other"
        return x

    if x[0].isdigit() and int(x[0]) == 2:
        if x != "2LDK" and x != "2DK":
            return "2other"
        return x

    if x[0].isdigit() and int(x[0]) > 2:
        return f"{min(int(x[0]), 5)}LDK"

    return "other"


def map_land_shape(x):
    x = x.lower()
    x = x.replace("semi-", "")
    return x


def map_frontage(x):
    x = str(x)
    if x == "na":
        return 0
    if x == "50.0m or longer.":
        return 55
    return int(x.split(".")[0])


def map_floor_area(x):
    x = str(x)
    if "less" in x:
        return 10
    elif "greater" in x:
        return 2000
    else:
        return int(x.split(".")[0])


def map_year_of_construction(x):
    if x == "before the war":
        return 1930

    return int(x)

In [110]:
import json
from pprint import pprint

subarea_coords_simple_path = f"{DATA_DIRECTORY_PATH}/coordinates/subarea_simple.csv"
coordinates_df = pd.read_csv(subarea_coords_simple_path)
coordinates_df["area_code"] = coordinates_df["area_code"].astype(str)

sub_city_to_city_path = f"{DATA_DIRECTORY_PATH}/core_scraped/sub_city_to_city.json"
with open(sub_city_to_city_path) as fd:
    sub_city_to_city = json.load(fd)
    pprint(f"E.g. Maps 1101 to {sub_city_to_city['1101']}")

area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
with open(area_code_to_area_path) as fd:
    area_code_to_area = json.load(fd)
    pprint(f"E.g. Maps 1100 to {area_code_to_area['1100']}")


def get_city_code(area_code):
    return sub_city_to_city.get(area_code, area_code)


def get_area_from_area_code(area_code):
    return area_code_to_area.get(area_code, "na")


'E.g. Maps 1101 to 1100'
'E.g. Maps 1100 to Hokkaido Sapporo-shi'


In [111]:
coordinates_df

Unnamed: 0,area,area_code,subarea,long,lat
0,Wakayama-ken Wakayama-shi,30201,Aoicho,135.163085,34.213480
1,Wakayama-ken Wakayama-shi,30201,Akizuki,135.203712,34.230374
2,Wakayama-ken Wakayama-shi,30201,Akibacho,135.171405,34.200466
3,Wakayama-ken Wakayama-shi,30201,Asahi,135.218759,34.192106
4,Wakayama-ken Wakayama-shi,30201,Azushima,135.229079,34.252869
...,...,...,...,...,...
99081,Tokushima-ken Higashimiyoshi-cho,36489,Keta,133.986142,34.008828
99082,Tokushima-ken Higashimiyoshi-cho,36489,Nakasho,133.972373,33.988013
99083,Tokushima-ken Higashimiyoshi-cho,36489,Nishisho,133.940244,34.008941
99084,Tokushima-ken Higashimiyoshi-cho,36489,Higashiyama,133.855753,34.078389


In [112]:
def process_transactions_df(df):
    df["area_code"] = df["City,Town,Ward,Village code"].astype(str)

    # we may want to skip the following step in the future
    df["area_code"] = df["area_code"].apply(get_city_code).astype(str)
    df["area"] = df["area_code"].apply(get_area_from_area_code)

    df["trade_price"] = df["Transaction-price(total)"]
    df["trade_area"] = df["Area(m^2)"].apply(area_to_int)
    df["unit_price"] = df["Transaction-price(Unit price m^2)"]
    df["trade_price_per_area"] = df["trade_price"] / df["trade_area"]

    df["quarter"] = df["Transaction period"].apply(lambda x: int(x.split(" ")[0][0]))
    df["year"] = df["Transaction period"].apply(lambda x: int(x.split(" ")[2]))

    df["unit_price"] = np.where(
        df["unit_price"].isna(),
        df["trade_price_per_area"],
        df["unit_price"],
    )

    df = df[
        df["Type"].isin(
            [
                asset_types["building"]["label"],
                asset_types["land"]["label"],
                asset_types["condo"]["label"],
            ],
        )
    ]

    # Renaming
    df = df.rename(
        columns={
            "Type": "asset_type",
            "Region": "neighbourhood_classification",
            "Area": "subarea",
            "Nearest stationFName": "nearest_station",
            "Nearest stationFDistance(minute)": "time_to_nearest_station",
            "Layout": "layout",
            "Land shape": "land_shape",
            "Frontage": "frontage",
            "Total floor area(m^2)": "total_floor_area",
            "Year of construction": "year_of_construction",
            "Building structure": "building_structure",
            "Use": "use",
            "Purpose of Use": "purpose",
            "Frontage roadFDirection": "frontage_road_direction",
            "Frontage roadFClassification": "frontage_road_classification",
            "Frontage roadFBreadth(m)": "frontage_road_breadth",
            "City Planning": "zone",
            "Maximus Building Coverage Ratio(%)": "max_building_coverage_ratio",
            "Maximus Floor-area Ratio(%)": "max_floor_area_ratio",
            "Renovation": "renovation_status",
            "Transactional factors": "transactional_factors",  # need to format and include
        }
    )

    # Process factors
    df["subarea"] = df["subarea"].fillna("")
    df["neighbourhood_classification"] = df["neighbourhood_classification"].fillna("na")
    df["nearest_station"] = df["nearest_station"].fillna("na")
    df["time_to_nearest_station"] = (
        df["time_to_nearest_station"].fillna("30-60minutes").apply(map_time_units)
    )
    df["layout"] = df["layout"].fillna("na").apply(map_layout)
    df["land_shape"] = df["land_shape"].fillna("na").map(map_land_shape)
    df["frontage"] = df["frontage"].fillna("na").apply(map_frontage)

    df["total_floor_area"] = np.where(
        df["total_floor_area"].isna(),
        df["trade_area"].astype(str),
        df["total_floor_area"],
    )

    df["total_floor_area"] = df["total_floor_area"].apply(map_floor_area)

    df["year_of_construction"] = np.where(
        df["year_of_construction"].isna(),
        (df["year"] - 30).astype(str),
        df["year_of_construction"],
    )

    df["year_of_construction"] = df["year_of_construction"].apply(
        map_year_of_construction
    )
    df["age"] = (df["year"] - df["year_of_construction"]).apply(lambda x: max(x, 0))

    df["building_structure"] = df["building_structure"].fillna("na")
    df["frontage_road_direction"] = df["frontage_road_direction"].fillna("na")
    df["frontage_road_classification"] = df["frontage_road_classification"].fillna("na")
    df["frontage_road_breadth"] = (
        df["frontage_road_breadth"].fillna("0.0").astype(float)
    )
    df["zone"] = df["zone"].fillna("na")
    df["max_building_coverage_ratio"] = df["max_building_coverage_ratio"].fillna(0)
    df["max_floor_area_ratio"] = df["max_floor_area_ratio"].fillna(0)
    df["renovation_status"] = df["renovation_status"].fillna("na")

    df["use"] = df["use"].fillna("na")
    df["purpose"] = df["purpose"].fillna("na")
    df["transactional_factors"] = df["transactional_factors"].fillna("na")

    df = df.drop(
        columns=[
            "City,Town,Ward,Village code",
            "City,Town,Ward,Village",
            "Transaction-price(total)",
            "Area(m^2)",
            "Transaction-price(Unit price m^2)",
            "trade_price_per_area",
            "Transaction period",
            "Prefecture",
        ]
    )

    # Convert towns into Longitude and Latitude
    df = df.merge(coordinates_df, on=["area", "area_code", "subarea"], how="left")

    return df

In [113]:
def is_contiguous(arr):
    sorted_arr = sorted(arr)
    return all((y2 - y1) == 1 for y1, y2 in zip(sorted_arr, sorted_arr[1:]))

def is_up_to_date(years, end_year):
    return max(years) == end_year

In [114]:
id_columns = ["year"]
metric_columns = ["unit_price_log"]

id_columns = ["year"]
metric_columns = ["unit_price_log"]

# land_df.info()

land_columns = {
    "numerical_columns": [
        "long",
        "lat",
        "time_to_nearest_station",
        "trade_area",
        "frontage",
        "frontage_road_breadth",
        "max_building_coverage_ratio",
        "max_floor_area_ratio",
    ],
    "categorical_columns": [
        "neighbourhood_classification",
        "quarter",
        "zone",
        "purpose",
        "land_shape",
        "frontage_road_direction",
        "frontage_road_classification",
    ],
    "comma_separated_categorical_columns": [
        "transactional_factors",
    ],
}

# building_df.info()

building_columns = {
    "numerical_columns": [
        "long",
        "lat",
        "time_to_nearest_station",
        "total_floor_area",
        "trade_area",
        "age",
        "frontage",
        "frontage_road_breadth",
        "max_building_coverage_ratio",
        "max_floor_area_ratio",
    ],
    "categorical_columns": [
        "neighbourhood_classification",
        "quarter",
        "zone",
        "purpose",
        "land_shape",
        "frontage_road_direction",
        "frontage_road_classification",
    ],
    "comma_separated_categorical_columns": [
        "use",
        "transactional_factors",
        "building_structure",
    ],
}

# condo_df.info()

condo_columns = {
    "numerical_columns": [
        "long",
        "lat",
        "time_to_nearest_station",
        "trade_area",
        "age",
        "max_building_coverage_ratio",
        "max_floor_area_ratio",
    ],
    "categorical_columns": [
        "neighbourhood_classification",
        "quarter",
        "zone",
        "purpose",
        "renovation_status",
    ],
    "comma_separated_categorical_columns": [
        "use",
        "transactional_factors",
        "building_structure",
    ],
}

columns_dicts = {
    "land": land_columns,
    "building": building_columns,
    "condo": condo_columns,
}

In [120]:
def get_price_index_from_regression(regression_results, prefix=""):
    yearly_results = regression_results.params.filter(like="year_")
    yearly_p_value = regression_results.pvalues.filter(like="year_")

    yearly_results.name = f"{prefix}price_index"
    yearly_p_value.name = f"{prefix}p_value"

    results_df = pd.concat([yearly_results, yearly_p_value], axis=1)
    results_df = results_df.reset_index().rename(columns={"index": "year"})
    results_df["year"] = results_df["year"].apply(lambda x: x.split("_")[1])
    return results_df


def calculate_price_index(area_df, columns_dict):
    numerical_columns = columns_dict["numerical_columns"]
    categorical_columns = columns_dict["categorical_columns"]
    comma_separated_categorical_columns = columns_dict[
        "comma_separated_categorical_columns"
    ]
    all_unprocessed_columns = (
        numerical_columns + categorical_columns + comma_separated_categorical_columns
    )

    area_df["year"] = area_df["year"].astype(str)
    area_df["quarter"] = area_df["quarter"].astype(str)
    area_df[f"unit_price_log"] = np.log(area_df["unit_price"] + 1)

    # Numerical variables
    scaler = StandardScaler()

    for column in numerical_columns:
        area_df[column] = area_df[column].fillna(area_df[column].mean())
        area_df[column] = np.log(area_df[column] + 1)

    area_df[numerical_columns] = scaler.fit_transform(area_df[numerical_columns])

    # Categorical variables
    area_df = pd.get_dummies(
        area_df[all_unprocessed_columns + id_columns + metric_columns],
        columns=categorical_columns,
    )

    # Comma Separated Categorical variables
    for column in comma_separated_categorical_columns:
        one_hot_df = (
            area_df[column]
            .str.replace(" ", "")
            .str.get_dummies(sep=",")
            .rename(columns=lambda x: column + x)
        )
        area_df = pd.concat([area_df, one_hot_df], axis=1)
        area_df = area_df.drop(columns=[column])

    drop_columns = [col for col in area_df.columns if "na" in col]
    area_df = area_df.drop(columns=drop_columns)

    # Model
    X_ord = area_df.drop(columns=id_columns + metric_columns)

    # Perform PCA for dimensionality reduction
    pca = PCA(n_components=0.90)  # Retain p% of the variance
    X_pca = pca.fit_transform(X_ord)

    # Expand years
    yearly_df = area_df[id_columns + metric_columns]
    yearly_df = pd.get_dummies(yearly_df, columns=id_columns)
    yearly_df = pd.concat(
        [
            yearly_df,
            pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(pca.n_components_)]),
        ],
        axis=1,
    )

    X = yearly_df.drop(columns=metric_columns)
    y = yearly_df[metric_columns[0]]

    # Add a constant to the model (the intercept)
    X = sm.add_constant(X)

    # Fit the OLS model
    ordinary_results = sm.OLS(y, X).fit()
    robust_results = sm.RLM(y, X).fit()

    # Compile results
    ordinary_results_df = get_price_index_from_regression(
        ordinary_results, prefix="ols_"
    ).sort_values("year")

    robust_results_df = get_price_index_from_regression(
        robust_results, prefix="robust_"
    ).sort_values("year")

    return ordinary_results_df, robust_results_df

In [121]:
trade_prices_data_path = f"{DATA_DIRECTORY_PATH}/core"

trade_prices_data_paths = [
    f"{trade_prices_data_path}/{filename}"
    for filename in os.listdir(trade_prices_data_path)
]

In [124]:
derived_dfs = {
    "land": pd.DataFrame(),
    "building": pd.DataFrame(),
    "condo": pd.DataFrame(),
}

final_year = 2022

pbar = tqdm(total=len(trade_prices_data_paths))

for path in trade_prices_data_paths:
    main_df = pd.read_csv(path, encoding="unicode_escape", index_col="No")
    main_df = process_transactions_df(main_df.copy())

    for asset_type, asset_type_props in asset_types.items():
        df = main_df[main_df["asset_type"] == asset_type_props["label"]]
        df = df[df["year"] <= final_year]
        prefecture_df = pd.DataFrame()

        for area_code in df["area_code"].unique():
            area_df = df[df["area_code"] == area_code].reset_index(drop=True)
            if is_contiguous(area_df["year"].unique()) and is_up_to_date(
                area_df["year"].unique(), final_year
            ):
                ordinary_results_df, robust_results_df = calculate_price_index(
                    area_df.copy(), columns_dicts[asset_type]
                )

                results_df = pd.concat([ordinary_results_df, robust_results_df], axis=1)
                results_df["area_code"] = area_code
                results_df["area"] = area_df["area"].iloc[0]

                prefecture_df = pd.concat([prefecture_df, results_df], axis=0)

        derived_dfs[asset_type] = pd.concat([derived_dfs[asset_type], prefecture_df])

    pbar.update()

pbar.close()

 96%|█████████▌| 45/47 [16:14<00:26, 13.26s/it]

ZeroDivisionError: float division by zero

In [125]:
pbar.close()

area_df.copy()

 96%|█████████▌| 45/47 [16:39<00:44, 22.21s/it]


Unnamed: 0,asset_type,neighbourhood_classification,subarea,nearest_station,time_to_nearest_station,layout,land_shape,frontage,total_floor_area,year_of_construction,building_structure,use,purpose,frontage_road_direction,frontage_road_classification,frontage_road_breadth,zone,max_building_coverage_ratio,max_floor_area_ratio,renovation_status,transactional_factors,area_code,area,trade_price,trade_area,unit_price,quarter,year,age,long,lat
0,Residential Land(Land and Building),Residential Area,Oaza Iojima,na,45,na,rectangular shaped,12,75,1988,W,House,na,East,Village Road,4.5,Outside City Planning Area,0.0,0.0,na,na,46303,Kagoshima-ken Mishima-mura,1800000,250,7200.0,2,2022,34,30.790266,130.29334
1,Residential Land(Land and Building),Residential Area,Oaza Iojima,na,45,na,irregular shaped,33,70,1991,W,House,na,Southwest,Village Road,4.0,Outside City Planning Area,0.0,0.0,na,na,46303,Kagoshima-ken Mishima-mura,2100000,260,8076.923077,4,2021,30,30.790266,130.29334


In [65]:
def calculate_price_index_test(area_df, columns_dict):
    numerical_columns = columns_dict["numerical_columns"]
    categorical_columns = columns_dict["categorical_columns"]
    comma_separated_categorical_columns = columns_dict[
        "comma_separated_categorical_columns"
    ]
    all_unprocessed_columns = (
        numerical_columns + categorical_columns + comma_separated_categorical_columns
    )

    area_df["year"] = area_df["year"].astype(str)
    area_df["quarter"] = area_df["quarter"].astype(str)
    area_df[f"unit_price_log"] = np.log(area_df["unit_price"] + 1)

    # Numerical variables
    scaler = StandardScaler()

    for column in numerical_columns:
        area_df[column] = np.log(area_df[column] + 1)

    area_df[numerical_columns] = scaler.fit_transform(area_df[numerical_columns])

    # Categorical variables
    area_df = pd.get_dummies(
        area_df[all_unprocessed_columns + id_columns + metric_columns],
        columns=categorical_columns,
    )

    # Comma Separated Categorical variables
    for column in comma_separated_categorical_columns:
        one_hot_df = (
            area_df[column]
            .str.replace(" ", "")
            .str.get_dummies(sep=",")
            .rename(columns=lambda x: column + x)
        )
        area_df = pd.concat([area_df, one_hot_df], axis=1)
        area_df = area_df.drop(columns=[column])

    drop_columns = [col for col in area_df.columns if "na" in col]
    area_df = area_df.drop(columns=drop_columns)

    # Model
    X_ord = area_df.drop(columns=id_columns + metric_columns)

    # Perform PCA for dimensionality reduction
    pca = PCA(n_components=0.90)  # Retain p% of the variance
    X_pca = pca.fit_transform(X_ord)

    # Expand years
    yearly_df = area_df[id_columns + metric_columns]

    yearly_df = pd.get_dummies(yearly_df, columns=id_columns)
    return yearly_df, X_pca

    yearly_df = pd.concat(
        [
            yearly_df,
            pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(pca.n_components_)]),
        ],
        axis=1,
    )


    X = yearly_df.drop(columns=metric_columns)
    y = yearly_df[metric_columns[0]]

    # Add a constant to the model (the intercept)
    X = sm.add_constant(X)

    # Fit the OLS model
    ordinary_results = sm.OLS(y, X).fit()
    robust_results = sm.RLM(y, X).fit()

    # Compile results
    ordinary_results_df = get_price_index_from_regression(
        ordinary_results, prefix="ols_"
    ).sort_values("year")

    robust_results_df = get_price_index_from_regression(
        robust_results, prefix="robust_"
    ).sort_values("year")

    return ordinary_results_df, robust_results_df

In [78]:
ydf, xpca = calculate_price_index_test(area_df.copy(), columns_dicts["land"])


In [67]:
xpca.shape

(6679, 20)

In [68]:
ydf.shape

(6679, 18)