In [489]:
import os
import warnings

import pandas as pd
import numpy as np


from jre_utils.datapath import (
    DATA_DIRECTORY_PATH,
)
from jre_utils.config import asset_types
from jp_prefecture.jp_cities import JpCity, jp_cities
from geopy.geocoders import Nominatim

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and
# chained-assignment warnings raised by later cells - consider narrowing it.
warnings.filterwarnings("ignore")
# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [443]:
def area_to_int(area):
    """Convert a raw area value to an integer.

    The two open-ended labels ("... or greater.") are replaced by the number
    they mention; every other value is handed to ``int``.
    """
    open_ended_labels = (
        ("2,000 m^2 or greater.", 2000),
        ("5000 m^2 or greater.", 5000),
    )
    for label, lower_bound in open_ended_labels:
        if area == label:
            return lower_bound
    return int(area)


def map_time_units(x):
    """Convert a time-to-station label to a number of minutes.

    Purely numeric strings are returned as ``int``; the bucket labels are
    replaced by a fixed representative value. Any other label raises KeyError.
    """
    if x.isdigit():
        return int(x)

    bucket_minutes = {
        "30-60minutes": 45,
        "1H-1H30": 75,
        "1H30-2H": 105,
        "2H-": 135,
    }
    return bucket_minutes[x]

def map_layout(x):
    """Collapse a raw layout label into a small set of layout categories.

    Rules (applied to the part of the label before the first "+"):
      * "na" is returned unchanged.
      * 1-room layouts: "1K", "1LDK" and "1DK" are kept, anything else
        becomes "1other".
      * 2-room layouts: "2LDK" and "2DK" are kept, anything else becomes
        "2other".
      * 3 or more rooms: "<n>LDK" with n capped at 5.
      * Everything else (no leading digit, leading "0", empty label)
        becomes "other".

    Only the first character is read as the room count.
    """
    if x == "na":
        return "na"

    # Ignore anything after "+", e.g. "2LDK+S" -> "2LDK".
    base = x.split("+")[0]

    # An empty base ("" or "+S") used to raise IndexError on base[0]; treat it
    # like any other label that does not start with a room count.
    if not base or not base[0].isdigit():
        return "other"

    rooms = int(base[0])

    if rooms == 1:
        return base if base in ("1K", "1LDK", "1DK") else "1other"

    if rooms == 2:
        return base if base in ("2LDK", "2DK") else "2other"

    if rooms > 2:
        return f"{min(rooms, 5)}LDK"

    # Leading "0".
    return "other"

def map_land_shape(x):
    """Lower-case a land-shape label and strip every "semi-" occurrence."""
    return x.lower().replace("semi-", "")

def map_frontage(x):
    """Convert a frontage label to an integer.

    "na" becomes 0, the open-ended "50.0m or longer." label becomes 55, and
    any other value is truncated at its first "." and parsed as ``int``.
    """
    if x == "na":
        metres = 0
    elif x == "50.0m or longer.":
        metres = 55
    else:
        integer_part, _sep, _fraction = x.partition(".")
        metres = int(integer_part)
    return metres

def map_floor_area(x):
    """Convert a floor-area label to an integer.

    Labels containing "less" map to 10, labels containing "greater" map to
    2000 ("less" is checked first); anything else is parsed with ``int``.
    """
    if "less" in x:
        return 10
    if "greater" in x:
        return 2000
    return int(x)
    
def map_year_of_construction(x):
    """Return the construction year as ``int``; "before the war" maps to 1930."""
    return 1930 if x == "before the war" else int(x)

def map_building_structure(x):
    """Return "combo" for comma-separated (multi-structure) labels, else the label itself."""
    return "combo" if "," in x else x
    
def map_combined_use(x):
    """Reduce a (possibly comma-separated) use label to a single category.

    The first of the priority keywords found anywhere in the label wins;
    if none is present, the text before the first comma is returned.
    """
    priority_uses = ("Factory", "Warehouse", "Parking Lot", "Office", "Housing Complex")
    for use in priority_uses:
        if use in x:
            return use
    return x.split(",")[0]

In [499]:
import json
from pprint import pprint

# Lookup table consulted by get_city_code.
sub_city_to_city_path = f"{DATA_DIRECTORY_PATH}/core_scraped/sub_city_to_city.json"
with open(sub_city_to_city_path) as fd:
    sub_city_to_city = json.load(fd)
pprint(f"E.g. Maps 1101 to {sub_city_to_city['1101']}")

# Lookup table consulted by get_area_from_area_code.
area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
with open(area_code_to_area_path) as fd:
    area_code_to_area = json.load(fd)
pprint(f"E.g. Maps 1100 to {area_code_to_area['1100']}")

def get_city_code(area_code):
    """Look ``area_code`` up in ``sub_city_to_city``; codes without an entry are returned unchanged."""
    if area_code in sub_city_to_city:
        return sub_city_to_city[area_code]
    return area_code

def get_area_from_area_code(area_code):
    """Look ``area_code`` up in ``area_code_to_area``; return "na" when there is no entry."""
    if area_code in area_code_to_area:
        return area_code_to_area[area_code]
    return "na"

def get_city_geocode(area_code):
    """Return the coordinates of a city as a 2-tuple, or ``(nan, nan)`` on failure.

    The code is converted to ``str`` and passed to
    ``jp_cities.citycode2geodetic``. Any ordinary exception raised by the
    lookup is reported with ``print`` and replaced by a NaN pair, so a single
    unknown code does not abort a whole ``DataFrame.apply``.

    NOTE(review): the order of the returned pair is whatever
    ``citycode2geodetic`` yields. The other lookups in this notebook return
    (longitude, latitude) - confirm this one matches.
    """
    area_code = str(area_code)
    try:
        return tuple(jp_cities.citycode2geodetic(area_code))
    except Exception:
        # `except Exception` instead of a bare `except`, so KeyboardInterrupt
        # and SystemExit still propagate. `np.nan` instead of `np.NaN`: the
        # latter alias was removed in NumPy 2.0.
        print(f"Could not find geocode for {area_code}")
        return np.nan, np.nan
    
def find_town_jp(all_towns_df, city_code, town_name, log=False):
    """Look a town up in the towns table and return its mean (longitude, latitude).

    Parameters
    ----------
    all_towns_df : DataFrame with at least the columns ``cityCode``,
        ``townAlphabet``, ``longitude`` and ``latitude``.
    city_code : value convertible with ``int``; compared to ``cityCode``.
    town_name : str; matched as a literal substring of ``townAlphabet``.
        An empty string matches every town of the city.
    log : when true, print a message if nothing matches.

    Returns
    -------
    ``(longitude, latitude)`` averaged over all matching rows, or
    ``(None, None)`` when no row matches.
    """
    city_df = all_towns_df[all_towns_df["cityCode"] == int(city_code)]

    # regex=False: town names are plain text, so characters such as "(" or "."
    # must not be interpreted as a pattern. na=False: rows with a missing name
    # simply do not match instead of poisoning the boolean mask.
    name_matches = city_df["townAlphabet"].str.contains(town_name, regex=False, na=False)
    town_df = city_df[name_matches]

    if town_df.empty:
        if log:
            print(f"JP could not find {town_name} in {city_code}")
        return None, None

    return town_df["longitude"].mean(), town_df["latitude"].mean()

def find_town_geopy(geolocator, address, log=False):
    """Geocode ``address`` with ``geolocator`` and return (longitude, latitude).

    Returns ``(None, None)`` when the geocoder gives back nothing, optionally
    printing a message when ``log`` is true.
    """
    location_info = geolocator.geocode(address)

    if location_info:
        return location_info.longitude, location_info.latitude

    if log:
        print(f"Geopy could not find {address}")
    return None, None

def get_town_coordinates_jp(all_towns_df, city_name, city_code, town_name, log=False):
    """Resolve a town to coordinates, trying progressively slower sources.

    1. ``find_town_jp`` on the local towns table (fast).
    2. ``find_town_geopy`` with a Nominatim geolocator for
       "<town_name>, <city_name>, Japan" (slow, network).
    3. ``get_city_geocode(city_code)`` as a last resort.

    Steps 1 and 2 return (longitude, latitude).
    NOTE(review): step 3 returns whatever order ``get_city_geocode`` yields -
    confirm it is also (longitude, latitude).

    A coordinate counts as missing when it is ``None`` or NaN. The previous
    truthiness test let NaN through as a "found" value.
    """
    # Try to locate in df - this is fast
    jp_lon, jp_lat = find_town_jp(all_towns_df, city_code, town_name, log)
    if not (pd.isna(jp_lon) or pd.isna(jp_lat)):
        return jp_lon, jp_lat

    # Fall back and try to locate with geopy - this is slow.
    # The geolocator is only created when the fast lookup has failed.
    geolocator = Nominatim(user_agent="my_app")
    address = f"{town_name}, {city_name}, Japan"
    geopy_lon, geopy_lat = find_town_geopy(geolocator, address, log)
    if not (pd.isna(geopy_lon) or pd.isna(geopy_lat)):
        return geopy_lon, geopy_lat

    # If nothing works, just return the city coordinates
    return get_city_geocode(city_code)



In [445]:
prefecture_code = 13
trade_prices_data_path = f"{DATA_DIRECTORY_PATH}/core"

# The list is indexed positionally (prefecture_code - 1), which relies on the
# sorted file names lining up with the prefecture codes. Keep only CSV files so
# that a stray entry such as ".DS_Store" cannot shift every index.
trade_prices_data_paths = [
    f"{trade_prices_data_path}/{filename}"
    for filename in sorted(os.listdir(trade_prices_data_path))
    if filename.endswith(".csv")
]
trade_prices_data_paths[prefecture_code - 1]

'../../data/core/13_Tokyo_20053_20233.csv'

In [765]:
# Load the raw transactions of the selected prefecture.
# NOTE(review): the "F" inside names such as "Nearest stationFName" presumably
# comes from how this encoding decodes the raw header - the rename map below
# depends on it, so confirm before changing the encoding.
df = pd.read_csv(
    trade_prices_data_paths[prefecture_code - 1],
    encoding="unicode_escape",
    index_col="No",
)
df["area_code"] = df["City,Town,Ward,Village code"].astype(str)

# we may want to skip the following step in the future
df["area_code"] = df["area_code"].apply(get_city_code).astype(str)
df["area"] = df["area_code"].apply(get_area_from_area_code)

# Prices and areas
df["trade_price"] = df["Transaction-price(total)"]
df["trade_area"] = df["Area(m^2)"].apply(area_to_int)
df["unit_price"] = df["Transaction-price(Unit price m^2)"]
df["trade_price_per_area"] = df["trade_price"] / df["trade_area"]

# "Transaction period" is split on spaces: first character of the first token
# is the quarter, the third token is the year.
df["quarter"] = df["Transaction period"].apply(lambda x: int(x.split(" ")[0][0]))
df["year"] = df["Transaction period"].apply(lambda x: int(x.split(" ")[2]))

# Fall back to total price / area when no unit price is reported.
df["unit_price"] = np.where(
    df["unit_price"].isna(),
    df["trade_price_per_area"],
    df["unit_price"],
)

# Keep only the asset types handled by this notebook. `.copy()` makes the
# filtered frame independent, so the column assignments below never write
# into a view of the unfiltered frame.
df = df[
    df["Type"].isin(
        [
            asset_types["building"]["label"],
            asset_types["land"]["label"],
            asset_types["condo"]["label"],
        ],
    )
].copy()

# Renaming

df = df.rename(columns = {
    "Type": "asset_type",
    "Region": "neighbourhood_classification",
    "Area": "subarea",
    "Nearest stationFName": "nearest_station",
    "Nearest stationFDistance(minute)": "time_to_nearest_station",
    "Layout": "layout",
    "Land shape": "land_shape",
    "Frontage": "frontage",
    "Total floor area(m^2)": "total_floor_area",
    "Year of construction": "year_of_construction",
    "Building structure": "building_structure",
    "Use": "use",
    "Purpose of Use": "purpose",
    "Frontage roadFDirection": "frontage_road_direction",
    "Frontage roadFClassification": "frontage_road_classification",
    "Frontage roadFBreadth(m)": "frontage_road_breadth",
    "City Planning": "zone",
    "Maximus Building Coverage Ratio(%)": "max_building_coverage_ratio",
    "Maximus Floor-area Ratio(%)": "max_floor_area_ratio",
    "Renovation": "renovation_status",
})

# Process factors: missing values are filled with a placeholder first, then
# each column is normalised by its mapper.
df["subarea"] = df["subarea"].fillna("")
df["neighbourhood_classification"] = df["neighbourhood_classification"].fillna("na")
df["nearest_station"] = df["nearest_station"].fillna("na")
df["time_to_nearest_station"] = df["time_to_nearest_station"].fillna("30-60minutes").apply(map_time_units)
df["layout"] = df["layout"].fillna("na").apply(map_layout)
df["land_shape"] = df["land_shape"].fillna("na").map(map_land_shape)
df["frontage"] = df["frontage"].fillna("na").apply(map_frontage)

# Missing floor area falls back to the traded area (as a string, because
# map_floor_area expects string input).
df["total_floor_area"] = np.where(
    df["total_floor_area"].isna(),
    df["trade_area"].astype(str),
    df["total_floor_area"],
)

df["total_floor_area"] = df["total_floor_area"].apply(map_floor_area)


# Missing construction year is imputed as 30 years before the transaction.
df["year_of_construction"] = np.where(
    df["year_of_construction"].isna(),
    (df["year"] - 30).astype(str),
    df["year_of_construction"],
)

df["year_of_construction"] = df["year_of_construction"].apply(map_year_of_construction)
df["age"] = df["year"] - df["year_of_construction"]
df["building_structure"] = df["building_structure"].fillna("na").map(map_building_structure)
df["frontage_road_direction"] = df["frontage_road_direction"].fillna("na")
df["frontage_road_classification"] = df["frontage_road_classification"].fillna("na")
df["frontage_road_breadth"] = df["frontage_road_breadth"].fillna("0.0").astype(float)
df["zone"] = df["zone"].fillna("na")
df["max_building_coverage_ratio"] = df["max_building_coverage_ratio"].fillna(0)
df["max_floor_area_ratio"] = df["max_floor_area_ratio"].fillna(0)
df["renovation_status"] = df["renovation_status"].fillna("na")

# "purpose" takes precedence over "use" when both are present.
df["combined_use"] = np.where(
    df["purpose"].isna(),
    df["use"],
    df["purpose"],
)
df["combined_use"] = df["combined_use"].fillna("na").apply(map_combined_use)

# Drop raw inputs and intermediates that have been replaced by derived columns.
df = df.drop(columns = [
    "City,Town,Ward,Village code",
    "City,Town,Ward,Village",
    "Transaction-price(total)",
    "Area(m^2)",
    "Transaction-price(Unit price m^2)",
    "trade_price_per_area",
    "Transaction period",
    "Prefecture",
    "Transactional factors",
    "year_of_construction",
    "use",
    "purpose"
])


In [767]:
# Advanced preprocessing

# Convert towns into Longitude and Latitude
jp_towns = JpCity(enable_town=True)
towns_df = jp_towns.towns
towns_df["townAlphabet"] = towns_df["townAlphabet"].fillna("")

# Geocode each distinct (area, area_code, subarea) combination once, then
# broadcast the result back to every transaction with a left merge.
towns_list_df = df[["area", "area_code", "subarea"]].drop_duplicates()
towns_list_df[["long", "lat"]] = towns_list_df.apply(
    lambda x: pd.Series(get_town_coordinates_jp(towns_df, x["area"], x["area_code"], x["subarea"])),
    axis=1,
)

# Remove coordinates left over from a previous run of this cell; otherwise the
# merge would produce suffixed long_x / long_y (lat_x / lat_y) columns.
df = df.drop(columns=["long", "lat"], errors="ignore").merge(
    towns_list_df, on=["area", "area_code", "subarea"], how="left"
)


In [537]:
# For each asset type, for each area_code
# 1. Run clustering algorithm and get cluster code for each transaction
# 2. Run regression to identify weights for each year

In [1247]:
# Price-derived feature(s) fed into the model.
# "unit_price_log_normalized" is currently disabled.
yearly_normalize_columns = ["unit_price_log"]

# Numerical features.
numerical_columns = [
    "long", "lat",
    "time_to_nearest_station",
    "total_floor_area", "trade_area",
    "age",
    "frontage", "frontage_road_breadth",
    "max_building_coverage_ratio", "max_floor_area_ratio",
]

# Categorical features that get one-hot encoded.
# Currently disabled: "land_shape", "frontage_road_direction",
# "frontage_road_classification".
categorical_columns = [
    "neighbourhood_classification",
    "zone",
    "renovation_status",
    "combined_use",
    "layout",
    "building_structure",
]

id_columns = ["year"]
metric_columns = ["unit_price"]

# Full feature list, in the order: price feature(s), numerical, categorical.
columns = [*yearly_normalize_columns, *numerical_columns, *categorical_columns]


In [1248]:
# One frame per asset type, selected by the label configured in asset_types.
land_df, building_df, condo_df = (
    df[df["asset_type"] == asset_types[asset_key]["label"]]
    for asset_key in ("land", "building", "condo")
)

In [1249]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# for area_code X, you have n clusters

# Work on the land transactions of a single area code.
area_df = land_df[land_df["area_code"] == "13101"].reset_index(drop=True)

# log(x + 1) transform of every numerical feature
for column in numerical_columns:
    area_df[column] = np.log(area_df[column] + 1)

# Scale numerical variables
scaler = StandardScaler()
# scaler = MinMaxScaler()
area_df[numerical_columns] = scaler.fit_transform(area_df[numerical_columns])


# Log price, plus a per-year z-score of the log price
for column in metric_columns:
    area_df[f"{column}_log"] = np.log(area_df[column] + 1)
    area_df[f"{column}_log_normalized"] = area_df.groupby("year")[f"{column}_log"].transform(
        lambda x: (x - x.mean()) / x.std()
    )

# Categorical variables
area_df = pd.get_dummies(area_df[columns + id_columns + metric_columns], columns=categorical_columns)

# Drop only the dummy columns of the "na" placeholder level
# ("<categorical column>_na"). The previous substring test `"na" in col` would
# also drop any column whose name merely contains the letters "na".
na_dummy_columns = {f"{category}_na" for category in categorical_columns}
drop_columns = [col for col in area_df.columns if col in na_dummy_columns]
area_df = area_df.drop(columns=drop_columns)
area_df

Unnamed: 0,unit_price_log,long,lat,time_to_nearest_station,total_floor_area,trade_area,age,frontage,frontage_road_breadth,max_building_coverage_ratio,max_floor_area_ratio,year,unit_price,neighbourhood_classification_Commercial Area,neighbourhood_classification_Residential Area,zone_Category I Residential Zone,zone_Category II Residential Zone,zone_Commercial Zone,combined_use_House,combined_use_Office,combined_use_Other,combined_use_Shop
0,15.687313,0.112834,-0.112281,0.439975,0.718715,0.718715,0.0,-3.111046,0.368334,0.180546,0.209849,2023,6500000.0,1,0,0,0,1,1,0,0,0
1,15.404746,0.112834,-0.112281,0.439975,0.349914,0.349914,0.0,-0.764488,0.368334,0.180546,0.209849,2023,4900000.0,1,0,0,0,1,1,0,0,0
2,15.607270,0.112834,-0.112281,0.439975,-0.688347,-0.688347,0.0,-0.273912,0.382125,0.180546,-0.241017,2023,6000000.0,1,0,0,0,1,1,0,0,0
3,14.845130,0.112488,-0.114569,0.439975,1.290053,1.290053,0.0,0.628646,1.050334,-0.902801,-0.792607,2019,2800000.0,0,1,0,1,0,0,0,1,0
4,15.201805,0.114170,-0.113394,-0.070255,-0.252093,-0.252093,0.0,-0.079224,2.233339,0.180546,0.591160,2021,4000000.0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,14.690980,0.112654,-0.112641,0.439975,1.412401,1.412401,0.0,1.639253,-0.286084,-0.902801,-0.792607,2006,2400000.0,0,1,0,1,0,0,0,0,0
311,13.795309,0.112654,-0.112641,0.856863,0.236049,0.236049,0.0,-1.509271,-1.234288,-0.902801,-0.792607,2006,980000.0,1,0,0,1,0,0,0,0,0
312,14.508658,0.113347,-0.113761,-0.728055,0.770241,0.770241,0.0,0.931384,-0.760186,0.180546,-0.241017,2007,2000000.0,1,0,0,0,1,0,0,0,0
313,11.918397,0.112106,-0.114543,1.209337,-0.688347,-0.688347,0.0,1.019774,-3.713018,-0.902801,-0.792607,2005,150000.0,0,1,1,0,0,0,0,0,0


In [1252]:
# Feature matrix for PCA / clustering: everything except ids and raw metrics.
# The cluster label columns added by later cells are excluded too, so that
# re-running this cell after clustering does not feed the labels back in.
# NOTE(review): X still contains "unit_price_log", so the clusters are partly
# driven by price - confirm that this is intended.
cluster_label_columns = ["dbscan_cluster", "kmeans_cluster", "hdbscan_cluster"]
X = area_df.drop(columns=id_columns + metric_columns).drop(
    columns=cluster_label_columns, errors="ignore"
)

In [1239]:
from sklearn.decomposition import PCA

# Dimensionality reduction: keep as many principal components as are needed
# to explain 95% of the variance of X.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

# Report how many components were kept
print(f"Number of components retained by PCA: {pca.n_components_}")

Number of components retained by PCA: 1


In [1240]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the PCA-reduced features
dbscan = DBSCAN(eps=0.5, min_samples=5)
clusters = dbscan.fit_predict(X_pca)

# Store the label of each transaction next to its features
area_df["dbscan_cluster"] = clusters

# Cluster sizes, largest first
dbscan_cluster_sizes = area_df["dbscan_cluster"].value_counts()
print(dbscan_cluster_sizes)

-1     131
 19     18
 14     17
 16     13
 13     12
 6      12
 4      11
 17      9
 10      9
 8       8
 7       8
 5       8
 0       7
 11      7
 1       7
 15      6
 2       6
 18      6
 9       5
 12      5
 3       5
 20      5
Name: dbscan_cluster, dtype: int64


In [1241]:
from sklearn.cluster import KMeans

# K-means clustering.
# n_init is pinned because its default changed between scikit-learn releases
# (10 restarts -> "auto"). Without it the same notebook produces different
# clusters depending on the installed version, even with a fixed random_state.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(X_pca)

# Add K-means cluster labels to the DataFrame
area_df['kmeans_cluster'] = kmeans_labels

# Print the resulting clusters for K-means
print("K-means Clustering Results:")
print(area_df['kmeans_cluster'].value_counts())


K-means Clustering Results:
0    133
3     92
2     53
4     22
1     15
Name: kmeans_cluster, dtype: int64


In [1242]:
from hdbscan import HDBSCAN

# Hierarchical density-based clustering of the PCA-reduced features
hdbscan = HDBSCAN(min_cluster_size=5, min_samples=5)
hdbscan_labels = hdbscan.fit_predict(X_pca)

# Store the label of each transaction next to its features
area_df["hdbscan_cluster"] = hdbscan_labels

# Cluster sizes, largest first
print("HDBSCAN Clustering Results:")
hdbscan_cluster_sizes = area_df["hdbscan_cluster"].value_counts()
print(hdbscan_cluster_sizes)

HDBSCAN Clustering Results:
-1     52
 22    22
 9     18
 10    17
 3     15
 23    14
 1     13
 17    13
 5     12
 8     12
 14    12
 6     11
 0     10
 20    10
 11     9
 2      9
 13     9
 15     8
 21     8
 16     8
 7      7
 18     7
 19     7
 4      6
 12     6
Name: hdbscan_cluster, dtype: int64


In [1243]:
# Inspect the members of one HDBSCAN cluster (values rounded for display)
area_df[area_df["hdbscan_cluster"].eq(7)].round(3)


Unnamed: 0,long,lat,time_to_nearest_station,trade_area,age,frontage,frontage_road_breadth,max_building_coverage_ratio,max_floor_area_ratio,year,unit_price,neighbourhood_classification_Commercial Area,neighbourhood_classification_Residential Area,zone_Category I Residential Zone,zone_Category II Residential Zone,zone_Commercial Zone,combined_use_House,combined_use_Office,combined_use_Other,combined_use_Shop,dbscan_cluster,kmeans_cluster,hdbscan_cluster
26,0.113,-0.113,-0.728,0.11,0.0,0.385,-0.506,0.181,-0.241,2019,3000000.0,1,0,0,0,1,0,1,0,0,11,2,7
106,0.114,-0.114,-0.07,1.078,0.0,1.182,0.239,0.181,0.21,2014,3000000.0,1,0,0,0,1,0,0,0,0,11,2,7
157,0.114,-0.113,-0.728,-0.688,0.0,-0.079,0.382,0.181,0.21,2016,3000000.0,1,0,0,0,1,1,0,0,0,11,2,7
177,0.113,-0.114,1.209,-0.88,0.0,-0.764,0.382,0.181,-0.241,2015,3000000.0,1,0,0,0,1,0,0,0,0,11,2,7
212,0.113,-0.112,-0.728,-0.688,0.0,-0.274,0.382,0.181,-0.241,2013,3000000.0,1,0,0,0,1,0,0,0,0,11,2,7
284,0.113,-0.113,-1.655,-2.338,0.0,-0.499,1.182,0.181,0.591,2006,3000000.0,1,0,0,0,1,0,0,0,0,11,2,7
308,0.112,-0.116,0.44,0.42,0.0,-1.09,-0.148,-0.903,-0.793,2008,3000000.0,0,1,0,1,0,0,0,0,0,11,2,7


In [1244]:
import plotly.express as px

# One scatter plot per clustering method: cluster label against transaction
# year, coloured by the (scaled) latitude feature.
for cluster in ("hdbscan_cluster", "dbscan_cluster", "kmeans_cluster"):
    fig = px.scatter(
        area_df,
        x="year",
        y=cluster,
        color="lat",
        width=1000,
    )
    fig.show()

In [1245]:
# Annualized return per cluster and year.
#
# For every cluster: take the geometric mean unit price per observed year,
# compute the annualized return between consecutive observed years, and assign
# that return to every calendar year in the gap (start_year + 1 .. end_year).
cluster_column = "hdbscan_cluster"
unique_clusters = area_df[cluster_column].unique()

cluster_returns = []
for cluster in unique_clusters:
    cluster_df = area_df[area_df[cluster_column] == cluster]

    # Geometric mean of the unit price for each year (index is sorted by year)
    yearly_price = cluster_df.groupby("year")["unit_price"].apply(lambda x: np.exp(np.log(x).mean()))

    # A return needs at least two observed years; skipping here avoids
    # emitting a row without an annualized_return for single-year clusters.
    if len(yearly_price) < 2:
        continue

    returns_df = pd.DataFrame(
        index=range(yearly_price.index.min(), yearly_price.index.max() + 1),
        columns=["annualized_return"],
        dtype=float,
    )

    # Calculate annualized returns and fill in missing years
    for start_year, end_year in zip(yearly_price.index[:-1], yearly_price.index[1:]):
        start_price = yearly_price.loc[start_year]
        end_price = yearly_price.loc[end_year]
        years_diff = end_year - start_year
        annualized_return = (end_price / start_price) ** (1 / years_diff) - 1
        # .loc slices are inclusive on both ends, so this covers exactly the
        # years after start_year up to and including end_year.
        returns_df.loc[start_year + 1 : end_year, "annualized_return"] = annualized_return

    # The first observed year has no preceding price and therefore no return
    returns_df = returns_df.dropna()
    returns_df = returns_df.assign(cluster_code=cluster)
    returns_df = returns_df.reset_index().rename(columns={"index": "year"})
    cluster_returns.append(returns_df)

# Concatenate once instead of growing a DataFrame inside the loop
combined_cluster_df = (
    pd.concat(cluster_returns)
    if cluster_returns
    else pd.DataFrame(columns=["year", "annualized_return", "cluster_code"])
)

In [1246]:
from jre_utils.visualize import plot_time_series

# Plot the annualized return of each cluster over time
plot_time_series(
    combined_cluster_df,
    column="annualized_return",
    group_by_columns=["cluster_code", "year"],
    granularity_columns=["cluster_code"],
    title="Returns by cluster",
)

In [None]:
# Can we use future data when fitting the PCA used for clustering?

In [None]:
# Now that I have a dense vector - I can append my year dummies and compute the returns for each year + pca