In [1]:
import os
import warnings
import math
import weighted

import pandas as pd
import numpy as np
import statsmodels.api as sm

from tqdm import tqdm

from jre_utils.datapath import (
    DATA_DIRECTORY_PATH,
    get_derived_csv_path,
)
from jre_utils.config import asset_types

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [2]:
# Directory that holds the raw trade-price files.
trade_prices_data_path = f"{DATA_DIRECTORY_PATH}/core"

# Full path of every entry in that directory.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so the
# listing is sorted: files are selected from this list by position, and a
# stable order is what makes that selection reproducible across runs/machines.
trade_prices_data_paths = [
    f"{trade_prices_data_path}/{filename}"
    for filename in sorted(os.listdir(trade_prices_data_path))
]

In [15]:
# Load one raw trade-price file for exploration, using its "No" column as the index.
# NOTE(review): headers such as "Nearest stationFName" contain a stray "F" that
# looks like a mis-decoded multi-byte separator; the file may really be
# Shift-JIS/cp932 rather than something "unicode_escape" should read.
# Confirm the source encoding before relying on these column names.
df = pd.read_csv(
    trade_prices_data_paths[4],
    index_col="No",
    encoding="unicode_escape",
)

# Preview the first 50 rows.
df.head(50)

Unnamed: 0_level_0,Type,Region,"City,Town,Ward,Village code",Prefecture,"City,Town,Ward,Village",Area,Nearest stationFName,Nearest stationFDistance(minute),Transaction-price(total),Layout,Area(m^2),Transaction-price(Unit price m^2),Land shape,Frontage,Total floor area(m^2),Year of construction,Building structure,Use,Purpose of Use,Frontage roadFDirection,Frontage roadFClassification,Frontage roadFBreadth(m),City Planning,Maximus Building Coverage Ratio(%),Maximus Floor-area Ratio(%),Transaction period,Renovation,Transactional factors
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
1,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,3.0,130000000,1LDK,60,,,,,2021.0,RC,House,House,,,,Category II Residential Zone,60.0,400.0,2nd quarter 2023,Not yet,
2,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,3.0,150000000,2LDK,60,,,,,2021.0,RC,House,House,,,,Category II Residential Zone,60.0,400.0,2nd quarter 2023,Not yet,
3,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,3.0,13000000,1K,15,,,,,1985.0,SRC,House,House,,,,Commercial Zone,80.0,700.0,4th quarter 2022,,
4,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,2.0,44000000,1LDK,45,,,,,1982.0,SRC,House,House,,,,Commercial Zone,80.0,700.0,4th quarter 2022,Done,
5,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,0.0,70000000,3LDK,55,,,,,1984.0,SRC,,House,,,,Commercial Zone,80.0,600.0,4th quarter 2022,Not yet,
6,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,3.0,120000000,Open Floor,60,,,,,1985.0,SRC,Shop,Shop,,,,Commercial Zone,80.0,700.0,3rd quarter 2022,Not yet,
7,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,3.0,150000000,2LDK,60,,,,,2021.0,RC,House,House,,,,Category II Residential Zone,60.0,400.0,3rd quarter 2022,Not yet,
8,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,2.0,130000000,2LDK,60,,,,,2021.0,RC,,House,,,,Commercial Zone,80.0,500.0,3rd quarter 2022,Not yet,
9,Residential Land(Land and Building),Commercial Area,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,1.0,890000000,,420,,Semi-rectangular Shaped,16.5,"2,000 m^2 or greater.",2008.0,RC,Office,Office,Northeast,Ward Road,11.0,Commercial Zone,80.0,500.0,2nd quarter 2022,,
10,"Pre-owned Condominiums, etc.",,13101,Tokyo,Chiyoda Ward,Iidabashi,Iidabashi,3.0,11000000,1K,15,,,,,1985.0,SRC,House,Other,,,,Commercial Zone,80.0,700.0,2nd quarter 2022,Not yet,


In [18]:
# Pre-owned condominiums account for a large share of the transactions;
# we are currently leaving that data behind.
df.loc[:, "Type"].value_counts()

Type
Pre-owned Condominiums, etc.           250051
Residential Land(Land and Building)    184604
Residential Land(Land Only)             98244
Forest Land                               608
Agricultural Land                         137
Name: count, dtype: int64

In [21]:
# Row counts per "Purpose of Use" value (missing values are not counted).
df.loc[:, "Purpose of Use"].value_counts()

Purpose of Use
House        242985
Other         24474
Office         5276
Shop           3175
Warehouse       862
Factory         467
Name: count, dtype: int64

In [17]:
# Row counts per distinct "Use" string (missing values are not counted).
df.loc[:, "Use"].value_counts()

Use
House                                                356759
Housing Complex                                       16374
House, Shop                                            3406
Office                                                 3194
Shop                                                   2248
                                                      ...  
Factory, Parking Lot, Shop                                1
Housing Complex, Factory, Office, Workshop                1
Factory, Workshop, Other                                  1
Housing Complex, Workshop, Warehouse, Parking Lot         1
Warehouse, Shop, Other                                    1
Name: count, Length: 216, dtype: int64

In [19]:
# Once we have decided on an area code, we can drill down by "City Planning"
# to identify the type of zone we want to invest in.
# The same drill-down applies to "Use".
df.loc[:, "City Planning"].value_counts()

City Planning
Category I Exclusively Low-story Residential Zone       129904
Commercial Zone                                          95719
Quasi-industrial Zone                                    74733
Category I Exclusively Medium-high Residential Zone      74185
Category I Residential Zone                              58270
Neighborhood Commercial Zone                             51686
Category II Exclusively Medium-high Residential Zone     12743
Category II Residential Zone                             10857
Industrial Zone                                           7411
Quasi-residential Zone                                    7112
Category II Exclusively Low-story Residential Zone        3768
Non-divided City Planning Area                             830
Urbanization Control Area                                  820
Exclusively Industrial Zone                                346
Outside City Planning Area                                 209
Quasi-city Planning Area                 

In [23]:
# Plan: split the data by "Purpose of Use",
# then forward-fill each split and smooth it to get a consistent chart.
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 533644 entries, 1 to 533644
Data columns (total 28 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Type                                533644 non-null  object 
 1   Region                              282848 non-null  object 
 2   City,Town,Ward,Village code         533644 non-null  int64  
 3   Prefecture                          533644 non-null  object 
 4   City,Town,Ward,Village              533644 non-null  object 
 5   Area                                533398 non-null  object 
 6   Nearest stationFName               531523 non-null  object 
 7   Nearest stationFDistance(minute)   522855 non-null  object 
 8   Transaction-price(total)            533644 non-null  int64  
 9   Layout                              240590 non-null  object 
 10  Area(m^2)                           533644 non-null  object 
 11  Transaction-price(Unit price m^