In [1]:
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import factor_data_paths
from jre_utils.constants import BLACKLISTED_AREAS

# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")
# Show all columns (no truncation) whenever a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
# Look up the two top-level branches of the path registry once, then pick the
# individual CSV locations out of them.
_processed_paths = factor_data_paths["processed"]
_unprocessed_paths = factor_data_paths["unprocessed"]

# Processed sub-municipality tables.
all_area_population_estimate_path = _processed_paths["population"]["submunicipality"]
all_area_taxpayer_count_path = _processed_paths["taxpayer"]["submunicipality"]

# Municipality-level taxable income: unprocessed and processed locations.
taxable_income_unprocessed_path = _unprocessed_paths["taxable_income"]["municipality"]
taxable_income_processed_path = _processed_paths["taxable_income"]["municipality"]

In [3]:
# --- Load inputs -------------------------------------------------------------
# NOTE(review): population_df is not referenced by any cell visible here; the
# read is kept so the notebook namespace stays the same.
population_df = pd.read_csv(all_area_population_estimate_path)
taxpayer_df = pd.read_csv(all_area_taxpayer_count_path)

taxable_income_df = pd.read_csv(taxable_income_unprocessed_path)
taxable_income_df = taxable_income_df[~taxable_income_df["area"].isin(BLACKLISTED_AREAS)]
# Left join keeps every income row; taxpayer columns whose names clash with the
# income frame receive an "_x" suffix (dropped again at the end of this cell).
taxable_income_df = taxable_income_df.merge(
    taxpayer_df, on=["year", "area_code"], suffixes=("", "_x"), how="left"
)

# --- Clean the income column -------------------------------------------------
# Coerce to numbers instead of calling str.isdigit() on each cell: the per-cell
# call raises AttributeError on NaN or already-numeric values. Anything that
# cannot be parsed as a number becomes NaN and the row is dropped.
taxable_income_df["taxable_income"] = pd.to_numeric(
    taxable_income_df["taxable_income"], errors="coerce"
).astype(float)
taxable_income_df = taxable_income_df[taxable_income_df["taxable_income"].notna()].copy()

# A zero taxpayer count would produce +/-inf here; store NaN instead so later
# means and standard deviations are not poisoned by non-finite values.
non_finite = [np.inf, -np.inf]
taxable_income_df["taxable_income_per_taxpayer"] = (
    taxable_income_df["taxable_income"] / taxable_income_df["taxpayer_count"]
).replace(non_finite, np.nan)

# --- Growth rates ------------------------------------------------------------
# Ascending year order is required so pct_change compares each row with the
# previous available year of the same area; duplicates on (year, area_code)
# keep their first occurrence.
taxable_income_df = taxable_income_df.sort_values(by=["year", "area_code"]).drop_duplicates(
    subset=["year", "area_code"]
)

for column in ["taxable_income", "taxable_income_per_taxpayer"]:
    # fill_method=None: do not forward-fill missing values before differencing
    # (the implicit padding is deprecated in pandas and fabricates growth
    # figures across gaps). A zero base yields +/-inf, which is stored as NaN.
    taxable_income_df[f"{column}_growth"] = (
        taxable_income_df.groupby("area_code")[column]
        .pct_change(fill_method=None)
        .replace(non_finite, np.nan)
    )

# Present newest year first, areas in ascending code order within a year.
taxable_income_df = taxable_income_df.sort_values(by=["year", "area_code"], ascending=[False, True])

# "area_x" is the taxpayer frame's copy of the area column created by the merge.
taxable_income_df = taxable_income_df.drop(columns=["area_x"])

taxable_income_df

Unnamed: 0,year,area_code,area,taxable_income,taxpayer_count,taxpayer_count_growth,taxpayer_count_log,taxpayer_count_log_normalized_yearly,taxpayer_count_growth_normalized_yearly,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth
0,2021,1100,Hokkaido Sapporo-shi,2.971366e+09,906411.0,0.003444,5.957326,2.853349,0.336425,3278.166013,0.032400,0.028857
11,2021,1202,Hokkaido Hakodate-shi,3.194402e+08,107534.0,-0.015292,5.031550,1.505360,-0.705828,2970.597039,0.016612,0.032400
12,2021,1203,Hokkaido Otaru-shi,1.243333e+08,46190.0,-0.013582,4.664557,0.970996,-0.610690,2691.778827,0.028630,0.042793
13,2021,1204,Hokkaido Asahikawa-shi,4.193510e+08,142560.0,0.000681,5.154001,1.683656,0.182719,2941.575168,0.038224,0.037518
14,2021,1205,Hokkaido Muroran-shi,1.052086e+08,34835.0,-0.010875,4.542028,0.792585,-0.460104,3020.199455,0.009337,0.020434
...,...,...,...,...,...,...,...,...,...,...,...,...
70810,1985,47361,Okinawa-ken Kumejima-cho,3.992455e+06,1819.0,,3.260071,-1.095376,,2194.862562,,
70811,1985,47362,Okinawa-ken Yaese-cho,9.023653e+06,4997.0,,3.698796,-0.386018,,1805.814088,,
70812,1985,47375,Okinawa-ken Tarama-son,4.395900e+05,196.0,,2.294466,-2.656629,,2242.806122,,
70813,1985,47381,Okinawa-ken Taketomi-cho,1.525627e+06,663.0,,2.822168,-1.803406,,2301.096531,,


In [9]:
# Columns that get a log10(1 + x) transform followed by a per-year z-score.
log_normalize_columns = [
    "taxable_income",
    "taxable_income_per_taxpayer",
]

# Columns that are z-scored per year as they are.
normalize_columns = [
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
]


def zscore_ignoring_non_finite(values):
    """Return (values - mean) / std for one group, treating +/-inf as missing.

    A single infinite entry would otherwise make the group's mean and standard
    deviation non-finite and turn every result in the group into NaN. Entries
    that are NaN or infinite in the input are NaN in the output.
    """
    finite = values.replace([np.inf, -np.inf], np.nan)
    return (finite - finite.mean()) / finite.std()


year_key = taxable_income_df["year"]

for column in log_normalize_columns:
    log_column = f"{column}_log"
    # Vectorised equivalent of applying log10(1 + x) to each element.
    taxable_income_df[log_column] = np.log10(1 + taxable_income_df[column])
    taxable_income_df[f"{column}_log_normalized_yearly"] = (
        taxable_income_df[log_column].groupby(year_key).transform(zscore_ignoring_non_finite)
    )

for column in normalize_columns:
    taxable_income_df[f"{column}_normalized_yearly"] = (
        taxable_income_df[column].groupby(year_key).transform(zscore_ignoring_non_finite)
    )

# Persist the processed table without the DataFrame index.
taxable_income_df.to_csv(taxable_income_processed_path, index=False)

In [4]:
# Spot-check one municipality across years; 13102 is the area code that this
# cell's output lists as "Tokyo-to Chuo-ku".
is_spot_check_area = taxable_income_df["area_code"] == 13102
taxable_income_df.loc[is_spot_check_area]

Unnamed: 0,year,area_code,area,taxable_income,taxpayer_count,taxpayer_count_growth,taxpayer_count_log,taxpayer_count_log_normalized_yearly,taxpayer_count_growth_normalized_yearly,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth
658,2021,13102,Tokyo-to Chuo-ku,725351426.0,101810.0,0.01752,5.007795,1.470771,1.119426,7124.559729,0.059288,0.041049
2573,2020,13102,Tokyo-to Chuo-ku,684753903.0,100057.0,0.030082,5.000252,1.460485,1.701562,6843.638156,0.021457,-0.008373
4488,2019,13102,Tokyo-to Chuo-ku,670370026.0,97135.0,0.039266,4.98738,1.445581,2.001342,6901.426118,0.108315,0.06644
6403,2018,13102,Tokyo-to Chuo-ku,604855042.0,93465.0,0.054755,4.970654,1.429122,2.455725,6471.460354,0.075667,0.019827
8318,2017,13102,Tokyo-to Chuo-ku,562306834.0,88613.0,0.053637,4.947502,1.402201,1.953868,6345.647185,0.082041,0.026958
10233,2016,13102,Tokyo-to Chuo-ku,519672356.0,84102.0,0.041653,4.924811,1.377242,1.46313,6179.072507,0.081736,0.03848
12148,2015,13102,Tokyo-to Chuo-ku,480405927.0,80739.0,0.052097,4.907089,1.360499,2.721036,5950.109947,0.055433,0.003171
14063,2014,13102,Tokyo-to Chuo-ku,455174215.0,76741.0,0.034259,4.885033,1.330405,1.570505,5931.304192,0.103712,0.067153
15978,2013,13102,Tokyo-to Chuo-ku,412402889.0,74199.0,0.034089,4.870404,1.309807,0.724862,5558.065324,0.051194,0.016541
17893,2012,13102,Tokyo-to Chuo-ku,392318713.0,71753.0,0.032373,4.855846,1.294434,0.730156,5467.628016,0.039087,0.006504
