In [4]:
import warnings

import numpy as np
import pandas as pd

from jre_utils.datapath import factor_data_paths
from jre_utils.constants import BLACKLISTED_AREAS

# Notebook-wide settings: silence every warning category and let pandas render
# all columns of wide frames instead of eliding the middle ones.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [5]:
# Resolve the taxpayer entries of the project path registry once per
# processing stage, then pick the area granularity needed below.
unprocessed_taxpayer_paths = factor_data_paths["unprocessed"]["taxpayer"]
processed_taxpayer_paths = factor_data_paths["processed"]["taxpayer"]

taxpayer_unprocessed_path = unprocessed_taxpayer_paths["municipality"]
taxpayer_processed_path = processed_taxpayer_paths["municipality"]
all_area_taxpayer_processed_path = processed_taxpayer_paths["submunicipality"]

In [6]:
# Load the raw taxpayer table.
taxpayer_df = pd.read_csv(taxpayer_unprocessed_path)

# Parse the counts numerically rather than calling ``str.isdigit`` on each
# cell: that raised AttributeError for any non-string value (an empty cell read
# as NaN, or a column pandas had already parsed as numeric). Unparseable
# placeholders become NaN and are dropped along with negative or infinite values.
taxpayer_count = pd.to_numeric(taxpayer_df["taxpayer_count"], errors="coerce")
has_valid_count = taxpayer_count.ge(0) & np.isfinite(taxpayer_count)
taxpayer_df = taxpayer_df.assign(taxpayer_count=taxpayer_count.astype(float)).loc[
    has_valid_count
]

# De-duplicate only after invalid rows are gone, so that an invalid first
# duplicate cannot shadow a valid record for the same (year, area_code).
taxpayer_df = taxpayer_df.drop_duplicates(subset=["year", "area_code"])
taxpayer_df = taxpayer_df[~taxpayer_df["area"].isin(BLACKLISTED_AREAS)]

# Sort by year so that, within each area, pct_change compares every row with
# the previous available year for that area (the first year of an area is NaN).
taxpayer_df = taxpayer_df.sort_values(by=["year", "area_code"])
taxpayer_df["taxpayer_count_growth"] = (
    taxpayer_df.groupby("area_code")["taxpayer_count"]
    .pct_change()
    # Growth from a zero count is +/-inf; a single inf would make the yearly
    # mean/std used for normalization non-finite, so treat it as missing.
    .replace([np.inf, -np.inf], np.nan)
)


In [7]:
# Columns that are transformed with log10(1 + x) before yearly standardization.
log_normalize_columns = [
    "taxpayer_count",
]

# Columns that are standardized within each year as-is.
normalize_columns = [
    "taxpayer_count_growth",
]


def _zscore(values):
    """Return ``values`` centered on its mean and scaled by its standard deviation.

    Uses the pandas defaults for ``mean``/``std`` (NaNs skipped, sample standard
    deviation), exactly like the inline expressions this helper replaces.
    """
    return (values - values.mean()) / values.std()


for column in log_normalize_columns:
    # Vectorized log transform; the +1 keeps a zero count finite (log10(1) == 0).
    taxpayer_df[f"{column}_log"] = np.log10(1 + taxpayer_df[column])
    taxpayer_df[f"{column}_log_normalized_yearly"] = taxpayer_df.groupby("year")[
        f"{column}_log"
    ].transform(_zscore)

for column in normalize_columns:
    taxpayer_df[f"{column}_normalized_yearly"] = taxpayer_df.groupby("year")[
        column
    ].transform(_zscore)

# Persist the frame with the derived columns; the row index is not written.
taxpayer_df.to_csv(all_area_taxpayer_processed_path, index=False)


In [8]:
# Write the processed frame to the municipality-level output path; the row
# index is not written.
# NOTE(review): this is the same, unmodified `taxpayer_df` that the previous
# cell wrote to `all_area_taxpayer_processed_path`, so both files receive
# identical content — confirm that this is intended.
taxpayer_df.to_csv(taxpayer_processed_path, index=False)
# Bare trailing expression: the notebook renders the frame as the cell output.
taxpayer_df

Unnamed: 0,year,area_code,area,taxpayer_count,taxpayer_count_growth,taxpayer_count_log,taxpayer_count_log_normalized_yearly,taxpayer_count_growth_normalized_yearly
68941,1985,1100,Hokkaido Sapporo-shi,554022.0,,5.743528,2.920035,
68949,1985,1202,Hokkaido Hakodate-shi,113815.0,,5.056203,1.808725,
68950,1985,1203,Hokkaido Otaru-shi,60022.0,,4.778318,1.359422,
68951,1985,1204,Hokkaido Asahikawa-shi,127284.0,,5.104777,1.887262,
68952,1985,1205,Hokkaido Muroran-shi,45941.0,,4.662210,1.171691,
...,...,...,...,...,...,...,...,...
1911,2021,47361,Okinawa-ken Kumejima-cho,2715.0,0.008919,3.433930,-0.820877,0.640958
1912,2021,47362,Okinawa-ken Yaese-cho,12278.0,0.021719,4.089163,0.133185,1.353015
1913,2021,47375,Okinawa-ken Tarama-son,313.0,-0.012618,2.496930,-2.185209,-0.557071
1914,2021,47381,Okinawa-ken Taketomi-cho,1561.0,0.003859,3.193681,-1.170694,0.359481
