In [124]:
import os
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import factor_data_paths, DATA_DIRECTORY_PATH

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [125]:
dataset = "lfs_revenue_breakdown"
area = "municipality"

df = pd.read_csv(factor_data_paths["unprocessed"][dataset][area])

In [126]:
import json
from pprint import pprint

area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
with open(area_code_to_area_path) as fd:
     area_code_to_area = json.load(fd)
     pprint(f"E.g. Maps 1100 to {area_code_to_area["1100"]}") 

def get_area_from_area_code(area_code):
     return area_code_to_area.get(str(area_code), "na" )

'E.g. Maps 1100 to Hokkaido Sapporo-shi'


In [127]:
df["area"] = df["area_code"].apply(get_area_from_area_code)

In [129]:
rename_dict = {
    "001:地方税": "total_tax",
    # "002:地方譲与税": "tax_allowance",
    # "003:地方揮発油譲与税": "gasoline_tax_allowance",
    # "007:自動車重量譲与税": "automobile_weight_tax_allowance",
    # "009:利子割交付金": "interest_discount_allowance",
    # "010:配当割交付金": "dividend_allowance",
    # "011:株式等譲渡所得割交付金": "capital_gains_allowance",
    # "014:地方消費税交付金": "local_consumption_tax_subsidy",
    # "015:ゴルフ場利用税交付金": "golf_course_usage_tax_subsidy",
    # "017:自動車取得税交付金": "automobile_acquisition_tax_subsidy",
    # "030:授業料": "tuition_fee",
    # "035:公営住宅使用料": "public_housing_usage_fee",
    # "040:国庫支出金": "national_treasury_expenditure",
    # "046:普通建設事業費支出金": "ordinary_construction_expenditure",
    # "21.財産収入": "property_income",
    # "22.寄附金": "donation",
    # "24.繰越金": "carryover_funds",
}

df = df[["year", "area_code", "area"] + list(rename_dict.keys())]
df = df.rename(columns = rename_dict)


In [130]:
df = df.fillna("***")
df = df.sort_values(["year", "area_code"])
for column in rename_dict.values():
    df[column] = np.where(df[column].apply(lambda x: x.isdigit()), df[column], np.NaN)
    df[column] = df[column].astype(float)
    df[f"{column}_growth"] = df.groupby("area_code")[column].pct_change()

    # df[f"{column}_is_available"] = df[column].notnull()

In [131]:
df[df["area_code"] == 13101][["area", "area_code"]]

Unnamed: 0,area,area_code
118063,Tokyo-to Chiyoda-ku,13101
114458,Tokyo-to Chiyoda-ku,13101
110853,Tokyo-to Chiyoda-ku,13101
107248,Tokyo-to Chiyoda-ku,13101
103643,Tokyo-to Chiyoda-ku,13101
100038,Tokyo-to Chiyoda-ku,13101
96433,Tokyo-to Chiyoda-ku,13101
61916,Tokyo-to Chiyoda-ku,13101
58311,Tokyo-to Chiyoda-ku,13101
54706,Tokyo-to Chiyoda-ku,13101


In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120580 entries, 116975 to 37071
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   year              120580 non-null  int64  
 1   area_code         120580 non-null  int64  
 2   area              120580 non-null  object 
 3   total_tax         88416 non-null   float64
 4   total_tax_growth  106433 non-null  float64
dtypes: float64(2), int64(2), object(1)
memory usage: 5.5+ MB


In [133]:
df["total_tax"].value_counts()

total_tax
0.0         5466
314493.0       3
474547.0       3
837918.0       3
127282.0       3
            ... 
161656.0       1
22770.0        1
504412.0       1
441182.0       1
221959.0       1
Name: count, Length: 82154, dtype: int64

In [134]:
df.to_csv(factor_data_paths["processed"][dataset][area], index = False)