In [2]:
import os
import warnings

import pandas as pd

from jre_utils.datapath import factor_data_paths
from jre_utils.config import asset_types, area_levels, period_cols, statistics
from jre_utils.constants import BLACKLISTED_AREAS

# Notebook-wide setup.
# NOTE: this silences *every* warning category for the rest of the session.
warnings.simplefilter("ignore")
# Show all columns when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [3]:
# --- Run parameters -------------------------------------------------------
period, area_level = "yearly", "municipality"
asset_type, statistic = "building", "median"

# Columns identifying one area at the chosen level, plus the period column.
granularity_columns = area_levels[area_level]["columns"]
period_column = period_cols[period]
group_by_columns = granularity_columns + [period_column]

# --- Input / output locations --------------------------------------------
unprocessed_paths = factor_data_paths["unprocessed"]
processed_paths = factor_data_paths["processed"]

# Raw inputs.
population_unprocessed_path = unprocessed_paths["population"]["municipality"]
migration_unprocessed_path = unprocessed_paths["migration"]["municipality"]

# Processed outputs: the plain names use the "municipality" entry, the
# `_all` names use the "submunicipality" entry.
population_processed_path = processed_paths["population"]["municipality"]
population_processed_path_all = processed_paths["population"]["submunicipality"]

migration_processed_path = processed_paths["migration"]["municipality"]
migration_processed_path_all = processed_paths["migration"]["submunicipality"]

In [4]:
# NOTE: the birth/death dataset is of limited use because its data apparently only goes up to 2020.

In [5]:
# Year whose observed population figure anchors the estimate; populations for
# all other years are derived from it using cumulative net migration.
BASE_YEAR = 2020


def _is_whole_count(values: pd.Series) -> pd.Series:
    """Return a boolean mask that is True where `values` holds a non-negative whole number.

    String (object) columns are tested with `str.isdigit`, so placeholder
    entries and missing values are rejected. Columns that pandas already
    parsed as numeric are tested numerically instead of failing on the
    missing `.isdigit` attribute.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values.notna() & (values >= 0) & (values % 1 == 0)
    # astype(str) turns NaN into "nan", which is correctly rejected by isdigit.
    return values.astype(str).str.isdigit()


def _apply_cumulative_change(
    frame: pd.DataFrame, year_ascending: bool, direction: int
) -> pd.DataFrame:
    """Shift `population` by the per-area running total of `del_population`.

    Rows are ordered by year (ascending or descending as requested) and then
    by area before the running total is taken. `direction` is +1 to add the
    running total to the population and -1 to subtract it.
    """
    frame = frame.sort_values(by=["year", "area"], ascending=[year_ascending, True])
    frame["cumulative_del_population"] = frame.groupby("area")[
        "del_population"
    ].cumsum()
    frame["population"] = (
        frame["population"] + direction * frame["cumulative_del_population"]
    )
    return frame


population_df = pd.read_csv(population_unprocessed_path)
migration_df = pd.read_csv(migration_unprocessed_path)

population_df = population_df.drop_duplicates(subset=["year", "area"])
migration_df = migration_df.drop_duplicates(subset=["year", "area"])

# Only the base-year population is joined; other years start out as NaN.
population_df = population_df[population_df["year"] == BASE_YEAR]
combined_df = migration_df.merge(population_df, on=["year", "area"], how="left")

combined_df = combined_df[~combined_df["area"].isin(BLACKLISTED_AREAS)]
combined_df = combined_df.sort_values(by=["area", "year"], ascending=[False, True])

# Propagate each area's base-year population to all of its other years.
combined_df["population"] = combined_df.groupby("area")["population"].ffill()
combined_df["population"] = combined_df.groupby("area")["population"].bfill()

# Keep only rows where both migration counts are non-negative whole numbers.
valid_counts = _is_whole_count(combined_df["in_migrations"]) & _is_whole_count(
    combined_df["out_migrations"]
)
# .copy() so the column assignments below act on an independent frame.
combined_df = combined_df[valid_counts].copy()

combined_df["in_migrations"] = combined_df["in_migrations"].astype(float)
combined_df["out_migrations"] = combined_df["out_migrations"].astype(float)

combined_df["net_migrations"] = (
    combined_df["in_migrations"] - combined_df["out_migrations"]
)
combined_df["del_population"] = combined_df[
    "net_migrations"
]  # add other factors to improve estimate

# Snapshot of the yearly change taken before the base year is zeroed below.
combined_df["net_population_chg"] = combined_df["del_population"]

# The base year keeps its observed population, so it contributes no change.
# NOTE(review): as a result the base year's own net migration is never used,
# and the backward projection subtracts each year's *own* net migration
# (BASE_YEAR - 1 uses its own value, not BASE_YEAR's), which is not the mirror
# image of the forward projection. Confirm this matches the intended timing
# of the population count.
combined_df.loc[combined_df["year"] == BASE_YEAR, "del_population"] = 0

# Base year and earlier: walk backwards in time, subtracting cumulative change.
combined_df_pre = _apply_cumulative_change(
    combined_df[combined_df["year"] <= BASE_YEAR], year_ascending=False, direction=-1
)

# After the base year: walk forwards in time, adding cumulative change.
combined_df_post = _apply_cumulative_change(
    combined_df[combined_df["year"] > BASE_YEAR], year_ascending=True, direction=1
)

combined_df = pd.concat(
    [combined_df_pre, combined_df_post], ignore_index=True
).sort_values(by=["year", "area"], ascending=[False, True])

all_area_population_df = combined_df[
    ["year", "area", "population"]
]  # save for future use

# Split "area" on spaces into at most three parts. n=2 caps the number of
# pieces and reindex guarantees exactly three columns, so the assignment works
# even when no area (or an area with extra tokens) is present.
area_part_names = ["Prefecture", "Municipality", "Submunicipality"]
area_parts = (
    combined_df["area"].str.split(" ", n=2, expand=True).reindex(columns=range(3))
)
area_parts.columns = area_part_names
combined_df[area_part_names] = area_parts

# Keep only the text before the first "-" in each name; .str[0] is NaN-safe.
combined_df["Prefecture"] = combined_df["Prefecture"].str.split("-").str[0]
combined_df["Municipality"] = combined_df["Municipality"].str.split("-").str[0]

# Keep only rows without a third area component (no submunicipality part).
combined_df = combined_df[combined_df["Submunicipality"].isna()]
combined_df = combined_df.drop(columns=["Submunicipality", "cumulative_del_population"])

In [6]:
# Net migration expressed as a fraction of the estimated population.
combined_df["net_migration_ratio"] = combined_df["net_migrations"].div(
    combined_df["population"]
)

# Both exported tables share the same identifying columns.
id_columns = ["year", "Prefecture", "Municipality"]
migration_df = combined_df[id_columns + ["net_migration_ratio"]]
population_df = combined_df[id_columns + ["population"]]

# Write the processed tables (without the index) to their configured paths.
exports = (
    (migration_df, migration_processed_path),
    (population_df, population_processed_path),
    (all_area_population_df, population_processed_path_all),
)
for frame, path in exports:
    frame.to_csv(path, index=False)

In [7]:
# TODO: these values differ from the ones computed previously; understand why.
# Display the processed net-migration-ratio table for inspection.
migration_df

Unnamed: 0,year,Prefecture,Municipality,net_migration_ratio
47379,2022,Aichi,Agui,0.001195
47380,2022,Aichi,Aisai,0.001431
47381,2022,Aichi,Ama,0.002314
47382,2022,Aichi,Anjo,-0.006517
47383,2022,Aichi,Chiryu,-0.000139
...,...,...,...,...
45471,1996,Yamanashi,Tabayama,-0.012870
45472,1996,Yamanashi,Tsuru,-0.003942
45473,1996,Yamanashi,Uenohara,0.007068
45474,1996,Yamanashi,Yamanakako,-0.010568
