In [1]:
import warnings

import numpy as np
import pandas as pd

from jre_utils.datapath import factor_data_paths
from jre_utils.constants import BLACKLISTED_AREAS

# Session-wide settings: suppress every warning category and make pandas
# render all columns of a frame instead of eliding the middle ones.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [2]:
# Raw inputs: municipality-level population and migration tables.
_unprocessed_paths = factor_data_paths["unprocessed"]
population_unprocessed_path = _unprocessed_paths["population"]["municipality"]
migration_unprocessed_path = _unprocessed_paths["migration"]["municipality"]

# Processed outputs: the municipality-level file for each factor, plus the
# "submunicipality" entry that the *_all names point at.
_processed_paths = factor_data_paths["processed"]

population_processed_path = _processed_paths["population"]["municipality"]
population_processed_path_all = _processed_paths["population"]["submunicipality"]

migration_processed_path = _processed_paths["migration"]["municipality"]
migration_processed_path_all = _processed_paths["migration"]["submunicipality"]

In [3]:
# Read the raw tables in a fixed order: population first, then migration.
population_df, migration_df = (
    pd.read_csv(raw_path)
    for raw_path in (population_unprocessed_path, migration_unprocessed_path)
)

In [4]:
# Spot-check the raw migration rows recorded for a single area code (1101).
migration_df.loc[migration_df["area_code"] == 1101]

Unnamed: 0,year,area_code,area,in_migrations,out_migrations
1,2022,1101,Hokkaido Sapporo-shi Chuo-ku,21118.0,17618.0
1917,2021,1101,Hokkaido Sapporo-shi Chuo-ku,20319.0,17920.0
3833,2020,1101,Hokkaido Sapporo-shi Chuo-ku,20653.0,18188.0
5749,2019,1101,Hokkaido Sapporo-shi Chuo-ku,20954.0,17829.0
7665,2018,1101,Hokkaido Sapporo-shi Chuo-ku,19983.0,18176.0
9581,2017,1101,Hokkaido Sapporo-shi Chuo-ku,20542.0,18736.0
11497,2016,1101,Hokkaido Sapporo-shi Chuo-ku,20274.0,18490.0
13413,2015,1101,Hokkaido Sapporo-shi Chuo-ku,21434.0,18133.0
15329,2014,1101,Hokkaido Sapporo-shi Chuo-ku,20773.0,18089.0
17245,2013,1101,Hokkaido Sapporo-shi Chuo-ku,21575.0,18531.0


In [5]:
# Build one area-by-year table of migration flows plus an estimated population.
#
# The population table is reduced to its CENSUS_YEAR rows, so that year is the
# only observed head count. Every other year is estimated by accumulating net
# migration away from that year (backwards for earlier years, forwards for
# later ones).
CENSUS_YEAR = 2020


def _accumulate_population(frame, backward):
    """Return ``frame`` sorted by year with ``population`` rolled away from the base year.

    Args:
        frame: rows holding ``year``, ``area_code``, ``del_population`` and the
            base-year ``population`` repeated on every row of an area.
        backward: when True, rows are ordered newest-first and the per-area
            running total of ``del_population`` is subtracted from the base
            population; when False, rows are ordered oldest-first and the
            running total is added.

    Returns:
        A new DataFrame sorted by ``year`` then ``area_code`` with a
        ``cumulative_del_population`` column added and ``population`` replaced
        by the estimate.
    """
    ordered = frame.sort_values(
        by=["year", "area_code"], ascending=[not backward, True]
    )
    ordered["cumulative_del_population"] = ordered.groupby("area_code")[
        "del_population"
    ].cumsum()
    if backward:
        ordered["population"] = (
            ordered["population"] - ordered["cumulative_del_population"]
        )
    else:
        ordered["population"] = (
            ordered["population"] + ordered["cumulative_del_population"]
        )
    return ordered


# Re-read the raw tables so this cell can be re-run on its own: it rebinds both names.
population_df = pd.read_csv(population_unprocessed_path)
migration_df = pd.read_csv(migration_unprocessed_path)

# Keep a single row per (year, area_code) so the merge below cannot fan out.
population_df = population_df.drop_duplicates(subset=["year", "area_code"])
migration_df = migration_df.drop_duplicates(subset=["year", "area_code"])

# Only the census-year population is used; the left merge therefore leaves
# ``population`` empty on every other year until it is filled per area below.
population_df = population_df[population_df["year"] == CENSUS_YEAR]
combined_df = migration_df.merge(
    population_df, on=["year", "area_code"], suffixes=["", "_y"], how="left"
)

combined_df = combined_df[~combined_df["area"].isin(BLACKLISTED_AREAS)]
combined_df = combined_df.sort_values(
    by=["area_code", "year"], ascending=[False, True]
)

# Spread the census-year population to every year of the same area.
combined_df["population"] = combined_df.groupby("area_code")["population"].ffill()
combined_df["population"] = combined_df.groupby("area_code")["population"].bfill()

combined_df["net_migrations"] = (
    combined_df["in_migrations"] - combined_df["out_migrations"]
)
combined_df["del_population"] = combined_df[
    "net_migrations"
]  # add other factors to improve estimate

# Keep the unmodified yearly change before the census-year row is zeroed out.
combined_df["net_population_chg"] = combined_df["del_population"]

# The census year is the anchor: its running total must be zero so that its
# population stays equal to the observed value.
combined_df.loc[combined_df["year"] == CENSUS_YEAR, "del_population"] = 0

# NOTE(review): the backward pass subtracts each year's *own* net migration
# (census-year population minus the previous year's flow gives the previous
# year), while the forward pass adds each year's own flow. The census year's
# net migration is therefore never applied in either direction. Confirm this
# convention is intended before relying on the pre-census estimates.
combined_df_pre = _accumulate_population(
    combined_df[combined_df["year"] <= CENSUS_YEAR], backward=True
)
combined_df_post = _accumulate_population(
    combined_df[combined_df["year"] > CENSUS_YEAR], backward=False
)

combined_df = pd.concat(
    [combined_df_pre, combined_df_post], ignore_index=True
).sort_values(by=["year", "area_code"], ascending=[False, True])

all_area_population_df = combined_df[
    ["year", "area_code", "area", "population"]
]  # save for future use

combined_df = combined_df.drop(columns=["cumulative_del_population"])

In [6]:
# Net migration relative to the estimated population of the same area and year.
combined_df["net_migration_ratio"] = (
    combined_df["net_migrations"] / combined_df["population"]
)

# Take explicit copies of the column subsets. Later cells add columns to both
# frames, and writing into a subset of ``combined_df`` is chained assignment
# (SettingWithCopyWarning), which the session-wide warning filter hides.
migration_df = combined_df[
    ["year", "area_code", "area", "in_migrations", "out_migrations", "net_migration_ratio"]
].copy()
population_df = combined_df[["year", "area_code", "area", "population"]].copy()

# Export of the per-area population table is currently disabled.
# all_area_population_df.to_csv(population_processed_path_all, index=False)

In [23]:
# Columns that receive a log10(1 + x) transform followed by a per-year z-score.
# NOTE(review): the saved output of the later ``migration_df`` display has no
# ``net_migration_ratio_log*`` columns, so that output predates this list;
# re-run the notebook to refresh it.
log_normalize_columns = [
    "in_migrations",
    "out_migrations",
    "net_migration_ratio",
]

# Columns that receive a per-year z-score on their raw values.
normalize_columns = [
    "net_migration_ratio",
]


def _zscore(values):
    """Centre ``values`` on their mean and scale by their standard deviation."""
    return (values - values.mean()) / values.std()


# Work on an independent frame so the new columns are never written into a
# slice of another DataFrame.
migration_df = migration_df.copy()

for column in log_normalize_columns:
    # Vectorised over the whole column instead of a Python-level apply.
    migration_df[f"{column}_log"] = np.log10(1 + migration_df[column])
    migration_df[f"{column}_log_normalized_yearly"] = migration_df.groupby("year")[
        f"{column}_log"
    ].transform(_zscore)

for column in normalize_columns:
    migration_df[f"{column}_normalized_yearly"] = migration_df.groupby("year")[
        column
    ].transform(_zscore)


migration_df.to_csv(migration_processed_path, index=False)


In [24]:
# Columns that receive a log10(1 + x) transform followed by a per-year z-score.
log_normalize_columns = [
    "population",
]

# No population column is z-scored on its raw values; the name is still
# rebound so it does not carry over the list from the migration cell.
normalize_columns = []

# Work on an independent frame so the new columns are never written into a
# slice of another DataFrame.
population_df = population_df.copy()

for column in log_normalize_columns:
    # Vectorised over the whole column instead of a Python-level apply.
    population_df[f"{column}_log"] = np.log10(1 + population_df[column])
    population_df[f"{column}_log_normalized_yearly"] = population_df.groupby(
        "year"
    )[f"{column}_log"].transform(lambda x: (x - x.mean()) / x.std())


population_df.to_csv(population_processed_path, index=False)


In [25]:
# Display the processed migration frame, including the derived columns.
migration_df

Unnamed: 0,year,area_code,area,in_migrations,out_migrations,net_migration_ratio,in_migrations_log,in_migrations_log_normalized_yearly,out_migrations_log,out_migrations_log_normalized_yearly,net_migration_ratio_normalized_yearly
49789,2022,1100,Hokkaido Sapporo-shi,114194.0,105217.0,0.004506,5.057647,2.927897,5.022090,2.981379,0.720856
49790,2022,1101,Hokkaido Sapporo-shi Chuo-ku,21118.0,17618.0,0.013748,4.324673,1.932639,4.245981,1.864115,1.590241
49791,2022,1102,Hokkaido Sapporo-shi Kita-ku,14532.0,13683.0,0.002911,4.162355,1.712238,4.136213,1.706096,0.570816
49792,2022,1103,Hokkaido Sapporo-shi Higashi-ku,14109.0,13484.0,0.002341,4.149527,1.694820,4.129851,1.696937,0.517158
49793,2022,1104,Hokkaido Sapporo-shi Shiroishi-ku,13322.0,12717.0,0.002847,4.124602,1.660976,4.104419,1.660326,0.564773
...,...,...,...,...,...,...,...,...,...,...,...
47869,1996,47361,Okinawa-ken Kumejima-cho,549.0,619.0,-0.007292,2.740363,-0.531757,2.792392,-0.456586,-0.429480
47870,1996,47362,Okinawa-ken Yaese-cho,1539.0,1212.0,0.012110,3.187521,0.126119,3.083861,-0.025347,1.029213
47871,1996,47375,Okinawa-ken Tarama-son,52.0,98.0,-0.031144,1.724276,-2.026663,1.995635,-1.635414,-2.222788
47872,1996,47381,Okinawa-ken Taketomi-cho,430.0,441.0,-0.002947,2.634477,-0.687540,2.645422,-0.674032,-0.102871


In [32]:
# Spot-check the combined rows, with the estimated population, for area code 1100.
combined_df.loc[combined_df["area_code"] == 1100]

Unnamed: 0,year,area_code,area,in_migrations,out_migrations,area_y,population,net_migrations,del_population,net_population_chg,net_migration_ratio
49789,2022,1100,Hokkaido Sapporo-shi,114194.0,105217.0,,1992048.0,8977.0,8977.0,8977.0,0.004506
47874,2021,1100,Hokkaido Sapporo-shi,115389.0,105713.0,,1983071.0,9676.0,9676.0,9676.0,0.004879
0,2020,1100,Hokkaido Sapporo-shi,117323.0,106924.0,Hokkaido Sapporo-shi,1973395.0,10399.0,0.0,10399.0,0.00527
1915,2019,1100,Hokkaido Sapporo-shi,117964.0,108578.0,,1964009.0,9386.0,9386.0,9386.0,0.004779
3830,2018,1100,Hokkaido Sapporo-shi,117434.0,109504.0,,1956079.0,7930.0,7930.0,7930.0,0.004054
5745,2017,1100,Hokkaido Sapporo-shi,119314.0,110535.0,,1947300.0,8779.0,8779.0,8779.0,0.004508
7660,2016,1100,Hokkaido Sapporo-shi,118336.0,109199.0,,1938163.0,9137.0,9137.0,9137.0,0.004714
9575,2015,1100,Hokkaido Sapporo-shi,120932.0,112759.0,,1929990.0,8173.0,8173.0,8173.0,0.004235
11490,2014,1100,Hokkaido Sapporo-shi,119304.0,110941.0,,1921627.0,8363.0,8363.0,8363.0,0.004352
13405,2013,1100,Hokkaido Sapporo-shi,123103.0,113015.0,,1911539.0,10088.0,10088.0,10088.0,0.005277
