In [24]:
import os
import warnings

import pandas as pd

from jre_utils.datapath import DATA_DIRECTORY_PATH

# Notebook-wide settings.
# NOTE(review): this silences every warning category for the whole session,
# including pandas' own diagnostics -- consider narrowing the filter.
warnings.simplefilter("ignore")
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [92]:
# Directory that holds the revenue-breakdown CSV files read by this notebook.
revenue_breakdown_directory_path = f"{DATA_DIRECTORY_PATH}/exploratory/LFS_revenue_breakdown"

# CSV files to load. The order is kept as-is because later cells iterate over
# the files in this order when building the combined table.
datasets = [
    "2020.csv", "2021.csv", "2011_2018.csv", "2022.csv",
    "1995_2003.csv", "2019.csv", "2003_2011.csv", "1989_1995.csv",
]

In [93]:
# Echo the list to confirm which files will be loaded, and in what order.
datasets

['2020.csv',
 '2021.csv',
 '2011_2018.csv',
 '2022.csv',
 '1995_2003.csv',
 '2019.csv',
 '2003_2011.csv',
 '1989_1995.csv']

In [94]:
# Read every CSV into a dict keyed by the file name up to its first "."
# (e.g. "2011_2018.csv" -> "2011_2018").
dataframes = {}
for dataset in datasets:
    key = dataset.partition(".")[0]
    csv_path = f"{revenue_breakdown_directory_path}/{dataset}"
    dataframes[key] = pd.read_csv(csv_path, encoding="utf-8")

In [95]:
# Normalise the key columns ("area_code", "year") of the loaded frames.
#
# NOTE: this cell replaces the entries of `dataframes` with their cleaned
# versions, so running it a second time applies the `// 10` step again.
# Reload the CSVs before re-running it.

# Step 1 -- only the single-year files get their area codes rewritten.
for key in ["2019", "2020", "2021", "2022"]:
    df = dataframes[key].drop_duplicates(subset=["area_code", "year"])
    # Keep rows whose area code consists solely of digits.
    has_numeric_code = df["area_code"].astype(str).str.isdigit()
    # Explicit copy: the column assignment below must not write to a slice
    # of the frame that is still stored in `dataframes`.
    df = df.loc[has_numeric_code].copy()
    # Integer division by 10 removes the last digit of the code.
    # NOTE(review): presumably a trailing check digit that the other files
    # do not carry -- TODO confirm against the data source.
    df["area_code"] = df["area_code"].astype(int) // 10
    dataframes[key] = df

# Step 2 -- every file: drop rows whose "year" is not purely numeric and
# store the column as int. Rebinding existing keys while iterating is safe
# because the dict's size does not change.
for key, df in dataframes.items():
    has_numeric_year = df["year"].astype(str).str.isdigit()
    df = df.loc[has_numeric_year].copy()
    df["year"] = df["year"].astype(int)
    dataframes[key] = df

In [96]:
# Stack all cleaned frames into one table. Rows follow the insertion order of
# `dataframes`, and each file's original row index is retained.
# NOTE(review): file names such as "2003_2011" and "2011_2018" share a boundary
# year -- confirm the same (year, area_code) row is not present in both files.
combined_df = pd.concat(list(dataframes.values()))


In [97]:
# Preview only: newest year first, then ascending area code. The sorted frame
# is displayed but not assigned, so `combined_df` keeps its original order.
combined_df.sort_values(
    by=["year", "area_code"],
    ascending=[False, True],
    ignore_index=True,
)

Unnamed: 0,year,area_code,県名,団体名,001:地方税,002:地方譲与税,003:地方揮発油譲与税,007:自動車重量譲与税,009:利子割交付金,010:配当割交付金,011:株式等譲渡所得割交付金,014:地方消費税交付金,015:ゴルフ場利用税交付金,017:自動車取得税交付金,020:法人事業税交付金,030:授業料,035:公営住宅使用料,040:国庫支出金,046:普通建設事業費支出金,団体名(市町村分),21.財産収入,22.寄附金,24.繰越金
0,2022,1100,,札幌市,347606361,5418084,1887987,3208558,100249,730142,588161,52918097,94048,0,4518888,683544,6446755,348870199,16117125,,,,
1,2022,1202,,函館市,32031925,789538,171434,513126,11322,82681,66714,6838038,8288,0,483577,88973,979357,40896004,902354,,,,
2,2022,1203,,小樽市,13845729,324636,72707,217625,4238,30930,24944,3051797,36366,0,223446,0,516328,16788216,673229,,,,
3,2022,1204,,旭川市,40481178,1455912,277956,831962,14517,106401,86056,8856901,14415,0,605729,0,827473,52393788,1428241,,,,
4,2022,1205,,室蘭市,13616569,365034,61457,183955,3907,28539,23028,2330285,6024,0,192531,36523,560699,11092510,705726,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120575,1989,47373,,,118051,36610,***,18572,2032,***,***,***,0,13208,,1978,18650,395882,319762,上野村,18985,0,92722
120576,1989,47374,,,249859,133845,***,26687,3682,***,***,***,0,18963,,4544,26008,456764,265440,伊良部町,15784,7200,41751
120577,1989,47375,,,47701,13566,***,6828,885,***,***,***,0,4847,,2391,2105,142618,100436,多良間村,52,0,41998
120578,1989,47381,,,210682,36095,***,16636,3928,***,***,***,0,11854,,1939,11048,635498,539590,竹富町,29826,2879,75044


In [98]:
# Write the stacked table to disk without the index column.
combined_csv_path = f"{DATA_DIRECTORY_PATH}/exploratory/LFS_revenue_breakdown/combined.csv"
combined_df.to_csv(combined_csv_path, index=False)

In [99]:
# Load the taxable-income CSV and preview it.
taxable_income_with_area_code_path = f"{DATA_DIRECTORY_PATH}/exploratory/taxable_income_with_area_code.csv"
taxable_income_with_area_code_df = pd.read_csv(
    filepath_or_buffer=taxable_income_with_area_code_path,
)
taxable_income_with_area_code_df

Unnamed: 0,year,area_code,area,taxable_income
0,2021,1100,Hokkaido Sapporo-shi,2971365734
1,2021,1101,Hokkaido Sapporo-shi Chuo-ku,-
2,2021,1102,Hokkaido Sapporo-shi Kita-ku,-
3,2021,1103,Hokkaido Sapporo-shi Higashi-ku,-
4,2021,1104,Hokkaido Sapporo-shi Shiroishi-ku,-
...,...,...,...,...
70847,1985,47361,Okinawa-ken Kumejima-cho,3992455
70848,1985,47362,Okinawa-ken Yaese-cho,9023653
70849,1985,47375,Okinawa-ken Tarama-son,439590
70850,1985,47381,Okinawa-ken Taketomi-cho,1525627


In [100]:
# Inner join: only (year, area_code) pairs present in both tables survive.
merged_df = pd.merge(
    taxable_income_with_area_code_df,
    combined_df,
    how="inner",
    on=["year", "area_code"],
)
merged_df

Unnamed: 0,year,area_code,area,taxable_income,県名,団体名,001:地方税,002:地方譲与税,003:地方揮発油譲与税,007:自動車重量譲与税,009:利子割交付金,010:配当割交付金,011:株式等譲渡所得割交付金,014:地方消費税交付金,015:ゴルフ場利用税交付金,017:自動車取得税交付金,020:法人事業税交付金,030:授業料,035:公営住宅使用料,040:国庫支出金,046:普通建設事業費支出金,団体名(市町村分),21.財産収入,22.寄附金,24.繰越金
0,2021,1100,Hokkaido Sapporo-shi,2971365734,北海道,札幌市,334595895,5463147,1987812,3218437,161174,821798,999198,49942134,94871,0,4716562,705973,6632731,420192265,20780650,,,,
1,2021,1202,Hokkaido Hakodate-shi,319440182,北海道,函館市,31267532,789329,179704,513802,18082,92378,112427,6710399,9233,0,403348,93976,999094,40769199,583487,,,,
2,2021,1203,Hokkaido Otaru-shi,124333264,北海道,小樽市,13937289,329168,76401,218444,6788,34642,42137,3052795,34371,0,195884,0,530833,19943165,592713,,,,
3,2021,1204,Hokkaido Asahikawa-shi,419350956,北海道,旭川市,39773268,1445529,291545,833585,23002,117678,143326,8559487,14566,0,506091,0,853719,59545259,680216,,,,
4,2021,1205,Hokkaido Muroran-shi,105208648,北海道,室蘭市,13114293,373608,64570,184617,6242,31889,38811,2304185,6121,0,175870,34951,570394,12850290,1513192,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57448,1989,47361,Okinawa-ken Kumejima-cho,4199487,,,***,***,***,***,***,***,***,***,***,***,,***,***,***,***,久米島町,***,***,***
57449,1989,47362,Okinawa-ken Yaese-cho,11031447,,,***,***,***,***,***,***,***,***,***,***,,***,***,***,***,八重瀬町,***,***,***
57450,1989,47375,Okinawa-ken Tarama-son,501040,,,47701,13566,***,6828,885,***,***,***,0,4847,,2391,2105,142618,100436,多良間村,52,0,41998
57451,1989,47381,Okinawa-ken Taketomi-cho,1810137,,,210682,36095,***,16636,3928,***,***,***,0,11854,,1939,11048,635498,539590,竹富町,29826,2879,75044


In [1]:
# Scatter of local tax revenue ("001:地方税") against taxable income on a sample
# of at most 100 rows.
# Both columns can still hold placeholder text ("-", "***") at this point, so
# they are coerced to numbers here and unparseable rows are dropped; the cell
# therefore runs straight after the merge without relying on a later cell.
plot_columns = ["taxable_income", "001:地方税"]
plot_df = merged_df[plot_columns].apply(pd.to_numeric, errors="coerce").dropna()
# Cap n at the number of usable rows and fix the seed so the figure is
# reproducible across runs.
plot_df.sample(n=min(100, len(plot_df)), random_state=0).plot(
    x="taxable_income", y="001:地方税", kind="scatter"
)

NameError: name 'merged_df' is not defined

In [14]:
# Convert the two analysis columns to float, dropping rows whose value is not
# a plain run of digits (e.g. the "-" and "***" placeholders).
for column in ["taxable_income", "001:地方税"]:
    values = merged_df[column]
    # Only filter while the column is still text: once it has been converted,
    # re-running this cell must not call string methods on floats.
    if not pd.api.types.is_numeric_dtype(values):
        # astype(str) first, so missing values and stray non-string entries
        # are filtered out instead of raising AttributeError on `.isdigit()`.
        merged_df = merged_df[values.astype(str).str.isdigit()]
    # assign() returns a new frame, so nothing is written onto a filtered slice.
    merged_df = merged_df.assign(**{column: merged_df[column].astype(float)})

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115308 entries, 0 to 125351
Data columns (total 25 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   year             115308 non-null  int64  
 1   area_code        115308 non-null  int64  
 2   area             115308 non-null  object 
 3   taxable_income   115308 non-null  float64
 4   県名               20892 non-null   object 
 5   団体名              20892 non-null   object 
 6   001:地方税          115308 non-null  float64
 7   002:地方譲与税        115308 non-null  object 
 8   003:地方揮発油譲与税     115308 non-null  object 
 9   007:自動車重量譲与税     115308 non-null  object 
 10  009:利子割交付金       115308 non-null  object 
 11  010:配当割交付金       115308 non-null  object 
 12  011:株式等譲渡所得割交付金  115308 non-null  object 
 13  014:地方消費税交付金     115308 non-null  object 
 14  015:ゴルフ場利用税交付金   115308 non-null  object 
 15  017:自動車取得税交付金    115308 non-null  object 
 16  020:法人事業税交付金     13928 non-null   object 
 