# 実行環境の作成

## Google Cloud Storage

下記コードでGCPに接続

In [1]:
from google.colab import auth
auth.authenticate_user()

認証に成功したらgcsfuseをインストール

In [2]:
!echo "deb http://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!apt-get -y -q update
!apt-get -y -q install gcsfuse

deb http://packages.cloud.google.com/apt gcsfuse-bionic main
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2537  100  2537    0     0  68567      0 --:--:-- --:--:-- --:--:-- 68567
OK
Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:5 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:8 http://packages.cloud.google.com/apt gcsfuse-bionic InRelease [5,391 B]
Hit:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubunt

バケット「statistics-hyogo」をディレクトリ「statistics-hyogo」にマウント

In [3]:
! mkdir -p statistics-hyogo
! gcsfuse --implicit-dirs --limit-bytes-per-sec -1 --limit-ops-per-sec -1 statistics-hyogo statistics-hyogo

2022/09/16 07:11:54.267718 Start gcsfuse/0.41.6 (Go version go1.18.4) for app "" using mount point: /content/statistics-hyogo
2022/09/16 07:11:54.285039 Opening GCS connection...
2022/09/16 07:11:55.783439 Mounting file system "statistics-hyogo"...
2022/09/16 07:11:55.784206 File system has been successfully mounted.


## ESTAT_APP_ID

In [4]:
ESTAT_APPID = '724e5b90772a3e9289f41a253e4e7e32438f4fff'

## pythonのライブラリ追加

In [5]:
import urllib.parse
import urllib.request
import json
import pandas as pd

# 統計カード管理情報を取得・加工

## BigQueryから統計カード管理情報を読み込む

pandas-gbqのインストール

In [None]:
!pip install pandas-gbq

BigQueryをDataFrameで取得

In [35]:
def get_cards_management_dataframe():
  # プロジェクトの定義
  project_id='primal-buttress-342908'
  dataset_id = 'contents'
  table_id='cards'

  # クエリ
  query = f"""
  SELECT *
  FROM {dataset_id}.{table_id}
  """

  # dialect='standard' で標準SQLを使用
  df = pd.read_gbq(query, project_id, dialect='standard')

  return df

SyntaxError: ignored

In [33]:
df_cardmng = get_cards_management_dataframe()
df_cardmng

Unnamed: 0,index,statsDataId,statsDataName,categoryCode,categoryName,governmentType,cardId,cardTitle,chartComponent,isSelect,type,yAxis
0,3,10101,社会・人口統計体系,A1102,日本人人口,prefecture,,,,,,
1,4,10101,社会・人口統計体系,A110201,日本人人口（男）,prefecture,,,,,,
2,5,10101,社会・人口統計体系,A110202,日本人人口（女）,prefecture,,,,,,
3,6,10101,社会・人口統計体系,A1201,0～4歳人口,prefecture,,,,,,
4,7,10101,社会・人口統計体系,A120101,0～4歳人口（男）,prefecture,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
563,550,10101,社会・人口統計体系,A840201,母子世帯数（15～24歳）,prefecture,single-mother-household-prefecture,都道府県の母子世帯数,TimeChart,,column,0
564,551,10101,社会・人口統計体系,A840202,母子世帯数（25～34歳）,prefecture,single-mother-household-prefecture,都道府県の母子世帯数,TimeChart,,column,0
565,552,10101,社会・人口統計体系,A840203,母子世帯数（35～44歳）,prefecture,single-mother-household-prefecture,都道府県の母子世帯数,TimeChart,,column,0
566,553,10101,社会・人口統計体系,A840204,母子世帯数（45～54歳）,prefecture,single-mother-household-prefecture,都道府県の母子世帯数,TimeChart,,column,0


## 統計カードリストの作成

統計カード（cardId,cardTitle)のリストを作成する

In [12]:
def get_cardliist(df_cardmng):
  #  'cardId'の指定があるデータを抽出
  df = df_cardmng.dropna(subset=['cardId'])
  
  # GroupBy
  df = df.groupby(['cardId','cardTitle','governmentType','statsDataId']).agg(
    {'categoryCode': lambda x: ','.join(x)
    })
  
  return df

In [13]:
df_cardlist = get_cardliist(df_cardmng)
df_cardlist

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,categoryCode
cardId,cardTitle,governmentType,statsDataId,Unnamed: 4_level_1
divorce-prefecture,都道府県の離婚件数,prefecture,10101,A9201
first-marriage-age-prefecture,都道府県の平均初婚年齢,prefecture,10101,"A9111,A9112"
household-prefecture,都道府県の一般世帯数,prefecture,10101,A710101
separated-prefecture,都道府県の離別人口,prefecture,10101,"A1604011,A1604012,A1604021,A1604022,A1604031,A..."
single-mother-household-prefecture,都道府県の母子世帯数,prefecture,10101,"A8401,A840101,A840102,A840103,A840201,A840202,..."
spouse-prefecture,都道府県の有配偶人口,prefecture,10101,"A1602021,A1602022,A1602031,A1602032,A1602041,A..."
standardized-mortality-prefecture,都道府県の標準化死亡率,prefecture,10101,A4301
total-population-prefecture,都道府県の総人口,prefecture,10101,"A1101,A110101,A110102"
unmarried-prefecture,都道府県の未婚人口,prefecture,10101,"A1601011,A1601012,A1601021,A1601022,A1601031,A..."


# 統計カードのデータセット

## 都道府県リスト

都道府県コード一覧

In [20]:
def get_pref_codes(resType='num'):
  c = 'statistics-hyogo/preflist.json'
  with open(c) as j:
    prefList = json.load(j)
    pref_codes = [d.get('prefCode') for d in prefList['result']]
  
  if resType == 'str':
    return [f'{n:02}'+'000' for n in pref_codes]
  else:
    return pref_codes

In [22]:
print(get_pref_codes())
print(get_pref_codes('str'))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]
['01000', '02000', '03000', '04000', '05000', '06000', '07000', '08000', '09000', '10000', '11000', '12000', '13000', '14000', '15000', '16000', '17000', '18000', '19000', '20000', '21000', '22000', '23000', '24000', '25000', '26000', '27000', '28000', '29000', '30000', '31000', '32000', '33000', '34000', '35000', '36000', '37000', '38000', '39000', '40000', '41000', '42000', '43000', '44000', '45000', '46000', '47000']


## estat-APIのパラメータ生成

'statsDataId'はdataframeからcsvコピーした際に数値に変換されてしまう。

In [40]:
df_cardmng = get_cards_management_dataframe()
df_cardlist = get_cardliist(df_cardmng)

for index, row in df_cardlist.iterrows():
  dict = {}
  
  # statsDataId  csvコピーした際に数値変換されている
  statsDataId = index[3] if type(index[3]) == 'int' else str(index[3]).zfill(10)
  dict['statsDataId'] = statsDataId
  
  # cdCat01
  dict['cdCat01'] = row['categoryCode']

  # cdArea
  dict['cdCat01'] = row['categoryCode']

  print(index)
  print(row)
  print(dict)

('divorce-prefecture', '都道府県の離婚件数', 'prefecture', 10101)
categoryCode    A9201
Name: (divorce-prefecture, 都道府県の離婚件数, prefecture, 10101), dtype: object
{'statsDataId': '0000010101', 'cdCat01': 'A9201'}
('first-marriage-age-prefecture', '都道府県の平均初婚年齢', 'prefecture', 10101)
categoryCode    A9111,A9112
Name: (first-marriage-age-prefecture, 都道府県の平均初婚年齢, prefecture, 10101), dtype: object
{'statsDataId': '0000010101', 'cdCat01': 'A9111,A9112'}
('household-prefecture', '都道府県の一般世帯数', 'prefecture', 10101)
categoryCode    A710101
Name: (household-prefecture, 都道府県の一般世帯数, prefecture, 10101), dtype: object
{'statsDataId': '0000010101', 'cdCat01': 'A710101'}
('separated-prefecture', '都道府県の離別人口', 'prefecture', 10101)
categoryCode    A1604011,A1604012,A1604021,A1604022,A1604031,A...
Name: (separated-prefecture, 都道府県の離別人口, prefecture, 10101), dtype: object
{'statsDataId': '0000010101', 'cdCat01': 'A1604011,A1604012,A1604021,A1604022,A1604031,A1604032,A1604041,A1604042,A1604051,A1604052,A1604061,A1604062,