# 実行環境の作成

## Google Drive

Google Driveをマウント

In [2]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


## Google Sheets

ライブラリのインストール

In [3]:
!pip install --upgrade -q gspread

Google Sheetsの認証

In [4]:
from google.colab import auth
from google.auth import default
import gspread

# Run the Colab sign-in flow for the current user.
auth.authenticate_user()
# Fetch the default credentials; the second return value is not used here.
creds, _ = default()
# gspread client authorized with those credentials.
gc = gspread.authorize(creds)

## Google Cloud Storage

下記コードでGCPに接続

In [5]:
from google.colab import auth
# Colab sign-in; the same call is also made in the Google Sheets authentication cell.
auth.authenticate_user()

認証に成功したらgcsfuseをインストール

In [None]:
!echo "deb http://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!apt-get -y -q update
!apt-get -y -q install gcsfuse

バケット「statistics-hyogo」をディレクトリ「statistics-hyogo」にマウント

In [None]:
! mkdir -p statistics-hyogo
! gcsfuse --implicit-dirs --limit-bytes-per-sec -1 --limit-ops-per-sec -1 statistics-hyogo statistics-hyogo

## ESTAT_APPID

In [8]:
import os

# e-Stat API application ID, sent as the 'appId' query parameter.
# The ESTAT_APPID environment variable takes precedence; the literal is kept
# only as a backward-compatible fallback.
# NOTE(review): this ID is committed in clear text - rotate it and drop the fallback.
ESTAT_APPID = os.environ.get('ESTAT_APPID', '724e5b90772a3e9289f41a253e4e7e32438f4fff')

## pythonのライブラリ追加

In [9]:
import urllib.parse
import urllib.request
import json
import pandas as pd

# 統計カード管理情報を取得・加工

## BigQueryから統計カード管理情報を読み込む

pandas-gbqのインストール

In [None]:
!pip install pandas-gbq

BigQueryをDataFrameで取得

In [11]:
def get_cards_management_dataframe():
  """Load the statistics-card management table from BigQuery.

  Returns:
    DataFrame with every column and row of table ``contents.cards``,
    queried in project ``primal-buttress-342908``.
  """
  # Where the table lives
  project_id = 'primal-buttress-342908'
  dataset_id, table_id = 'contents', 'cards'

  # Select the whole table
  query = f"""
  SELECT *
  FROM {dataset_id}.{table_id}
  """

  # dialect='standard' selects standard SQL
  return pd.read_gbq(query, project_id, dialect='standard')

In [None]:
# Load the card management table and show it as the cell output.
df_cardmng = get_cards_management_dataframe()
df_cardmng

## 統計カードリストの作成

統計カード（cardId, cardTitle）のリストを作成する

In [13]:
def get_cardliist(df_cardmng):
  """Collapse the card management table into one row per card.

  Args:
    df_cardmng: DataFrame with at least the columns 'cardId', 'cardTitle',
      'governmentType', 'statsDataId' and 'categoryCode'.

  Returns:
    DataFrame indexed by (cardId, cardTitle, governmentType, statsDataId)
    whose single column 'categoryCode' holds the group's category codes
    joined with ','.
  """
  group_keys = ['cardId','cardTitle','governmentType','statsDataId']

  # Rows without a cardId do not belong to any card
  rows_with_card = df_cardmng.dropna(subset=['cardId'])

  # One row per card, category codes concatenated
  return rows_with_card.groupby(group_keys).agg(
    {'categoryCode': lambda codes: ','.join(codes)}
  )

In [14]:
# Build the per-card list from the management table and show it as the cell output.
df_cardlist = get_cardliist(df_cardmng)
df_cardlist

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,categoryCode
cardId,cardTitle,governmentType,statsDataId,Unnamed: 4_level_1
divorce-city,市区町村の離婚件数,city,20201,A9201
divorce-prefecture,都道府県の離婚件数,prefecture,10101,A9201
first-marriage-age-prefecture,都道府県の平均初婚年齢,prefecture,10101,"A9111,A9112"
household-prefecture,都道府県の一般世帯数,prefecture,10101,A710101
separated-prefecture,都道府県の離別人口,prefecture,10101,"A1604011,A1604012,A1604021,A1604022,A1604031,A..."
single-mother-household-city,市区町村の母子世帯数,city,20201,"A8401,A840101,A840102,A840103"
single-mother-household-prefecture,都道府県の母子世帯数,prefecture,10101,"A8401,A840101,A840102,A840103,A840201,A840202,..."
spouse-prefecture,都道府県の有配偶人口,prefecture,10101,"A1602021,A1602022,A1602031,A1602032,A1602041,A..."
standardized-mortality-prefecture,都道府県の標準化死亡率,prefecture,10101,A4301
total-population-city,市区町村の総人口,city,20201,"A1101,A110101,A110102"


# 統計カードのデータセット

## 地域コード一覧

In [15]:
import os

# RESAS API key, sent in the 'X-API-KEY' request header by the RESAS helpers.
# The RESAS_API_KEY environment variable takes precedence; the literal is kept
# only as a backward-compatible fallback.
# .strip() guards against stray surrounding whitespace (the literal previously
# started with a TAB character).
# NOTE(review): this key is committed in clear text - rotate it and drop the fallback.
RESAS_API_KEY = os.environ.get('RESAS_API_KEY', '02JYzLqUOfTNVfiTWjLlR2g0YwEQPQ7caYxcnZaZ').strip()

都道府県一覧

In [16]:
import json
import urllib.request

def get_resas_preflist():
    """Fetch the prefecture list from the RESAS API.

    Returns:
        The value stored under 'result' in the decoded JSON response.
    """
    request = urllib.request.Request(
        'https://opendata.resas-portal.go.jp/api/v1/prefectures',
        headers={'X-API-KEY': RESAS_API_KEY},
    )
    with urllib.request.urlopen(request) as response:
        payload = json.loads(response.read().decode())
    return payload['result']

市区町村一覧

In [17]:
import json
import urllib.request

def get_resas_citylist(prefCode=0,designatedCity='all'):
    """Fetch the municipality list from the RESAS API.

    Args:
        prefCode: Prefecture code to query; 0 requests the municipalities
            of all prefectures (no 'prefCode' query parameter is sent).
        designatedCity: How ordinance-designated cities are handled.
            'join'  - designated cities merged: entries whose 'bigCityFlag'
                      is '1' are dropped.
            'split' - designated cities split: entries whose 'bigCityFlag'
                      is '2' are dropped.
            anything else ('all') - the list is returned unfiltered.

    Returns:
        List of the entries found under 'result' in the JSON response.
    """
    endpoint = 'https://opendata.resas-portal.go.jp/api/v1/cities'
    if prefCode != 0:
        endpoint = 'https://opendata.resas-portal.go.jp/api/v1/cities?prefCode={}'.format(prefCode)

    request = urllib.request.Request(endpoint, headers={'X-API-KEY': RESAS_API_KEY})
    with urllib.request.urlopen(request) as response:
        payload = json.loads(response.read().decode())

    cities = payload['result']

    # Pick the flag value to exclude, or return everything
    if designatedCity == 'join':
        excluded_flag = '1'
    elif designatedCity == 'split':
        excluded_flag = '2'
    else:
        return cities

    return [city for city in cities if city['bigCityFlag'] != excluded_flag]

地域コード一覧を取得する関数

In [18]:
import json
import urllib.request

def generate_estat_areas(governmentType='prefecture',prefCode=0):
  """Return the list of area codes for one government level.

  Args:
    governmentType: 'prefecture' for prefecture codes; any other value
      yields municipality codes.
    prefCode: Prefecture passed to ``get_resas_citylist`` in the
      municipality case (0 = all prefectures).

  Returns:
    For prefectures: each 'prefCode' zero-padded to two digits and suffixed
    with '000'. Otherwise: the 'cityCode' of every municipality entry.
  """
  # Municipalities: use the city codes as they come
  if governmentType != 'prefecture':
    return [city.get('cityCode') for city in get_resas_citylist(prefCode)]

  # Prefectures: 2-digit code + '000'
  return ['{:02}000'.format(pref.get('prefCode')) for pref in get_resas_preflist()]

北海道の市区町村数が１００以上あるので分割する

In [None]:
def generate_estat_city_areas():
  """List municipality codes grouped by prefecture.

  Returns:
    List of ``{'prefCode': ..., 'areaCodes': [...]}`` dicts, one per
    prefecture, except prefCode 1 (Hokkaido) which is emitted as two dicts:
    the first 99 codes and the remainder.
  """
  grouped = []
  for pref_code in [pref.get('prefCode') for pref in get_resas_preflist()]:
    codes = [city.get('cityCode') for city in get_resas_citylist(pref_code)]

    # Hokkaido is split in two; every other prefecture stays whole
    if pref_code == 1:
      parts = [codes[:99], codes[99:]]
    else:
      parts = [codes]

    for part in parts:
      grouped.append({'prefCode': pref_code, 'areaCodes': part})

  return grouped

## estat-APIのレスポンスを取得・整形する関数

In [78]:
def get_estat_response(params):
  """Call the e-Stat getStatsData endpoint (API v2.1, JSON) and decode the reply.

  The request is sent over HTTPS so that the application ID is not
  transmitted in clear text.

  Args:
    params: Mapping of query parameters (e.g. 'statsDataId', 'cdCat01',
      'cdArea'). It is not modified; 'appId' is added to a copy.

  Returns:
    The JSON response decoded into Python objects.
  """
  # Work on a copy so the caller's dict is left untouched
  query = params.copy()
  query['appId'] = ESTAT_APPID

  url = 'https://api.e-stat.go.jp/rest/2.1/app/json/getStatsData?'
  url += urllib.parse.urlencode(query)

  with urllib.request.urlopen(url) as response:
    return json.loads(response.read().decode('utf-8'))

In [79]:
def conv_estat_response_to_dataframe(response):
  """Flatten a getStatsData response into a DataFrame.

  Args:
    response: Decoded JSON. Read from it are
      GET_STATS_DATA.STATISTICAL_DATA.{CLASS_INF.CLASS_OBJ, TABLE_INF, DATA_INF.VALUE}.

  Returns:
    DataFrame built from VALUE, outer-merged with every class table so that
    each '@<id>' code column gains a matching '@<id>_name' column, plus the
    constant columns 'statsDataId' and 'statsDataName' taken from TABLE_INF.
  """
  statistical_data = response['GET_STATS_DATA']['STATISTICAL_DATA']
  class_objs = statistical_data['CLASS_INF']['CLASS_OBJ']
  table_inf = statistical_data['TABLE_INF']
  values = statistical_data['DATA_INF']['VALUE']

  # A one-element collection may arrive as a bare object instead of a list;
  # iterating a dict would walk its keys, so wrap it.
  if isinstance(class_objs, dict):
    class_objs = [class_objs]

  # Observations
  df_res = pd.json_normalize(values)

  for class_obj in class_objs:
    # Keep only code and label (json_normalize accepts a dict or a list)
    df_class = pd.json_normalize(class_obj['CLASS'])[['@code','@name']]

    # '@code' -> '@<id>', '@name' -> '@<id>_name'
    key_name = '@{}'.format(class_obj['@id'])
    df_class = df_class.rename(columns={'@code':key_name,'@name': key_name+'_name'})

    # Attach the labels to the observations
    df_res = pd.merge(df_res, df_class, on=key_name, how='outer')

  # Table-level information
  df_res['statsDataId'] = table_inf['@id']
  df_res['statsDataName'] = table_inf['STAT_NAME']['$']

  return df_res

In [80]:
def  format_estat_dataframe(df_arg):
  """Select, rename and clean the columns of a converted e-Stat frame.

  Row-wise ``apply(axis=1)`` is avoided: it is slow and, on an empty frame,
  returns a DataFrame instead of a Series, which breaks the column
  assignment. Plain per-column lists behave the same for non-empty input
  and also work when no rows are left.

  Args:
    df_arg: DataFrame containing at least 'statsDataId', 'statsDataName',
      '@cat01', '@cat01_name', '@time', '@time_name', '@area', '@area_name',
      '$' and '@unit'. It is not modified.

  Returns:
    New DataFrame with columns statsDataId, statsDataName, categoryCode,
    categoryName, timeCode, timeName, areaCode, areaName, value, unit where
    rows with any missing value are dropped, cells equal to '-' become '0',
    '<categoryCode>_' is removed from categoryName and timeCode is cut to
    its first four characters.
  """
  # Source column -> output column (None keeps the name)
  column_map = {
    'statsDataId': None,
    'statsDataName': None,
    '@cat01': 'categoryCode',
    '@cat01_name': 'categoryName',
    '@time': 'timeCode',
    '@time_name': 'timeName',
    '@area': 'areaCode',
    '@area_name': 'areaName',
    '$': 'value',
    '@unit': 'unit',
  }
  renames = {src: dst for src, dst in column_map.items() if dst is not None}

  df_res = df_arg[list(column_map)].rename(columns=renames)

  # Drop incomplete rows, then turn the '-' placeholder into '0'
  df_res = df_res.dropna()
  df_res = df_res.replace('-', '0')

  # Strip the '<categoryCode>_' marker from the category label
  df_res['categoryName'] = [
    name.replace(code + '_', '')
    for name, code in zip(df_res['categoryName'], df_res['categoryCode'])
  ]

  # Keep only the 4-character year part of the time code
  df_res['timeCode'] = [time_code[:4] for time_code in df_res['timeCode']]

  return df_res

In [81]:
def format_estat_dataframe_withrank(df_arg):
  """Add a per-year descending rank of 'value'.

  Changes compared with the earlier loop-based version:
    * ties use ``method='min'`` (competition ranking) instead of truncating
      the average rank, which gave e.g. rank 2 to three rows tied for first;
    * only the 'value' column is ranked, not every column of the frame;
    * rows come back grouped by ascending 'timeCode' (original order kept
      inside each year) instead of in arbitrary ``set`` iteration order;
    * the result is built in one pass rather than by repeated ``pd.concat``.

  Args:
    df_arg: DataFrame with the columns 'timeCode' and 'value'; 'value' must
      be convertible to float. It is not modified.
      NOTE(review): ranks are computed per timeCode only, so rows of
      different category codes are ranked against each other - confirm
      this is intended.

  Returns:
    New DataFrame with the input columns plus 'rank' (string); 'value' is
    returned as the string form of its float conversion (e.g. '5.0').
  """
  # Numeric copy for ranking
  df_res = df_arg.astype({'value': float})

  # Highest value gets rank 1 within each year
  ranks = df_res.groupby('timeCode')['value'].rank(ascending=False, method='min')
  # to_numpy(): positional assignment, safe even with duplicated index labels
  df_res['rank'] = ranks.astype(int).astype(str).to_numpy()

  # Deterministic order: by year, stable within a year
  df_res = df_res.sort_values('timeCode', kind='mergesort')

  return df_res.astype({'value': str})

In [None]:
def get_estat_dataframe(params):
  """Fetch one e-Stat request and return it as a formatted, ranked DataFrame.

  Args:
    params: Query parameters handed to ``get_estat_response``.

  Returns:
    The result of running the response through
    ``conv_estat_response_to_dataframe``, ``format_estat_dataframe`` and
    ``format_estat_dataframe_withrank``, in that order.
  """
  # Fetch and flatten
  raw_frame = conv_estat_response_to_dataframe(get_estat_response(params))

  # Clean up, then rank
  return format_estat_dataframe_withrank(format_estat_dataframe(raw_frame))

## 繰り返し処理

estat-APIのパラメータ'cdArea'を作成する関数

北海道は市区町村が100以上あるので、2つに分割する

返却値はリスト

In [46]:
def set_estat_params_area(governmentType='prefecture',prefCode=0):
  """Build the 'cdArea' parameter dict(s) for e-Stat requests.

  The earlier version also fetched the prefecture list into a local that was
  never used (and was computed from ``prefCode`` instead of the loop
  variable); that dead code and its extra RESAS request are removed.

  Args:
    governmentType: 'prefecture' or 'city'. Any other value yields [].
    prefCode: Prefecture whose municipalities are listed when
      governmentType is 'city' (passed through to ``get_resas_citylist``).

  Returns:
    List of ``{'cdArea': '<comma separated codes>'}`` dicts. Prefectures
    give a single dict; municipalities are split into groups of at most
    100 codes (presumably the per-request limit of the API - confirm).
  """
  chunk_size = 100

  # Prefectures: one request covers all of them
  if governmentType == 'prefecture':
    return [{'cdArea': ",".join(generate_estat_areas())}]

  # Municipalities: at most chunk_size codes per request
  if governmentType == 'city':
    citylist = get_resas_citylist(prefCode=prefCode,designatedCity='all')
    city_codes = [d.get('cityCode') for d in citylist]

    # max(..., 1) keeps the previous result for an empty list: one dict
    # with an empty 'cdArea'.
    return [
      {'cdArea': ",".join(city_codes[start:start + chunk_size])}
      for start in range(0, max(len(city_codes), 1), chunk_size)
    ]

  return []

In [49]:
# Show the generated 'cdArea' parameters: all prefectures, then the
# municipalities of prefCode 1 and prefCode 28.
print(set_estat_params_area())
print(set_estat_params_area(governmentType='city',prefCode=1))
print(set_estat_params_area(governmentType='city',prefCode=28))

[{'cdArea': '01000,02000,03000,04000,05000,06000,07000,08000,09000,10000,11000,12000,13000,14000,15000,16000,17000,18000,19000,20000,21000,22000,23000,24000,25000,26000,27000,28000,29000,30000,31000,32000,33000,34000,35000,36000,37000,38000,39000,40000,41000,42000,43000,44000,45000,46000,47000'}]
[{'cdArea': '01100,01101,01102,01103,01104,01105,01106,01107,01108,01109,01110,01202,01203,01204,01205,01206,01207,01208,01209,01210,01211,01212,01213,01214,01215,01216,01217,01218,01219,01220,01221,01222,01223,01224,01225,01226,01227,01228,01229,01230,01231,01233,01234,01235,01236,01303,01304,01331,01332,01333,01334,01337,01343,01345,01346,01347,01361,01362,01363,01364,01367,01370,01371,01391,01392,01393,01394,01395,01396,01397,01398,01399,01400,01401,01402,01403,01404,01405,01406,01407,01408,01409,01423,01424,01425,01427,01428,01429,01430,01431,01432,01433,01434,01436,01437,01438,01452,01453,01454,01455'}, {'cdArea': '01456,01457,01458,01459,01460,01461,01462,01463,01464,01465,01468,01469,01

'path'と'params'のリストを作成する。
'statsDataId'はdataframeからcsvコピーした際に数値に変換されてしまうため、10桁にゼロ埋めした文字列に戻す。

In [75]:
def set_estat_params():
  """Build, for every statistics card, the output path and e-Stat request parameters.

  Fixes compared with the earlier version:
    * ``type(x) == 'int'`` compared a type with a string and was always
      False; the ID is now normalised explicitly (integral floats such as
      10101.0 lose the '.0' before zero-padding to 10 characters);
    * municipality paths were formatted as '<cardId>.json_<prefCode>'; the
      prefecture code now goes before the extension;
    * the prefecture list and the per-prefecture area parameters are
      looked up once and reused instead of once per card;
    * area dicts are copied rather than mutated, so cached entries stay clean.

  Returns:
    List of ``{'path': str, 'params': [dict, ...]}``. Each params dict has
    the keys 'cdArea', 'statsDataId' and 'cdCat01'. Prefecture cards yield
    one entry; city cards yield one entry per prefecture.
  """
  df_cardmng = get_cards_management_dataframe()
  df_cardlist = get_cardliist(df_cardmng)

  def format_stats_data_id(stats_data_id):
    # The ID may have been turned into a number on the way here; restore
    # the 10-character zero-padded string form.
    if isinstance(stats_data_id, float) and stats_data_id.is_integer():
      stats_data_id = int(stats_data_id)
    return str(stats_data_id).zfill(10)

  area_cache = {}

  def build_params(government_type, pref_code, stats_data_id, category_codes):
    # Area parameters depend only on (government type, prefecture)
    key = (government_type, pref_code)
    if key not in area_cache:
      area_cache[key] = set_estat_params_area(government_type, pref_code)
    return [
      dict(area, statsDataId=stats_data_id, cdCat01=category_codes)
      for area in area_cache[key]
    ]

  pref_codes = None  # fetched lazily, only if a city card exists
  res = []
  for index, row in df_cardlist.iterrows():
    # Index levels: (cardId, cardTitle, governmentType, statsDataId)
    cardId = index[0]
    governmentType = index[2]
    statsDataId = format_stats_data_id(index[3])
    categoryCodes = row['categoryCode']

    # Prefecture card: a single output file
    if governmentType == 'prefecture':
      res.append({
          'path' : 'statistics-hyogo/test/{}.json'.format(cardId),
          'params' : build_params(governmentType, 0, statsDataId, categoryCodes)
        })

    # City card: one output file per prefecture
    elif governmentType == 'city':
      if pref_codes is None:
        pref_codes = [d.get('prefCode') for d in get_resas_preflist()]

      for pref_code in pref_codes:
        res.append({
            'path' : 'statistics-hyogo/test/{}_{}.json'.format(cardId,pref_code),
            'params' : build_params(governmentType, pref_code, statsDataId, categoryCodes)
          })

  return res

In [None]:
# Build the list of output paths and request parameters for every card.
cards = set_estat_params()

# Print the raw list for inspection.
print(cards)

estat-APIのレスポンスを整形してファイルに保存

In [None]:
# For every card: fetch all of its requests, combine them and format the result.
for card in cards:

  # Destination path (only used by the commented-out save at the end)
  path = card['path']

  # Convert each response, then concatenate once instead of growing a
  # DataFrame inside the loop.
  frames = [
    conv_estat_response_to_dataframe(get_estat_response(params))
    for params in card['params']
  ]
  if not frames:
    # Nothing was requested for this card
    continue
  df_res = pd.concat(frames)

  df = format_estat_dataframe(df_res)

  # df = format_estat_dataframe_withrank(df_res)
  print(df['timeCode'])

  # Save as JSON (currently disabled)
  # res_json = df.reset_index().to_json(path,orient='records',force_ascii=False)

0      1980
1      1980
2      1980
3      1980
4      1980
       ... 
610    2020
611    2020
612    2020
613    2020
614    2020
Name: timeCode, Length: 615, dtype: object
0      1980
1      1980
2      1980
3      1980
4      1980
       ... 
774    2020
775    2020
776    2020
777    2020
778    2020
Name: timeCode, Length: 779, dtype: object
0      1980
1      1980
2      1980
3      1980
4      1980
       ... 
692    2020
693    2020
694    2020
695    2020
696    2020
Name: timeCode, Length: 697, dtype: object
0       1980
1       1980
2       1980
3       1980
4       1980
        ... 
1102    2020
1103    2020
1104    2020
1105    2020
1106    2020
Name: timeCode, Length: 1107, dtype: object
0       1980
1       1980
2       1980
3       1980
4       1980
        ... 
3152    2020
3153    2020
3154    2020
3155    2020
3156    2020
Name: timeCode, Length: 3157, dtype: object
0       1980
1       1980
2       1980
3       1980
4       1980
        ... 
1717    2020
1718    20