# 実行環境の作成

## Google Drive

Google Driveをマウント

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Google Cloud Storage

下記コードでGCPに接続

In [2]:
from google.colab import auth
auth.authenticate_user()

認証に成功したらgcsfuseをインストール

In [None]:
!echo "deb http://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!apt-get -y -q update
!apt-get -y -q install gcsfuse

バケット「statistics-hyogo」をディレクトリ「statistics-hyogo」にマウント

In [None]:
! mkdir -p statistics-hyogo
! gcsfuse --implicit-dirs --limit-bytes-per-sec -1 --limit-ops-per-sec -1 statistics-hyogo statistics-hyogo

## Google Sheets

ライブラリのインストール

In [5]:
!pip install --upgrade -q gspread

[?25l[K     |████████                        | 10 kB 20.2 MB/s eta 0:00:01[K     |████████████████▏               | 20 kB 16.4 MB/s eta 0:00:01[K     |████████████████████████▎       | 30 kB 10.6 MB/s eta 0:00:01[K     |████████████████████████████████| 40 kB 3.4 MB/s 
[?25h

GoogleSheetsの認証

In [6]:
from google.colab import auth
from google.auth import default
import gspread

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# estatの統計情報を取得

メタ情報を取得する関数を作成

In [16]:
import urllib.parse
import urllib.request
import json

def get_estat_meta(statsDataId):
  params ={
      'statsDataId': statsDataId,
      'appId': '724e5b90772a3e9289f41a253e4e7e32438f4fff',
  }

  url = 'http://api.e-stat.go.jp/rest/3.0/app/json/getMetaInfo?'
  url += urllib.parse.urlencode(params)

  with urllib.request.urlopen(url) as response:
    return json.loads(response.read().decode('utf-8'))

In [None]:
meta = get_estat_meta('0000020211')
meta

## DataFrameの作成

メタ情報をDataFrameに整形する

In [17]:
import pandas as pd

def get_cards_dataframe(statsDataId):
  # メタ情報の取得
  meta= get_estat_meta(statsDataId)

  # CLASS_OBJ
  CLASS_OBJ = meta['GET_META_INFO']['METADATA_INF']['CLASS_INF']['CLASS_OBJ']
  cat01 = next((d for d in CLASS_OBJ if d['@id'] == 'cat01'), None)['CLASS']

  # DataFrame
  df = pd.json_normalize(cat01)[['@code','@name']]
  
  # 統計表情報の追加
  df['statsDataId'] = statsDataId
  df['statsDataName'] = meta['GET_META_INFO']['METADATA_INF']['TABLE_INF']['STAT_NAME']['$']

  # 列名の変更
  columns = {'@code':'categoryCode', '@name':'categoryName', '@unit':'unit'}
  df= df.rename(columns=columns)

  # categoryNameから不要な情報（categoryCode）を削除
  df['categoryName'] = df.apply(lambda x: x['categoryName'].replace(x['categoryCode']+'_', ''), 1)
  
  # 並べ替え
  df = df.reindex(columns=['statsDataId', 'statsDataName', 'categoryCode','categoryName','unit'])
  
  return df

In [18]:
df = get_cards_dataframe('0000020211')
df

Unnamed: 0,statsDataId,statsDataName,categoryCode,categoryName,unit
0,20211,社会・人口統計体系,K3101,交通事故発生件数,
1,20211,社会・人口統計体系,K4201,刑法犯認知件数,


## 統計カード管理情報をSpreadSheetで加工

統計カードの管理情報を SpreadSheetで管理する

前セルで作業したDataFrameをcsvでコピーするなどして作業すること


[statistics-hyogo/統計カード管理](https://docs.google.com/spreadsheets/d/1mAv2gx9khNOty_ILu3aJjJz0uoRkARzwoaG4VBUSNG4/edit#gid=0)

# コンテンツ作成

## jsonを保存する関数

In [None]:
def save_json(dic,path):
  
  # Google Driveへ保存
  drive_path = "drive/MyDrive/statistics-hyogo/{}".format(path)
  
  with open(drive_path, 'w') as f:
    json.dump(dic,f,ensure_ascii=False)
  
  # GoogleCloudStorage
  gcs_path = "statistics-hyogo/{}".format(path)
  with open(gcs_path, 'w') as f:
    json.dump(dic,f,ensure_ascii=False)

## 統計分野一覧

GoogleSheetsから統計分野一覧を取得

In [None]:
def get_statistics_fields():

  # シート情報の設定
  id = "1mAv2gx9khNOty_ILu3aJjJz0uoRkARzwoaG4VBUSNG4"
  sheet_name = 'statisticsFields'
  worksheet = gc.open_by_key(id).worksheet(sheet_name)
  
  return worksheet.get_all_records()

In [None]:
statistics_fields = get_statistics_fields()
print(statistics_fields)

[{'index': 1, 'fieldId': 'landweather', 'fieldTitle': '国土・気象', 'icon': 'WbSunny'}, {'index': 2, 'fieldId': 'population', 'fieldTitle': '人口・世帯', 'icon': 'Group'}, {'index': 3, 'fieldId': 'laborwage', 'fieldTitle': '労働・賃金', 'icon': 'Payments'}, {'index': 4, 'fieldId': 'agriculture', 'fieldTitle': '農林水産業', 'icon': 'Agriculture'}, {'index': 5, 'fieldId': 'miningindustry', 'fieldTitle': '鉱工業', 'icon': 'Factory'}, {'index': 6, 'fieldId': 'commercial', 'fieldTitle': '商業・サービス業', 'icon': 'Store'}, {'index': 7, 'fieldId': 'economy', 'fieldTitle': '企業・家計・経済', 'icon': 'LocationCity'}, {'index': 8, 'fieldId': 'construction', 'fieldTitle': '住宅・土地・建設', 'icon': 'HomeWork'}, {'index': 9, 'fieldId': 'energy', 'fieldTitle': 'エネルギー・水', 'icon': 'LocalDrink'}, {'index': 10, 'fieldId': 'tourism', 'fieldTitle': '運輸・観光', 'icon': 'LocalShipping'}, {'index': 11, 'fieldId': 'educationsports', 'fieldTitle': '教育・文化・スポーツ', 'icon': 'School'}, {'index': 12, 'fieldId': 'administrativefinancial', 'fieldTitle': '行財政', 'i

GoogleDriveとGoogleCloudStorageに保存

In [None]:
dic = get_statistics_fields()
path = "contents/statistics-fields.json"

save_json(dic,path)

## 統計項目一覧

GoogleSheetsから統計項目一覧を取得

In [None]:
def get_statistics_menus():

  # シート情報の設定
  id = "1mAv2gx9khNOty_ILu3aJjJz0uoRkARzwoaG4VBUSNG4"
  sheet_name = 'statisticsMenus'
  worksheet = gc.open_by_key(id).worksheet(sheet_name)
  
  return worksheet.get_all_records()

In [None]:
statistics_menus = get_statistics_menus()
print(statistics_menus)

[{'index': 11, 'menuId': 'area', 'menuTitle': '面積', 'fieldId': 'landweather', 'fieldTitle': '国土・気象'}, {'index': 12, 'menuId': 'climate', 'menuTitle': '気候', 'fieldId': 'landweather', 'fieldTitle': '国土・気象'}, {'index': 13, 'menuId': 'park', 'menuTitle': '自然・公園', 'fieldId': 'landweather', 'fieldTitle': '国土・気象'}, {'index': 21, 'menuId': 'population', 'menuTitle': '人口', 'fieldId': 'population', 'fieldTitle': '人口・世帯'}, {'index': 22, 'menuId': 'household', 'menuTitle': '世帯', 'fieldId': 'population', 'fieldTitle': '人口・世帯'}, {'index': 23, 'menuId': 'marriage', 'menuTitle': '婚姻', 'fieldId': 'population', 'fieldTitle': '人口・世帯'}, {'index': 24, 'menuId': 'birth', 'menuTitle': '出生', 'fieldId': 'population', 'fieldTitle': '人口・世帯'}, {'index': 25, 'menuId': 'death', 'menuTitle': '死亡', 'fieldId': 'population', 'fieldTitle': '人口・世帯'}, {'index': 26, 'menuId': 'inflowoutflow', 'menuTitle': '流入・流出', 'fieldId': 'population', 'fieldTitle': '人口・世帯'}, {'index': 27, 'menuId': 'moveinmoveout', 'menuTitle': '転入・転出'

GoogleDriveとGoogleCloudStorageに保存

In [None]:
dic = get_statistics_menus()
path = "contents/statistics-menus.json"

save_json(dic,path)

## 統計カード一覧

GoogleSheetsから統計カード一覧を取得してDataFrameに変換

In [None]:
import numpy as np

def get_statistics_cards():

  # シート情報の設定
  id = "1mAv2gx9khNOty_ILu3aJjJz0uoRkARzwoaG4VBUSNG4"
  sheet_name = 'statisticsCards'
  worksheet = gc.open_by_key(id).worksheet(sheet_name)

  # データを取得してDataFrameに格納
  dic = worksheet.get_all_records()
  df = pd.DataFrame(dic)

  # 空白行を削除
  df['cardId'].replace('', np.nan, inplace=True)
  df.dropna(subset=['cardId'], inplace=True)

  # カテゴリ情報
  df_categories = set_categories(df)

  # estat-APIのパラメータ情報
  df_estatParams = set_estat_params(df)

  # 重複を排除して必要な列だけ抽出
  df_res = df.drop_duplicates(subset='cardId')
  df_res = df_res[['cardId','cardTitle','cardIndex','governmentType','menuId','menuTitle','fieldId','fieldTitle','chartComponent']]

  # マージ
  df_res = pd.merge(df_res, df_categories, on='cardId', how='outer')
  df_res = pd.merge(df_res, df_estatParams, on='cardId', how='outer')
  
  return df_res

"""
カテゴリ情報をセットする関数
"""
def set_categories(df_arg):
  df_res = df_arg.copy()

  # オブジェクト作成
  def func(row):
    return {
        'categoryCode' : row.categoryCode,
        'categoryName' : row.categoryName,
        'isSelect' : row.isSelect,
        'type' : row.type,
        'yAxis' : row.yAxis
    }
  df_res['categories'] = df_res.apply(func, 1)
  
  # cardIdでグループ化
  df_res = df_res.groupby('cardId')['categories'].apply(list).reset_index()
  
  return df_res 

"""
estat-APIのパラメータをセットする関数
"""
def set_estat_params(df_arg):
  # statsDataId
  df_statsDataId = df_arg.drop_duplicates(subset='cardId').loc[:,['cardId','statsDataId']]
  df_statsDataId['statsDataId'] = df_statsDataId.apply(lambda x: str(x['statsDataId']).zfill(10), 1)
  
  # cdCat01
  df_cat01 = df_arg[['cardId','categoryCode']] \
             .rename(columns={'categoryCode':'cdCat01'})  \
             .groupby(['cardId']).agg({'cdCat01': lambda x: ','.join(x)}).reset_index()

  # オブジェクトを'estatParams'に格納
  df_res = pd.merge(df_statsDataId, df_cat01, on='cardId', how='outer')
  def func(row):
   return {
        'statsDataId' : row.statsDataId,
        'cdCat01' : row.cdCat01,
    }
  df_res['estatParams'] = df_res.apply(func, 1)
  

  return df_res[['cardId','estatParams']]


In [None]:
df_statistics_cards = get_statistics_cards()
df_statistics_cards

GoogleDriveとGoogleCloudStorageに保存

In [None]:
def save_json(df,path):
  
  # Google Driveへ保存
  drive_path = "drive/MyDrive/statistics-hyogo/{}".format(path)
  df.reset_index().to_json(drive_path,orient='records',force_ascii=False)
  
  # GoogleCloudStorage
  gcs_path = "statistics-hyogo/{}".format(path)
  df.reset_index().to_json(gcs_path,orient='records',force_ascii=False)

In [None]:
df = get_statistics_cards()
path = "contents/statistics-cards.json"

save_json(df,path)

## routes

都道府県一覧

In [None]:
def get_preflist():
  with open('drive/MyDrive/statistics-hyogo/resas/preflist.json') as f:
    return json.load(f)

市区町村一覧

In [None]:
def get_citylist():
  with open('drive/MyDrive/statistics-hyogo/resas/citylist.json') as f:
    return json.load(f)

print(get_citylist())

[{'prefCode': 1, 'cityCode': '01100', 'cityName': '札幌市', 'bigCityFlag': '2'}, {'prefCode': 1, 'cityCode': '01101', 'cityName': '札幌市中央区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01102', 'cityName': '札幌市北区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01103', 'cityName': '札幌市東区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01104', 'cityName': '札幌市白石区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01105', 'cityName': '札幌市豊平区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01106', 'cityName': '札幌市南区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01107', 'cityName': '札幌市西区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01108', 'cityName': '札幌市厚別区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01109', 'cityName': '札幌市手稲区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01110', 'cityName': '札幌市清田区', 'bigCityFlag': '1'}, {'prefCode': 1, 'cityCode': '01202', 'cityName': '函館市', 'bigCityFlag': '0'}, {'prefCode': 1, 'cityCode': '01203', 'cityName': 

統計カード一覧

In [None]:
def get_statistics_cards():
  with open('drive/MyDrive/statistics-hyogo/contents/statistics-cards.json') as f:
    return json.load(f)

print(get_statistics_cards())

[{'index': 0, 'cardId': 'total-population-prefecture', 'cardTitle': '都道府県の総人口', 'cardIndex': 1, 'governmentType': 'prefecture', 'menuId': 'population', 'menuTitle': '人口', 'fieldId': 'population', 'fieldTitle': '人口・世帯', 'chartComponent': 'TimeChart', 'categories': [{'categoryCode': 'A1101', 'categoryName': '総人口', 'isSelect': 'TRUE', 'type': 'column', 'yAxis': 0}, {'categoryCode': 'A110101', 'categoryName': '総人口（男）', 'isSelect': '', 'type': 'column', 'yAxis': 0}, {'categoryCode': 'A110102', 'categoryName': '総人口（女）', 'isSelect': '', 'type': 'column', 'yAxis': 0}], 'estatParams': {'statsDataId': '0000010101', 'cdCat01': 'A1101,A110101,A110102'}}, {'index': 1, 'cardId': 'japanese-population-prefecture', 'cardTitle': '都道府県の日本人人口', 'cardIndex': '', 'governmentType': 'prefecture', 'menuId': 'population', 'menuTitle': '人口', 'fieldId': 'population', 'fieldTitle': '人口・世帯', 'chartComponent': 'TimeChart', 'categories': [{'categoryCode': 'A1102', 'categoryName': '日本人人口', 'isSelect': 'TRUE', 'type': 

routes一覧を作成

In [None]:
def generate_routes(cards):

  res = []
  for card in cards:
    
    # 値の取得
    governmentType = card['governmentType']
    fieldId = card['fieldId']
    menuId = card['menuId']
    cardId = card['cardId']
  
    # 地域コード一覧作成
    codes = []
    if governmentType == 'prefecture':
      codes = [d.get('prefCode') for d in get_preflist()]
      codes = [f'{n:02}'+'000' for n in codes]
    elif governmentType == 'city':
      codes = [d.get('cityCode') for d in get_citylist()]

    # routes作成
    for code in codes:
      route = '/{}/{}/{}/{}/{}/'.format(governmentType,code,fieldId,menuId,cardId)
      res.append(route)

  return res


In [None]:
statistics_cards = get_statistics_cards()
routes = generate_routes(statistics_cards)
print(routes)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



GoogleDriveとGoogleCloudStorageに保存

In [None]:
def save_json(dic,path):
  
  # Google Driveへ保存
  drive_path = "drive/MyDrive/statistics-hyogo/{}".format(path)
  
  with open(drive_path, 'w') as f:
    json.dump(dic,f,ensure_ascii=False)
  
  # GoogleCloudStorage
  gcs_path = "statistics-hyogo/{}".format(path)
  with open(gcs_path, 'w') as f:
    json.dump(dic,f,ensure_ascii=False)

In [None]:
routes = generate_routes(statistics_cards)
path = "routes/routes.json"

save_json(routes,path)

## sitemap.xml

サイトマップに反映させるroutesは兵庫県だけ

In [None]:
def generate_routes_sitemap(cards):

  res = []
  for card in cards:
    
    # 値の取得
    governmentType = card['governmentType']
    fieldId = card['fieldId']
    menuId = card['menuId']
    cardId = card['cardId']

    # 地域コード一覧作成
    codes = []
    if governmentType == 'prefecture':
      codes = ['28000']
    elif governmentType == 'city':
      citylist = list(filter(lambda x: x['prefCode'] == 28, get_citylist()))
      codes = [d.get('cityCode') for d in citylist]

    # routes作成
    for code in codes:
      route = '/{}/{}/{}/{}/{}/'.format(governmentType,code,fieldId,menuId,cardId)
      res.append(route)

  return res

In [None]:
statistics_cards = get_statistics_cards()
routes_sitemap = generate_routes_sitemap(statistics_cards)
print(routes_sitemap)

['/prefecture/28000/population/population/total-population-prefecture/', '/prefecture/28000/population/marriage/unmarried-prefecture/', '/prefecture/28000/population/marriage/spouse-prefecture/', '/prefecture/28000/population/marriage/separated-prefecture/', '/prefecture/28000/population/death/standardized-mortality-prefecture/', '/prefecture/28000/population/household/household-prefecture/', '/prefecture/28000/population/household/single-mother-household-prefecture/', '/prefecture/28000/population/marriage/first-marriage-age-prefecture/', '/prefecture/28000/population/marriage/divorce-prefecture/', '/city/28100/population/population/total-population-city/', '/city/28101/population/population/total-population-city/', '/city/28102/population/population/total-population-city/', '/city/28105/population/population/total-population-city/', '/city/28106/population/population/total-population-city/', '/city/28107/population/population/total-population-city/', '/city/28108/population/populatio

sitemap.xmlを保存

In [None]:
import xml.etree.ElementTree as ET
import datetime
import tempfile

def generate_sitemap(path):

    urlset = ET.Element('urlset')
    urlset.set('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9')
    urlset.set('xmlns:news', 'http://www.google.com/schemas/sitemap-news/0.9')
    urlset.set('xmlns:xhtml', 'http://www.w3.org/1999/xhtml')
    urlset.set('xmlns:mobile', 'http://www.google.com/schemas/sitemap-mobile/1.0')
    urlset.set('xmlns:image', 'http://www.google.com/schemas/sitemap-image/1.1')
    urlset.set('xmlns:video', 'http://www.google.com/schemas/sitemap-video/1.1')
    tree = ET.ElementTree(element=urlset)

    for route in routes():
        url = 'https://statistics-hyogo.com'+route
        updated = datetime.date.today()
        updated = updated.strftime('%Y-%m-%d')
        url_element = ET.SubElement(urlset, 'url')
        loc = ET.SubElement(url_element, 'loc')
        loc.text = url
        lastmod = ET.SubElement(url_element, 'lastmod')
        lastmod.text = updated
        
    _, temp_local_filename = tempfile.mkstemp()
    tree.write(temp_local_filename, encoding='utf-8', xml_declaration=True)
        
        
    # bucket = storage_client.bucket('statistics-hyogo')
    # blob = bucket.blob('sitemap.xml')
    # blob.upload_from_filename(temp_local_filename)
    # os.remove(temp_local_filename)

    return 'sitemap.xml generated !!'

# BigQueryへ保存

ライブラリのインストール

In [None]:
!pip install --upgrade -q gspread

SpreadSheetの認証

In [None]:
from google.colab import auth
from google.auth import default
import gspread

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

worksheetを読み込んでDataFrameに変換

In [None]:
# シート情報の設定
id = "1mAv2gx9khNOty_ILu3aJjJz0uoRkARzwoaG4VBUSNG4"
sheet_name = 'statisticsCards'
worksheet = gc.open_by_key(id).worksheet(sheet_name)

# DataFrameに格納
dic = worksheet.get_all_records(head=1)
df = pd.DataFrame(dic)
df

Unnamed: 0,index,statsDataId,statsDataName,categoryCode,categoryName,governmentType,cardIndex,cardId,cardTitle,chartComponent,isSelect,type,yAxis,menuId,menuTitle,fieldId,fieldTitle
0,TRUE,10101,社会・人口統計体系,A1101,総人口,prefecture,1,total-population-prefecture,都道府県の総人口,TimeChart,TRUE,column,0,population,人口,population,人口・世帯
1,TRUE,10101,社会・人口統計体系,A110101,総人口（男）,prefecture,1,total-population-prefecture,都道府県の総人口,TimeChart,,column,0,population,人口,population,人口・世帯
2,TRUE,10101,社会・人口統計体系,A110102,総人口（女）,prefecture,1,total-population-prefecture,都道府県の総人口,TimeChart,,column,0,population,人口,population,人口・世帯
3,TRUE,10101,社会・人口統計体系,A1102,日本人人口,prefecture,,japanese-population-prefecture,都道府県の日本人人口,TimeChart,TRUE,column,0,population,人口,population,人口・世帯
4,TRUE,10101,社会・人口統計体系,A110201,日本人人口（男）,prefecture,,japanese-population-prefecture,都道府県の日本人人口,TimeChart,,column,0,population,人口,population,人口・世帯
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5252,TRUE,20210,社会・人口統計体系,J4107,国民健康保険被保険者1人当たり診療費,city,,national-health-insurance-city,市区町村の国民健康保険被保険者数,TimeChart,,column,1,medical,医療,socialsecurity,医療・社会保障・衛生
5253,TRUE,20210,社会・人口統計体系,J4108,国民健康保険給付金額,city,,national-health-insurance-benefit-amount-city,市区町村の国民健康保険給付金額,TimeChart,TRUE,column,1,medical,医療,socialsecurity,医療・社会保障・衛生
5254,TRUE,20210,社会・人口統計体系,J4109,国民健康保険医療費金額（診療費）,city,,,,,,,,,,,
5255,TRUE,20211,社会・人口統計体系,K3101,交通事故発生件数,city,,incident-city,市区町村の交通事故事件件数,TimeChart,TRUE,line,0,traffic-accident,交通事故,safetyenvironment,司法・安全・環境


pandas-gbqのインストール

In [None]:
!pip install pandas-gbq

DataFrameをBigQueryへ保存

In [None]:
import pandas as pd

# プロジェクトの定義
project_id='primal-buttress-342908'
dataset_id = 'contents'
table_id='cards'

# データフレームの内容をBigQueryのテーブルに追加
df.to_gbq( f'{dataset_id}.{table_id}', project_id=project_id, if_exists="replace")

ArrowTypeError: ignored