In [None]:
# Install the required modules (lxml.html parsing and the cssselect() support it needs)
!pip install lxml cssselect

In [6]:
import os
import re
import codecs
import requests
import urllib
from urllib.request import urlopen
import lxml.html
import zipfile
 
import numpy as np
import pandas as pd
 
from google.colab import auth
 
# Local working directories. Every path ends with a separator so the functions
# below can build file names by plain string concatenation.
ROOT_DIR = '.' + os.path.sep + 'aozora' + os.path.sep
ZIP_DIR = ROOT_DIR + 'zip' + os.path.sep
TEXT_SJIS_DIR = ROOT_DIR + 'text_sjis' + os.path.sep
TEXT_UTF8_DIR = ROOT_DIR + 'text_utf8' + os.path.sep
# Destination prefix on GCS. 'バケット名' ("bucket name") is a placeholder that
# has to be replaced with the real bucket before running the upload.
GS_DIR = 'gs://バケット名/aozora_dataset/'
DATA_LIST_FILENAME = 'data_list.csv'

# NOTE(review): not referenced by the cells visible here -- confirm before removing.
base_url = 'http://www.aozora.gr.jp/'

# One entry per author; the four lists are parallel (same index = same author).
author_dict = {
    # Label used for directory names and as the class label in the data list.
    'author' : [
                'NatsumeSoseki',
                'DazaiOsamu',
                'MiyazawaKenji',
                'AkutagawaRyunosuke',
                'YumenoKyusaku',
                'FukuzawaYukichi'
                ],
    # Author index page whose links are scanned for work ("cards") pages.
    # All entries use https so every author is fetched over the same scheme.
    'url' : [
             'https://www.aozora.gr.jp/index_pages/person148.html#sakuhin_list_1',
             'https://www.aozora.gr.jp/index_pages/person35.html#sakuhin_list_1',
             'https://www.aozora.gr.jp/index_pages/person81.html#sakuhin_list_1',
             'https://www.aozora.gr.jp/index_pages/person879.html#sakuhin_list_1',
             'https://www.aozora.gr.jp/index_pages/person96.html#sakuhin_list_1',
             'https://www.aozora.gr.jp/index_pages/person296.html#sakuhin_list_1'
            ],
    # Maximum number of works to download; a negative value disables the limit
    # (see the `-1 < download_num` check in download_zip).
    'download_num' : [
                      12,
                      23,
                      27,
                      20,
                      8,
                      35
                      ],
    # Work titles (compared against the link text) that must not be downloaded.
    'exclusion_list' : [
        ['こころ', '永日小品'],
        ['走れメロス'],
        ['銀河鉄道の夜', '〔青びかる天弧のはてに〕', '青柳教諭を送る', '〔あくたうかべる朝の水〕'],
        ['羅生門'],
        ['ドグラ・マグラ'],
        ['学問のすすめ']
    ]
}
author_df = pd.DataFrame(author_dict)
 
 
def _zip_stem(link_text, href):
    """Return the archive name without extension, taken from the link text when possible."""
    parts = (link_text or '').split('.')
    if len(parts) >= 2:
        return parts[-2]
    # The anchor has no usable text (e.g. it wraps an image): fall back to the URL.
    return os.path.splitext(os.path.basename(href))[0]


def download_zip(author_df):
    """Download the zip archives of each author's works into ZIP_DIR/<author>/.

    For every row the author index page is fetched, each link containing
    'cards' is followed, and the first ``.zip`` link found on that page is
    queued. Queuing stops once ``download_num`` distinct archives are
    collected (a negative ``download_num`` means no limit). Works whose link
    text is listed in ``exclusion_list`` are skipped.

    Args:
        author_df: DataFrame with the columns ``author``, ``url``,
            ``download_num`` and ``exclusion_list``.
    """
    for _, row in author_df.iterrows():
        zip_dict = {}

        url = row['url']
        resp_root = requests.get(url)
        html_root = lxml.html.fromstring(resp_root.content)
        html_root.make_links_absolute(resp_root.url)

        print('author : {}'.format(row['author']))
        print('url : {}'.format(url))

        download_count = 0
        for a in html_root.cssselect('a'):
            if -1 < row['download_num'] <= download_count:
                break

            link = a.get('href')
            if link is None or 'cards' not in link:
                continue

            resp_card = requests.get(link)
            html_card = lxml.html.fromstring(resp_card.content)
            html_card.make_links_absolute(resp_card.url)
            link_zip_list = html_card.xpath('//a[contains(@href, ".zip")]')
            if not link_zip_list:
                continue

            link_zip = link_zip_list[0]
            zip_link = link_zip.get('href')
            zip_filename = _zip_stem(link_zip.text, zip_link)
            print('[{}],{},{}'. format(a.text, zip_filename, zip_link))

            # Skip works that are not used
            if a.text in row['exclusion_list']:
                print('    {} is excluded'.format(a.text))
                continue

            # Count an archive only once so that download_num really is the
            # number of distinct files that will be fetched.
            if zip_filename not in zip_dict:
                download_count += 1
            zip_dict[zip_filename] = zip_link

        download_path = ZIP_DIR + row['author'] + os.path.sep
        # makedirs(exist_ok=True) also works when ZIP_DIR itself is missing.
        os.makedirs(download_path, exist_ok=True)
        for i, key in enumerate(zip_dict):
            print('download[{}/{}]...    {} : {}'.format(i+1, len(zip_dict), key, zip_dict[key]))
            urllib.request.urlretrieve(zip_dict[key], download_path + os.path.basename(zip_dict[key]))
 
 
def extract_zip(author_df):
    """Extract every downloaded zip archive into TEXT_SJIS_DIR/<author>/.

    Only directory entries whose extension is exactly ``.zip`` are extracted;
    anything else found in ZIP_DIR/<author>/ is ignored.

    Args:
        author_df: DataFrame with an ``author`` column naming the
            sub-directories of ZIP_DIR to process.
    """
    for _, row in author_df.iterrows():
        author_zip_dir = ZIP_DIR + row['author'] + os.path.sep
        author_text_sjis_dir = TEXT_SJIS_DIR + row['author'] + os.path.sep
        # makedirs(exist_ok=True) creates missing parents and tolerates re-runs.
        os.makedirs(author_text_sjis_dir, exist_ok=True)

        for file in os.listdir(author_zip_dir):
            if os.path.splitext(file)[1] != '.zip':
                continue
            with zipfile.ZipFile(author_zip_dir + file, 'r') as zf:
                zf.extractall(path=author_text_sjis_dir)
                print('extract {} to {}'.format(author_zip_dir + file, author_text_sjis_dir))
 
 
def _iter_paragraphs(lines, re_ruby, re_note):
    """Yield the cleaned, non-empty body lines of one Aozora Bunko text.

    Blank lines, the block delimited by two '--------------------' lines and
    everything from the '底本：' footer onwards are dropped; ruby and editor
    notes are removed from the remaining lines.
    """
    blank_line_count = 0
    is_header = False
    for line in lines:
        line = line.strip()

        # Skip blank lines, but remember how many came in a row
        if len(line) == 0:
            blank_line_count += 1
            continue

        # Skip the header: each delimiter line toggles "inside the header"
        if line.startswith('--------------------'):
            is_header = not is_header
            continue
        if is_header:
            continue

        # Stop at the footer: a '底本：' line preceded by at least 3 blank lines
        if 3 <= blank_line_count and line.startswith('底本：'):
            break
        blank_line_count = 0

        # Remove ruby and notes
        edited_line = re_note.sub('', re_ruby.sub('', line.replace('\r', '')))
        if len(edited_line) == 0:
            continue
        yield edited_line


def prepare_data(author_df):
    """Convert the extracted Shift_JIS texts into one UTF-8 file per paragraph.

    Every ``.txt`` file in TEXT_SJIS_DIR/<author>/ is read, cleaned, and each
    remaining line is written to
    TEXT_UTF8_DIR/<author>_<source name>_<5-digit index>.txt.

    Args:
        author_df: DataFrame with an ``author`` column naming the
            sub-directories of TEXT_SJIS_DIR to process.
    """
    # Raw strings; these brackets are not regex metacharacters, so no escaping is needed.
    re_ruby = re.compile(r'《.+?》')
    re_note = re.compile(r'［＃.+?］')
    for _, row in author_df.iterrows():
        author_text_sjis_dir = TEXT_SJIS_DIR + row['author'] + os.path.sep

        for file in os.listdir(author_text_sjis_dir):
            root, ext = os.path.splitext(file)
            if ext != '.txt':
                continue

            sjis_filename = author_text_sjis_dir + file
            paragraph_index = 0
            # `with` closes the input even if decoding or writing fails midway.
            with codecs.open(sjis_filename, 'r', 'shift_jis') as fsjis:
                for paragraph in _iter_paragraphs(fsjis, re_ruby, re_note):
                    # Write one utf8 text file per paragraph
                    paragraph_index += 1
                    utf8_filename = '{}{}_{}_{}.txt'.format(TEXT_UTF8_DIR, row['author'], root, str(paragraph_index).zfill(5))
                    with codecs.open(utf8_filename, 'w', 'utf-8') as futf8:
                        futf8.write(paragraph)

            print('{} : {}'.format(sjis_filename, paragraph_index))
 
 
def create_data_list(filename):
    """Write a header-less CSV of (GCS file URL, author label) to ROOT_DIR/filename.

    One row is produced for every entry in TEXT_UTF8_DIR. The URL is
    GS_DIR + 'text/' + <file name>; the label is the part of the file name
    before the first underscore.

    Args:
        filename: Name of the CSV file to create inside ROOT_DIR.
    """
    # os.listdir() returns names in arbitrary order; sort so that the
    # generated CSV is identical from run to run.
    text_files = sorted(os.listdir(TEXT_UTF8_DIR))

    file_list = [GS_DIR + 'text/' + file for file in text_files]
    label_list = [os.path.splitext(file)[0].split('_')[0] for file in text_files]

    data_df = pd.DataFrame({'file_url': file_list, 'label': label_list})
    print('data_df : {}'.format(data_df))
    data_df.to_csv(ROOT_DIR + filename, index=False, header=False)


In [None]:
# Create the required directories. exist_ok=True keeps the cell re-runnable:
# without it a second run fails with FileExistsError.
for directory in (ZIP_DIR, TEXT_SJIS_DIR, TEXT_UTF8_DIR):
    os.makedirs(directory, exist_ok=True)

# Step 1: download the zip archives of the text data from Aozora Bunko
print()
print('# 青空文庫からテキストデータの zip ファイルをダウンロード ####################')
print()
download_zip(author_df)

# Step 2: extract the downloaded zip archives
print()
print('# ダウンロードした zip ファイルを解凍 ##################################')
print()
extract_zip(author_df)

# Step 3: pre-process the extracted text data
print()
print('# 解凍したテキストデータを前処理 ####################################')
print()
prepare_data(author_df)

# Step 4: create the data list file used for training
print()
print('# 学習で使用するデータリストファイルを作成 #############################')
print()
create_data_list(DATA_LIST_FILENAME)


In [7]:
# Authenticate to GCP with auth.authenticate_user()
auth.authenticate_user()


In [None]:
# Upload the text data files to GCS
# NOTE(review): the `*` is presumably expanded by the shell; with a very large
# number of paragraph files this could exceed the argument-length limit -- confirm.
!gsutil -m cp aozora/text_utf8/* gs://バケット名/aozora_dataset/text/
 
# Upload the data list file to GCS
!gsutil -m cp aozora/data_list.csv gs://バケット名/aozora_dataset/
