<a href="https://colab.research.google.com/github/cwl286/Javascript/blob/main/kakuyomu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# カクヨムから一括ダウンロードをしてepub/mobiを作る
- Create novel epub & mobi from https://kakuyomu.jp/explore
  - Mount Google Drive
  - Load htmls from Google Drive (saved in the last time processing if exist) 
  - (Starting from the last episode) or from the 1st episodes
  - Save novel's htmls and parsed htmls into "syosetu" folder in Google Drive
  - Save epub and mobi into "syosetu/epub" and "syosetu/mobi" folder in Google Drive

e.g. https://kakuyomu.jp/works/16816452219449457673  
e.g. KAKUYOMU: 16816452219449457673
 

In [1]:
##### INPUT AREA
#@title INPUT { run: "auto" }
KAKUYOMU="16816452219449457673" #@param {type:"string"}

In [2]:
##############################
# Mount gdrive 
##############################
from google.colab import drive
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


In [3]:
##############################
##### Initialize varibles
##############################
from bs4 import BeautifulSoup
import glob
import os
import subprocess
import requests
import shutil

# Init variables
BASE_URL= f"https://kakuyomu.jp/works/{KAKUYOMU}"
ORG_DIR= f"{KAKUYOMU}_org"
DRIVE_DIR = "syosetu"
EPISODES_URLs = []
CHAPTER_BEG = 0
CHAPTER_NUM = 1
TITLE = ""
CREATOR = ""
EPUB_NAME=f"{TITLE}.epub"
MOBI_NAME=f"{TITLE}.mobi"

# Make colab directories
!mkdir -p $KAKUYOMU
!mkdir -p $ORG_DIR

In [4]:
##############################
# Create google drive directories
##############################
try:
    os.mkdir(f"/content/gdrive/My Drive/{DRIVE_DIR}")
except Exception as e:
    print(e)

try:
    os.mkdir(f"/content/gdrive/My Drive/{DRIVE_DIR}/{KAKUYOMU}")
except Exception as e:
    print(e)

try:
    os.mkdir(f"/content/gdrive/My Drive/{DRIVE_DIR}/{ORG_DIR}")
except Exception as e:
    print(e)

try:
    os.mkdir(f"/content/gdrive/My Drive/{DRIVE_DIR}/epub")
except Exception as e:
    print(e)

try:
    os.mkdir(f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi")
except Exception as e:
    print(e)

[Errno 17] File exists: '/content/gdrive/My Drive/syosetu'
[Errno 17] File exists: '/content/gdrive/My Drive/syosetu/epub'
[Errno 17] File exists: '/content/gdrive/My Drive/syosetu/mobi'


In [5]:
##############################
# Clone google drive files to colab 
##############################
for html_path in glob.glob(f"/content/gdrive/My Drive/{DRIVE_DIR}/{KAKUYOMU}/*.html"):
  shutil.copy(html_path, f"{KAKUYOMU}/{os.path.basename(html_path)}")

In [6]:
##############################
##### Download main.html to update TITLE, CREATOR, CHAPTER_NUM
##############################
# Download query TITLE, CREATOR, CHAPTER_NUM

!curl $BASE_URL > main.html
with open("main.html") as f:
    text1 = f.read()
    soup1 = BeautifulSoup(text1, 'html.parser')
    TITLE  = str(soup1.find(id="workTitle").string)
    CREATOR  = str(soup1.find(id="workAuthor-activityName").string)
    EPISODES_URLs = soup1.find_all("a", class_="widget-toc-episode-episodeTitle")
os.remove("main.html")

# update paramter
CHAPTER_BEG = len(glob.glob(f'{KAKUYOMU}/*.html'))
CHAPTER_NUM = len(EPISODES_URLs)
EPUB_NAME=f"{TITLE}.epub"
MOBI_NAME=f"{TITLE}.mobi"


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  116k  100  116k    0     0   100k      0  0:00:01  0:00:01 --:--:--  100k


In [7]:
##############################
##### Print parameters 
##############################
print(TITLE)
print(CREATOR)
print([CHAPTER_BEG, CHAPTER_NUM])

世界でただ一人の魔物使い～転職したら魔王に間違われました～
筧千里
[0, 147]


In [8]:
##############################
##### Download HTMLs
##############################
for i in range(CHAPTER_BEG, CHAPTER_NUM):
  basename = os.path.basename(EPISODES_URLs[i].get("href"))
  url = f"{BASE_URL}/episodes/{basename}"
  file_name = f"{ORG_DIR}/{i+1:05d}.html"
  print(f"downloading: {i}/{CHAPTER_NUM}: {url}")
  !curl $url > $file_name

downloading: 0/147: https://kakuyomu.jp/works/16816452219449457673/episodes/16816452219449641762
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 52857    0 52857    0     0   135k      0 --:--:-- --:--:-- --:--:--  135k
downloading: 1/147: https://kakuyomu.jp/works/16816452219449457673/episodes/16816452219449648393
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 51254    0 51254    0     0   135k      0 --:--:-- --:--:-- --:--:--  134k
downloading: 2/147: https://kakuyomu.jp/works/16816452219449457673/episodes/16816452219449655165
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 51694    0 51694    0     0  88821      0 --:--:-- --:--:-- --:--:-- 8882

In [9]:
##############################
##### Parse HTML
##############################
TEMPLATE = """
<html>
  <head>
    <meta charset="UTF-8">
    {0}
  </head>
  <body>
    <h1>{1}</h1>
    {2}
    <hr/>
  </body>
</html>
"""

def extract_article(fname):
  with open(f"{ORG_DIR}/{fname}") as f:
    text = f.read()
  with open(f"{KAKUYOMU}/{fname}", "w") as f:
    soup = BeautifulSoup(text, 'html.parser')
    chapterTitle = str(soup.title)
    episodeTitle = soup.find("p", class_="widget-episodeTitle js-vertical-composition-item")
    episodeTitle = "" if episodeTitle is None else str(episodeTitle.string)
    content = str(soup.find("div", class_="widget-episodeBody js-episode-body")).replace("<br/>", "")
    f.write(TEMPLATE.format(chapterTitle, episodeTitle, content ))
   

fnames = [os.path.basename(f) for f in glob.glob(f'{ORG_DIR}/*.html')]
[extract_article(f) for f in fnames]
html_paths = sorted(glob.glob(f'{KAKUYOMU}/*.html'))  

In [10]:
##############################
##### Convert HTML to epud
##############################
meta1 = f'--metadata=title:"{TITLE}"'
meta2 = f'--metadata=author:"{CREATOR}"' 
meta3 = f'--metadata=lang:"ja"'

cmd = ['pandoc', '-o', EPUB_NAME, meta1, meta2, meta3]
cmd.extend(html_paths)
subprocess.call(cmd)

0

In [None]:
##############################
##### Install  https://calibre-ebook.com/download_linux
##############################
!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin

In [12]:
##############################
##### Convert epub to mobi
##### Refresh "Files" when done
##############################
cmd = ["ebook-convert",EPUB_NAME,MOBI_NAME]
subprocess.call(cmd) 

0

In [13]:
##############################
# Copy colab files to google drive
##############################
for html_path in glob.glob(f'{KAKUYOMU}/*.html'):
  shutil.copy(html_path, f"/content/gdrive/My Drive/{DRIVE_DIR}/{html_path}")
for html_path in glob.glob(f'{ORG_DIR}/*.html'):
  shutil.copy(html_path, f"/content/gdrive/My Drive/{DRIVE_DIR}/{html_path}")

In [14]:
if os.path.exists(f"/content/gdrive/My Drive/{DRIVE_DIR}/epub/{EPUB_NAME}"):
  os.remove(f"/content/gdrive/My Drive/{DRIVE_DIR}/epub/{EPUB_NAME}") # remove old epub
shutil.copy(EPUB_NAME, f"/content/gdrive/My Drive/{DRIVE_DIR}/epub")

'/content/gdrive/My Drive/syosetu/epub/世界でただ一人の魔物使い～転職したら魔王に間違われました～.epub'

In [15]:
if os.path.exists(f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi/{MOBI_NAME}"):
  os.remove(f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi/{MOBI_NAME}") # remove old mobi
shutil.copy(MOBI_NAME, f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi")

'/content/gdrive/My Drive/syosetu/mobi/世界でただ一人の魔物使い～転職したら魔王に間違われました～.mobi'