<a href="https://colab.research.google.com/github/cwl286/ncode-crawler/blob/main/ncode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 小説家になろうから一括ダウンロードをしてepub/mobiを作る

- Create novel epub & mobi from https://ncode.syosetu.com/
  - Mount Google Drive
  - Load the HTML files saved to Google Drive by the previous run (if any)
  - Resume from the episode after the last one saved, or start from the 1st episode if nothing was saved
  - Save novel's htmls and parsed htmls into "syosetu" folder in Google Drive
  - Save epub and mobi into "syosetu/epub" and "syosetu/mobi" folder in Google Drive

e.g. https://ncode.syosetu.com/n4698cv/
e.g. ncode: n4698cv
 

In [None]:
##### INPUT AREA
#@title INPUT { run: "auto" }
# Novel code ("ncode") of the work to download: the last path segment of the
# novel's URL, e.g. "n4698cv" for https://ncode.syosetu.com/n4698cv/.
# The `#@title` / `#@param` comments are Colab form annotations, not plain notes.
NCODE="n4444ge" #@param {type:"string"}

In [None]:
##############################
# Mount gdrive
##############################
# Mount Google Drive at /content/gdrive/ so that later cells can read and write
# files under "/content/gdrive/My Drive/".
from google.colab import drive
drive.mount("/content/gdrive/")

In [None]:
##############################
##### Initialize variables
##############################
from bs4 import BeautifulSoup
import glob
import os
import subprocess
import requests
import shutil

# Init vars
# CHAPTER_BEG / CHAPTER_NUM / TITLE / CREATOR are placeholders here; they are
# overwritten once the novel's index page has been downloaded and parsed.
CHAPTER_BEG = 1
CHAPTER_NUM = CHAPTER_BEG + 1
TITLE = ""
CREATOR = ""

BASE_URL=f"https://ncode.syosetu.com/{NCODE}/"  # index page; chapter i lives at BASE_URL + "i/"
ORG_DIR=f"{NCODE}_org"                          # local directory for the raw downloaded chapter pages
DRIVE_DIR = "syosetu"                           # top-level folder inside Google Drive

# Placeholders as well: recomputed from the real TITLE after the index page is parsed.
EPUB_NAME=f"{TITLE}.epub"
MOBI_NAME=f"{TITLE}.mobi"


# Make colab directories
# Created with os.makedirs instead of `!mkdir -p $NCODE` so that the
# user-supplied NCODE is never interpolated into a shell command line.
# exist_ok=True keeps the cell re-runnable, like `mkdir -p`.
os.makedirs(NCODE, exist_ok=True)
os.makedirs(ORG_DIR, exist_ok=True)

In [None]:
##############################
# Create google drive directories
##############################
drive_root = f"/content/gdrive/My Drive/{DRIVE_DIR}"

# The root must come first because os.mkdir does not create missing parents.
# os.mkdir (not os.makedirs) is used on purpose: if Drive is not mounted we
# must not silently create a look-alike directory tree on the local disk.
drive_dirs = (
    drive_root,
    f"{drive_root}/{NCODE}",    # parsed chapter htmls
    f"{drive_root}/{ORG_DIR}",  # original (raw) chapter htmls
    f"{drive_root}/epub",       # generated epub files
    f"{drive_root}/mobi",       # generated mobi files
)

for drive_dir in drive_dirs:
    if os.path.isdir(drive_dir):
        # Already created by a previous run: nothing to do and nothing to report.
        continue
    try:
        os.mkdir(drive_dir)
    except OSError as e:
        # Best effort, as before: report the problem and keep going.
        print(f"could not create {drive_dir}: {e}")

In [None]:
##############################
# Restore the parsed chapters saved on Google Drive into the Colab working
# directory; their count is what CHAPTER_BEG is later derived from.
##############################
saved_pattern = f"/content/gdrive/My Drive/{DRIVE_DIR}/{NCODE}/*.html"
for saved_html in glob.glob(saved_pattern):
  chapter_file = os.path.basename(saved_html)
  shutil.copy(saved_html, f"{NCODE}/{chapter_file}")

In [None]:
##############################
##### Download the novel's index page to update TITLE, CREATOR, CHAPTER_NUM
##############################
# curl is invoked through subprocess with an argument list (no shell), and
# `-f` makes it exit non-zero on HTTP errors so check=True raises instead of
# letting an error page be parsed as if it were the index.
index_html = subprocess.run(
    ["curl", "-sSf", BASE_URL], check=True, stdout=subprocess.PIPE
).stdout.decode("utf-8")

soup1 = BeautifulSoup(index_html, 'html.parser')
title_tag = soup1.title
writer_tag = soup1.find("div", class_="novel_writername")
if title_tag is None or writer_tag is None:
    raise RuntimeError(
        f"title/author not found in {BASE_URL} - check NCODE or the page layout"
    )

TITLE = title_tag.get_text(strip=True)
# `.string` is None whenever the div holds more than one child (e.g. a text
# label followed by an <a> link), which used to yield CREATOR == "None".
# get_text() returns the visible text in both cases.
CREATOR = writer_tag.get_text(strip=True)
# The div is expected to start with an "作者：" (author) label; drop it if present
# so that only the name ends up in the ebook metadata.
if CREATOR.startswith("作者："):
    CREATOR = CREATOR[len("作者："):]
# NOTE(review): this counts only the entries present on this one page; if the
# site paginates long tables of contents the count will be too low - confirm.
CHAPTER_NUM = len(soup1.find_all("dl", class_="novel_sublist2"))

# set variables
# Resume after the chapters that are already present locally.
CHAPTER_BEG = len(glob.glob(f'{NCODE}/*.html')) + 1
# "/" would be read as a directory separator in the output file names.
safe_title = TITLE.replace("/", "_")
EPUB_NAME=f"{safe_title}.epub"
MOBI_NAME=f"{safe_title}.mobi"

In [None]:
##############################
##### Print parameters
##############################
print(EPUB_NAME)
print(CREATOR)
print([CHAPTER_BEG, CHAPTER_NUM])

In [None]:
# Download every chapter that is not available locally yet into ORG_DIR.
for i in range(CHAPTER_BEG, CHAPTER_NUM + 1):
  url = f"{BASE_URL}{i}/"
  print(f"downloading {i}/{CHAPTER_NUM} : {url}")
  file_name = f"{ORG_DIR}/{i:05d}.html" # raw page goes to the _org dir
  # Fetch into memory first: with `-f` and check=True a failed request raises
  # here, so an HTTP error page is never saved as if it were a chapter.
  page = subprocess.run(
      ["curl", "-sSf", url], check=True, stdout=subprocess.PIPE
  ).stdout
  with open(file_name, "wb") as f:
    f.write(page)

In [None]:
##############################
##### Parse HTML
##############################
# Skeleton of the simplified per-chapter page written by extract_article().
# Positional placeholders, in the order extract_article() fills them:
#   {0} the <title> element of the downloaded page (markup included)
#   {1} the chapter subtitle text, used as the <h1> heading
#   {2} the element with id "novel_honbun"
#   {3} the element with id "novel_attention"
TEMPLATE = """
<html>
  <head>
    <meta charset="UTF-8">
    {0}
  </head>
  <body>
    <h1>{1}</h1>
    {2}
    <hr/>
    {3}
  </body>
</html>
"""

def extract_article(fname):
  """Convert one downloaded chapter page into a simplified HTML file.

  Reads ``{ORG_DIR}/{fname}``, pulls out the page title, the chapter subtitle,
  the ``novel_honbun`` element and the optional ``novel_attention`` element,
  and writes them through TEMPLATE to ``{NCODE}/{fname}``.

  Args:
    fname: File name (no directory part) of the chapter inside ORG_DIR.

  Raises:
    ValueError: If the subtitle or the ``novel_honbun`` element is missing,
      i.e. the file does not look like a chapter page.
  """
  with open(f"{ORG_DIR}/{fname}", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

  subtitle = soup.find("p", class_="novel_subtitle")
  honbun = soup.find(id="novel_honbun")
  if subtitle is None or honbun is None:
    raise ValueError(f"{ORG_DIR}/{fname} does not look like a chapter page")
  attention = soup.find(id="novel_attention")

  # Build the whole page before opening the output file, so that a parsing
  # failure cannot leave behind an empty file that would later be counted
  # as an already-processed chapter.
  page = TEMPLATE.format(
      "" if soup.title is None else str(soup.title),
      # get_text() also works when the subtitle contains nested tags, where
      # `.string` is None and would have been written as the text "None".
      subtitle.get_text(),
      str(honbun).replace("<br/>", ""),
      # Optional element: emit nothing rather than the text "None".
      "" if attention is None else str(attention),
  )

  with open(f"{NCODE}/{fname}", "w", encoding="utf-8") as f:
    f.write(page)
# Parse every downloaded chapter in file-name order. A plain loop is used
# because extract_article() is called only for its side effect; a list
# comprehension would build and display a list of None values.
fnames = sorted(os.path.basename(f) for f in glob.glob(f'{ORG_DIR}/*.html'))
for fname in fnames:
  extract_article(fname)

In [None]:
##############################
##### Convert HTML to epub
##############################
# The command is passed as an argument list, so no shell strips quoting:
# any quote characters placed around a value would reach pandoc as part of
# the value itself. The values are therefore passed bare.
meta1 = f'--metadata=title:{TITLE}'
meta2 = f'--metadata=author:{CREATOR}'
meta3 = '--metadata=lang:ja'
# Chapter files are named with zero-padded numbers, so a lexical sort gives
# reading order.
html_paths = sorted(glob.glob(f'{NCODE}/*.html'))
if not html_paths:
    # Without input files pandoc would wait for data on stdin.
    raise RuntimeError(f"no chapter html found in {NCODE}/ - nothing to convert")

cmd = ['pandoc', '-o', EPUB_NAME, meta1, meta2, meta3]
cmd.extend(html_paths)
# check_call returns 0 on success (as call did) but raises on failure instead
# of letting a missing epub surface later as an unrelated error.
subprocess.check_call(cmd)

In [None]:
##############################
##### Install if needed https://calibre-ebook.com/download_linux
##############################
# `sudo -v` must succeed before anything is downloaded (`&&`); the installer
# script is then streamed by wget to stdout and executed by `sudo sh`.
# NOTE(review): this runs a remotely fetched script as root without pinning
# or verifying it - acceptable on a throwaway Colab VM, not elsewhere.
!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin

In [None]:
##############################
##### Convert epub to mobi
##### Refresh "Files" when done
##############################
cmd = ["ebook-convert",EPUB_NAME,MOBI_NAME]
# check_call returns 0 on success (as call did) and raises CalledProcessError
# when the conversion fails, instead of silently leaving no mobi file behind.
subprocess.check_call(cmd)

In [None]:
##############################
# Save the parsed chapters (NCODE) and then the raw chapters (ORG_DIR) to
# the same-named folders on Google Drive
##############################
for local_dir in (NCODE, ORG_DIR):
  for local_html in glob.glob(f'{local_dir}/*.html'):
    # local_html already starts with the directory name, so it doubles as
    # the path relative to the Drive folder.
    drive_copy = f"/content/gdrive/My Drive/{DRIVE_DIR}/{local_html}"
    shutil.copy(local_html, drive_copy)

In [None]:
# Publish the epub to Google Drive, deleting the copy from a previous run first.
previous_epub = f"/content/gdrive/My Drive/{DRIVE_DIR}/epub/{EPUB_NAME}"
epub_folder = f"/content/gdrive/My Drive/{DRIVE_DIR}/epub"
if os.path.exists(previous_epub):
  os.remove(previous_epub) # remove old epub
shutil.copy(EPUB_NAME, epub_folder)

In [None]:
# Publish the mobi to Google Drive, deleting the copy from a previous run first.
previous_mobi = f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi/{MOBI_NAME}"
mobi_folder = f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi"
if os.path.exists(previous_mobi):
  os.remove(previous_mobi) # remove old mobi
shutil.copy(MOBI_NAME, mobi_folder)