<a href="https://colab.research.google.com/github/cwl286/wenku-crawler/blob/main/wenku.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

一括ダウンロードをしてepub/mobiを作る

- Create novel epub & mobi from https://www.wenku8.net/index.php

e.g. https://www.wenku8.net/novel/1/1715/ or https://www.wenku8.net/novel/1/1715/index.htm

CODE = 1715

It can
  - Mount Google Drive
  - Load HTML files from Google Drive (saved by the previous run, if any)
  - Resume from the last downloaded episode, or start from the 1st episode if nothing was saved
  - Save the novel's original HTML files and the parsed HTML files into the "syosetu/wenku" folder in Google Drive
  - Save the epub and mobi into the "syosetu/wenku/epub" and "syosetu/wenku/mobi" folders in Google Drive

 

In [None]:
##### INPUT AREA
# CODE is the numeric novel id taken from the wenku8 URL,
# e.g. "1715" for https://www.wenku8.net/novel/1/1715/
#@title INPUT { run: "auto" }
CODE="2428" #@param {type:"string"}

In [None]:
# Mount Google Drive at /content/gdrive/ so the later cells can read and write
# files under "/content/gdrive/My Drive/".
from google.colab import drive
drive.mount("/content/gdrive/")

In [None]:
# Install the third-party package that is imported below as `chinese_converter`.
!pip install chinese-converter

In [None]:
##### Initialize variables
import chinese_converter
from bs4 import BeautifulSoup
import glob
import os
import subprocess
import requests
import shutil
import codecs

# Init variables
# The directory segment of the URL is derived from CODE instead of being
# hard-coded to "2": the two URLs shown in this notebook are /novel/1/1715/ and
# /novel/2/2428/, i.e. the segment appears to be CODE // 1000.
# NOTE(review): confirm this rule for ids outside the 1000-2999 range.
BASE_URL = f"https://www.wenku8.net/novel/{int(CODE) // 1000}/{CODE}/"
ORG_DIR = f"{CODE}_org"          # local folder for the raw downloaded pages
DRIVE_DIR = "syosetu/wenku"      # folder (relative to "My Drive") used for persistence
EPISODES_URLs = []               # filled with the episode <a> tags once the index page is parsed
CHAPTER_BEG = 0                  # index of the first episode that still has to be downloaded
CHAPTER_NUM = 1                  # total number of episodes
TITLE = ""
CREATOR = ""
# Placeholders; both names are recomputed after TITLE has been read from the index page.
EPUB_NAME = f"{TITLE}.epub"
MOBI_NAME = f"{TITLE}.mobi"

# Make colab directories (pure Python: no shell interpolation of the user-supplied CODE)
os.makedirs(CODE, exist_ok=True)
os.makedirs(ORG_DIR, exist_ok=True)

In [None]:
# Create google drive directories
# os.makedirs(..., exist_ok=True) also creates missing parent folders (DRIVE_DIR
# may be nested, e.g. "syosetu/wenku") and does nothing when the folder already
# exists, so re-runs stay silent and genuine failures (e.g. Drive not mounted)
# raise instead of being printed and ignored.
drive_root = f"/content/gdrive/My Drive/{DRIVE_DIR}"
for sub_dir in (CODE, ORG_DIR, "epub", "mobi"):
    os.makedirs(os.path.join(drive_root, sub_dir), exist_ok=True)

In [None]:
# Restore the chapters parsed by a previous run from Google Drive into the
# local working folder named after CODE.
drive_html_pattern = f"/content/gdrive/My Drive/{DRIVE_DIR}/{CODE}/*.html"
for drive_html in glob.glob(drive_html_pattern):
  local_html = f"{CODE}/{os.path.basename(drive_html)}"
  shutil.copy(drive_html, local_html)

In [None]:
##############################
##### Download main.html to update TITLE, CREATOR, CHAPTER_NUM
##############################
# Download query TITLE, CREATOR, CHAPTER_NUM

!curl $BASE_URL > main.html
with codecs.open("main.html", 'r', encoding='gbk',errors='ignore') as f:
    text1 = f.read() 
    soup1 = BeautifulSoup(text1, 'html.parser')
    TITLE  = chinese_converter.to_traditional(str(soup1.find(id="title").string))
    CREATOR  = chinese_converter.to_traditional(str(soup1.find(id="info").string))
    EPISODES_URLs = soup1.table.find_all("a")
os.remove("main.html")

# update paramter
CHAPTER_BEG = len(glob.glob(f'{CODE}/*.html'))
CHAPTER_NUM = len(EPISODES_URLs)
EPUB_NAME=f"{TITLE}.epub"
MOBI_NAME=f"{TITLE}.mobi"


In [None]:
##### Check parameters
# Show what was read from the index page before starting the downloads.
title_for_display = chinese_converter.to_traditional(TITLE)
print(title_for_display)
print(CREATOR)
chapter_range = [CHAPTER_BEG, CHAPTER_NUM]  # [first episode index to fetch, total episodes]
print(chapter_range)

In [None]:
##############################
##### Download HTMLs
##############################
for i in range(CHAPTER_BEG, CHAPTER_NUM):
  basename = os.path.basename(EPISODES_URLs[i].get("href"))
  url = f"{BASE_URL}/{basename}"
  file_name = f"{ORG_DIR}/{i+1:05d}.html"
  print(f"downloading: {i}/{CHAPTER_NUM}: {url}")
  !curl $url > $file_name

In [None]:
##############################
##### Parse HTML
##############################
# Skeleton of every parsed chapter file, filled with str.format:
#   {0} = the source page's <title> element (inserted into <head> as markup)
#   {1} = chapter heading text
#   {2} = chapter body HTML
# The declared charset is UTF-8 rather than UTF-16: the chapter files are written
# as ordinary Python text files, which are UTF-8 on Colab, so a UTF-16
# declaration would contradict the actual bytes.
TEMPLATE = """
<html>
  <head>
    <meta charset="UTF-8">
    {0}
  </head>
  <body>
    <h1>{1}</h1>
    {2}
    <hr/>
  </body>
</html>
"""

def extract_article(fname):
  """Convert one downloaded chapter page into a parsed chapter file.

  Reads ``{ORG_DIR}/{fname}`` (decoded as GBK, undecodable bytes dropped), takes
  the page's ``<title>`` element, the text of ``div#title`` and the
  ``div#content`` body with every ``ul#contentdp`` removed, passes each through
  ``chinese_converter.to_traditional`` and writes the result, wrapped in
  ``TEMPLATE``, to ``{CODE}/{fname}`` as UTF-8.

  Args:
    fname: Base name of the chapter file, e.g. ``"00001.html"``.

  Raises:
    ValueError: If the page has no ``div#title`` or no ``div#content``.
  """
  with codecs.open(f"{ORG_DIR}/{fname}", 'r', encoding='gbk', errors='ignore') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

  title_div = soup.find("div", id="title")
  content = soup.find("div", id="content")
  if title_div is None or content is None:
    raise ValueError(f"{ORG_DIR}/{fname}: chapter title or content not found")

  # Remove unwanted blocks embedded in the chapter body.
  for unwanted in content.find_all("ul", id="contentdp"):
    unwanted.decompose()

  chapterTitle = chinese_converter.to_traditional(str(soup.find("title")))
  episodeTitle = chinese_converter.to_traditional(str(title_div.get_text()))
  body = chinese_converter.to_traditional(str(content))

  # The output is opened only after everything has been built, so a page that
  # fails to parse never leaves an empty chapter file behind.
  with open(f"{CODE}/{fname}", "w", encoding="utf-8") as f:
    f.write(TEMPLATE.format(chapterTitle, episodeTitle, body))
# Parse every downloaded chapter in file-name order. A plain loop is used because
# extract_article is called for its side effect only; a list comprehension would
# build and display a list of None values.
fnames = sorted(os.path.basename(f) for f in glob.glob(f'{ORG_DIR}/*.html'))
for fname in fnames:
  extract_article(fname)

In [None]:
##############################
##### Convert HTML to epub
##############################
# The arguments are passed to pandoc as a list, without a shell, so nothing
# strips quotes: the values must not be wrapped in '"' or the quotes become part
# of the title / author / language stored in the book.
meta1 = f'--metadata=title:{TITLE}'
meta2 = f'--metadata=author:{CREATOR}'
meta3 = '--metadata=lang:zh'
# Chapter files are zero-padded, so sorting by name gives chapter order.
html_paths = sorted(glob.glob(f'{CODE}/*.html'))
if not html_paths:
  # Without input files pandoc would wait for input on stdin.
  raise RuntimeError(f"no parsed chapters found in {CODE}/")

cmd = ['pandoc', '-o', EPUB_NAME, meta1, meta2, meta3]
cmd.extend(html_paths)
# check_call returns 0 on success and raises CalledProcessError when pandoc
# fails, so a broken conversion is not silently carried into the next cells.
subprocess.check_call(cmd)


In [None]:
##############################
##### Install  https://calibre-ebook.com/download_linux
##############################
# Downloads calibre's Linux installer script and runs it with sudo.
# The following cell calls `ebook-convert`, presumably provided by this install.
!sudo -v && wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sudo sh /dev/stdin

In [None]:
##############################
##### Convert epub to mobi
##### Refresh "Files" when done
##############################
cmd = ["ebook-convert", EPUB_NAME, MOBI_NAME]
# check_call returns 0 on success and raises CalledProcessError when the
# conversion fails, instead of continuing without a mobi file.
subprocess.check_call(cmd)

In [None]:
##############################
# Copy colab files to google drive
##############################
# Parsed chapters first, then the raw downloads. Each file keeps its
# "<folder>/<name>" relative path below the Drive folder.
for local_dir in (CODE, ORG_DIR):
  for html_path in glob.glob(f'{local_dir}/*.html'):
    drive_target = f"/content/gdrive/My Drive/{DRIVE_DIR}/{html_path}"
    shutil.copy(html_path, drive_target)

In [None]:
# Publish the new epub to Google Drive, deleting a previous copy of the same name first.
epub_drive_dir = f"/content/gdrive/My Drive/{DRIVE_DIR}/epub"
old_epub = f"/content/gdrive/My Drive/{DRIVE_DIR}/epub/{EPUB_NAME}"
if os.path.exists(old_epub):
  os.remove(old_epub)
shutil.copy(EPUB_NAME, epub_drive_dir)

In [None]:
# Publish the new mobi to Google Drive, deleting a previous copy of the same name first.
mobi_drive_dir = f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi"
old_mobi = f"/content/gdrive/My Drive/{DRIVE_DIR}/mobi/{MOBI_NAME}"
if os.path.exists(old_mobi):
  os.remove(old_mobi)
shutil.copy(MOBI_NAME, mobi_drive_dir)