Skip to content

Commit

Permalink
Implement rendering HTML from unpacked CHM file
Browse files Browse the repository at this point in the history
Closes #16
  • Loading branch information
dottedmag committed Feb 11, 2020
1 parent 48f5ba1 commit c8f186d
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 51 deletions.
161 changes: 117 additions & 44 deletions archmage/CHM.py
Expand Up @@ -27,6 +27,7 @@
import errno
import string
import tempfile
import os.path
from enum import Enum, auto

import archmage
Expand Down Expand Up @@ -58,14 +59,74 @@ class Action(Enum):
PARENT_RE = re.compile(r"(^|/|\\)\.\.(/|\\|$)")


class CHMFile:
class FileSource:
    """Content source backed by a CHM archive opened through chmlib."""

    def __init__(self, filename):
        # Handle returned by chmlib; released again in close().
        self._chm = chmlib.chm_open(filename)

    def listdir(self):
        """Return the decoded paths of all enumerated objects except "/"."""

        def collect(_chm, unit_info, paths):
            # Enumeration callback: record every path but the root entry.
            entry_path = unit_info.path.decode("utf-8")
            if entry_path != "/":
                paths.append(entry_path)
            return chmlib.CHM_ENUMERATOR_CONTINUE

        paths = []
        status = chmlib.chm_enumerate(
            self._chm, chmlib.CHM_ENUMERATE_ALL, collect, paths
        )
        # A zero status from chm_enumerate is treated as fatal.
        if status == 0:
            sys.exit("UnknownError: CHMLIB or PyCHM bug?")
        return paths

    def get(self, name):
        """Return the content of object *name*, or None.

        None is returned when the name cannot be resolved or when the
        retrieved size is zero.
        """
        status, unit_info = chmlib.chm_resolve_object(
            self._chm, name.encode("utf-8")
        )
        if status != chmlib.CHM_RESOLVE_SUCCESS:
            return None
        size, content = chmlib.chm_retrieve_object(
            self._chm, unit_info, 0, unit_info.length
        )
        return None if size == 0 else content

    def close(self):
        """Close the underlying chmlib handle."""
        chmlib.chm_close(self._chm)


class DirSource:
    """Content source backed by a directory tree on disk.

    Entry names are the file paths relative to the source directory,
    prefixed with "/".
    """

    def __init__(self, dirname):
        # Root directory that holds the content.
        self.dirname = dirname

    def listdir(self):
        """Return the "/"-prefixed relative path of every file under the root."""
        entries = []
        for root, _, files in os.walk(self.dirname):
            for name in files:
                entries.append(
                    "/" + os.path.relpath(os.path.join(root, name), self.dirname)
                )
        return entries

    def get(self, filename):
        """Return the bytes of entry *filename*, or None if there is no such file.

        *filename* is an entry name as produced by listdir(); a missing
        leading "/" is tolerated.
        """
        # Join instead of concatenating so that names without a leading "/"
        # (and roots with a trailing separator) resolve to the right path.
        path = os.path.join(self.dirname, filename.lstrip("/"))
        try:
            with open(path, "rb") as fh:
                return fh.read()
        except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
            # open() raises rather than returning None; report an absent
            # entry as None, the same way FileSource.get() does.
            return None

    def close(self):
        """Do nothing; no handle is kept open between calls."""
        pass


class CHM:
"""Class that represent CHM content from directory"""

def __init__(self, name):
self.cache = {}
# Name of source directory with CHM content
if os.path.isdir(name):
self.source = DirSource(name)
else:
self.source = FileSource(name)
self.sourcename = name
self._chm = chmlib.chm_open(name)
# Import variables from config file into namespace
exec(
compile(
Expand All @@ -85,29 +146,15 @@ def __init__(self, name):
self.contents = SitemapFile(self.topicstree).parse()

def close(self):
chmlib.chm_close(self._chm)
self.source.close()

def entries(self):
if "entries" not in self.cache:
self.cache["entries"] = self._entries()
return self.cache["entries"]

def _entries(self):
def get_name(chmfile, ui, out):
path = ui.path.decode("utf-8")
if path != "/":
out.append(path)
return chmlib.CHM_ENUMERATOR_CONTINUE

out = []
if (
chmlib.chm_enumerate(
self._chm, chmlib.CHM_ENUMERATE_ALL, get_name, out
)
== 0
):
sys.exit("UnknownError: CHMLIB or PyCHM bug?")
return out
return self.source.listdir()

# retrieves the list of HTML files contained into the CHM file, **in order**
# (that's the important bit).
Expand All @@ -133,7 +180,11 @@ def _image_urls(self):
out = []
image_catcher = ImageCatcher()
for file in self.html_files():
image_catcher.feed(CHMEntry(self, file).correct())
image_catcher.feed(
Entry(
self.source, file, self.filename_case, self.restore_framing
).correct()
)
for image_url in image_catcher.imgurls:
if not out.count(image_url):
out.append(image_url)
Expand Down Expand Up @@ -166,7 +217,13 @@ def topics(self):
def _topics(self):
for e in self.entries():
if e.lower().endswith(".hhc"):
return CHMEntry(self, e, frontpage=self.frontpage()).get()
return Entry(
self.source,
e,
self.filename_case,
self.restore_framing,
frontpage=self.frontpage(),
).get()

# use first page as deftopic. Note: without heading slash
def deftopic(self):
Expand Down Expand Up @@ -267,11 +324,21 @@ def extract_entry(self, entry, output_file, destdir=".", correct=False):
# write CHM entry content into the file, corrected or as is
if correct:
open(os.path.join(destdir, fname), "wb").write(
CHMEntry(self, entry).correct()
Entry(
self.source,
entry,
self.filename_case,
self.restore_framing,
).correct()
)
else:
open(os.path.join(destdir, fname), "wb").write(
CHMEntry(self, entry).get()
Entry(
self.source,
entry,
self.filename_case,
self.restore_framing,
).get()
)

def extract_entries(self, entries=[], destdir=".", correct=False):
Expand Down Expand Up @@ -305,7 +372,12 @@ def dump_html(self, output=sys.stdout):
# if entry is auxiliary file, than skip it
if re.match(self.aux_re, e):
continue
print(CHMEntry(self, e).get(), file=output)
print(
Entry(
self.source, e, self.filename_case, self.restore_framing
).get(),
file=output,
)

def chm2text(self, output=sys.stdout):
"""Convert CHM into Single Text file"""
Expand All @@ -315,7 +387,11 @@ def chm2text(self, output=sys.stdout):
continue
# to use this function you should have 'lynx' or 'elinks' installed
chmtotext(
input=CHMEntry(self, e).get(), cmd=self.chmtotext, output=output
input=Entry(
self.source, e, self.filename_case, self.restore_framing
).get(),
cmd=self.chmtotext,
output=output,
)

def htmldoc(self, output, format=Action.CHM2HTML):
Expand Down Expand Up @@ -361,31 +437,28 @@ def htmldoc(self, output, format=Action.CHM2HTML):
shutil.rmtree(path=tempdir)


class CHMEntry(object):
class Entry(object):
"""Class for CHM file entry"""

def __init__(self, parent, name, frontpage="index.html"):
# parent CHM file
self.parent = parent
def __init__(
self,
source,
name,
filename_case,
restore_framing,
frontpage="index.html",
):
# Entry source
self.source = source
# object inside CHM file
self.name = name
self.filename_case = filename_case
self.restore_framing = restore_framing
# frontpage name to substitute
self.frontpage = os.path.basename(frontpage)

def read(self):
"""Read CHM entry content"""
result, ui = chmlib.chm_resolve_object(
self.parent._chm, self.name.encode("utf-8")
)
if result != chmlib.CHM_RESOLVE_SUCCESS:
return None

size, content = chmlib.chm_retrieve_object(
self.parent._chm, ui, 0, ui.length
)
if size == 0:
return None
return content
return self.source.get(self.name)

def lower_links(self, text):
"""Links to lower case"""
Expand Down Expand Up @@ -416,7 +489,7 @@ def correct(self):
# If entry is a html page?
if re.search("(?i)\\.html?$", self.name) and data is not None:
# lower-casing links if needed
if self.parent.filename_case:
if self.filename_case:
data = self.lower_links(data)

# Delete unwanted HTML elements.
Expand All @@ -443,10 +516,10 @@ def get(self):
# If entry is a html page?
if re.search("(?i)\\.html?$", self.name) and data is not None:
# lower-casing links if needed
if self.parent.filename_case:
if self.filename_case:
data = self.lower_links(data)
# restore framing if that option is set in config file
if self.parent.restore_framing:
if self.restore_framing:
data = self.add_restoreframing_js(self.name[1:], data)
if data is not None:
return data
Expand Down
9 changes: 2 additions & 7 deletions archmage/cli.py
Expand Up @@ -54,7 +54,7 @@
import getopt

import archmage
from archmage.CHM import CHMFile, Action
from archmage.CHM import CHM, Action

# Return codes
OK = 0
Expand Down Expand Up @@ -180,12 +180,7 @@ def main():
if not os.path.exists(options.chmfile):
sys.exit("No such file: %s" % options.chmfile)

if os.path.isdir(options.chmfile):
sys.exit(
"A regular files is expected, got directory: %s" % options.chmfile
)

source = CHMFile(options.chmfile)
source = CHM(options.chmfile)

if options.mode == Action.DUMPHTML:
source.dump_html()
Expand Down
20 changes: 20 additions & 0 deletions tests/example/Documents/Table of Contents.hhc
@@ -0,0 +1,20 @@
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML>
<HEAD>
<meta name="GENERATOR" content="Microsoft&reg; HTML Help Workshop 4.1">
<!-- Sitemap 1.0 -->
</HEAD><BODY>
<OBJECT type="text/site properties">
<param name="ImageType" value="Folder">
</OBJECT>
<UL>
<LI> <OBJECT type="text/sitemap">
<param name="Name" value="page 1">
<param name="Local" value="../page%201.html">
</OBJECT>
<LI> <OBJECT type="text/sitemap">
<param name="Name" value="page 2">
<param name="Local" value="../page%202.html">
</OBJECT>
</UL>
</BODY></HTML>
9 changes: 9 additions & 0 deletions tests/example/page 1.html
@@ -0,0 +1,9 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title></title>
</head>
<body>
<h1>Page 1</h1>
</body>
</html>
10 changes: 10 additions & 0 deletions tests/example/page 2.html
@@ -0,0 +1,10 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title></title>
</head>
<body>
<h1>Page 2</h1>
</body>
</html>

11 changes: 11 additions & 0 deletions tests/test_openclose.py
Expand Up @@ -25,3 +25,14 @@ def test_extract():
assert (t / f).exists()

assert "Page 1" in (t / "page 1.html").read_text()


def test_render_extracted():
    """Run the CLI on the unpacked example directory and check the output pages."""
    with TempDir() as tmpdir:
        target = tmpdir / "example_html"

        # The input is a directory (tests/example), not a .chm file.
        sys.argv = ["extract", "tests/example", target]
        archmage.cli.main()

        assert (target / "index.html").exists()
        assert (target / "page 1.html").exists()

0 comments on commit c8f186d

Please sign in to comment.