# mobiml2xhtml.py

# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab


# this program works in concert with the output from KindleUnpack

'''
Convert from Mobi ML to XHTML
'''
from __future__ import unicode_literals, division, absolute_import, print_function

import os
import re
from utilities import file_open

# Tags that are not ordinary markup.  Each name maps to a pair of
# (tag type, end index): parsetag stores the raw text between the tag name
# and that negative end index under the 'special' attribute.
SPECIAL_HANDLING_TAGS = {
    '?xml': ('xmlheader', -1),
    '!--': ('comment', -3),
    '!DOCTYPE': ('doctype', -1),
}

# Tag types that taginfo_tostring rebuilds from the stored 'special' text.
SPECIAL_HANDLING_TYPES = [
    'xmlheader',
    'doctype',
    'comment',
]

# Tags that processml always treats as self-closing; their end tags are dropped.
SELF_CLOSING_TAGS = [
    'br', 'hr', 'input', 'img', 'image', 'meta',
    'spacer', 'link', 'frame', 'base', 'col', 'reference',
]

# Prepended to every converted document by processml.
DOCTYPE_DECL = (
    '<?xml version="1.0"?>\n'
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
)

# Namespace added to an attribute-less <html> tag by processtag.
XML_NS = 'http://www.w3.org/1999/xhtml'

class MobiMLConverter(object):
    """Convert a Mobi ML markup file into XHTML plus a generated stylesheet.

    The constructor reads the markup from ``filename``; ``processml`` performs
    the conversion and returns the XHTML text, the CSS text and the path of
    the stylesheet.
    """

    # one or more consecutive mbp:pagebreak tags (start, end or self-closed form)
    PAGE_BREAK_PAT = re.compile(r'(<[/]{0,1}mbp:pagebreak\s*[/]{0,1}>)+', re.IGNORECASE)
    # NOTE(review): not referenced by any method of this class; presumably the
    # image record-index attribute names -- confirm against callers
    IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

    def __init__(self, filename, out_enc):
        """Load the Mobi ML markup and set up the conversion state.

        filename -- path of the markup file to convert; the .opf name and the
                    styles.css path are derived from it
        out_enc  -- encoding used to decode the file, or None to guess

        The file is decoded with out_enc when one is given.  If that fails, or
        out_enc is None, utf-8 and then windows-1252 are tried.  When no
        candidate can decode the file the last UnicodeDecodeError is raised.
        """
        self.base_css_rules = (
            'blockquote { margin: 0em 0em 0em 1.25em }\n'
            'p { margin: 0em }\n'
            '.bold { font-weight: bold }\n'
            '.italic { font-style: italic }\n'
            '.mbp_pagebreak { page-break-after: always; margin: 0; display: block }\n'
        )
        self.tag_css_rules = {}
        self.tag_css_rule_cnt = 0
        self.path = []
        self.filename = filename

        # (encoding, is_a_guess) pairs in the order they are tried
        attempts = [('utf-8', True), ('windows-1252', True)]
        if out_enc is not None:
            attempts.insert(0, (out_enc, False))

        last_error = None
        for enc, guessed in attempts:
            try:
                # assumes file_open returns a regular file object usable as a
                # context manager (as open / io.open / codecs.open do)
                with file_open(self.filename, 'r', encoding=enc) as stream:
                    self.wipml = stream.read()
            except UnicodeDecodeError as err:
                # keep our own reference: the "as" name is cleared on Python 3
                last_error = err
                continue
            if guessed:
                print('Guessing markup character encoding')
            break
        else:
            # every candidate encoding failed to decode the file
            raise last_error

        self.pos = 0
        self.opfname = self.filename.rsplit('.', 1)[0] + '.opf'
        self.opos = 0
        self.meta = ''
        self.cssname = os.path.join(os.path.dirname(self.filename), 'styles.css')
        self.current_font_size = 3
        self.font_history = []

    def cleanup_html(self):
        """Tidy the raw markup in self.wipml before it is parsed.

        Drops zero-height divs, converts CRLF to LF, puts a newline between
        tags separated by a single space, removes the space after '<mbp:' and
        collapses '<br></br>' into '<br/>'.
        """
        markup = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.wipml)
        # plain-text substitutions, applied in this order
        substitutions = (
            ('\r\n', '\n'),
            ('> <', '>\n<'),
            ('<mbp: ', '<mbp:'),
            ('<br></br>', '<br/>'),
        )
        for old, new in substitutions:
            markup = markup.replace(old, new)
        self.wipml = markup

    def replace_page_breaks(self):
        """Replace each run of mbp:pagebreak tags in self.wipml with one marker div."""
        marker = '<div class="mbp_pagebreak" />'
        self.wipml = re.sub(self.PAGE_BREAK_PAT, marker, self.wipml)

    # parse leading text of ml and tag
    def parseml(self):
        """Return the next piece of self.wipml, starting at self.pos.

        Returns None once all of the markup has been consumed.  Otherwise
        returns a (text, tag) pair in which exactly one element is None:
        text is a run of character data, tag is a complete '<...>' tag or
        '<!-- ... -->' comment.  self.pos is moved past what was returned.

        A '<' that is followed by another '<' before any '>' is returned as
        text, and so is an unterminated tag or comment at the end of the
        markup, so the position always advances.
        """
        start = self.pos
        ml = self.wipml
        if start >= len(ml):
            return None
        if ml[start] != '<':
            nxt = ml.find('<', start)
            if nxt == -1:
                nxt = len(ml)
            self.pos = nxt
            return ml[start:nxt], None
        # handle comment as a special case to deal with multi-line comments
        if ml[start:start+4] == '<!--':
            close = ml.find('-->', start+1)
            if close != -1:
                # point at the final '>' of the '-->' terminator
                close += 2
        else:
            close = ml.find('>', start+1)
            nxt = ml.find('<', start+1)
            if nxt != -1 and (close == -1 or nxt < close):
                # another tag opens before this one closes: not a tag after all
                self.pos = nxt
                return ml[start:nxt], None
        if close == -1:
            # unterminated tag or comment: hand back the remainder as text
            # rather than resetting the position and looping forever
            self.pos = len(ml)
            return ml[start:], None
        self.pos = close + 1
        return None, ml[start:close+1]

    # parses string version of tag to identify its name,
    # its type 'begin', 'end' or 'single',
    # plus build a hashtable of its attributes
    # code is written to handle the possibility of very poor formatting
    def parsetag(self, s):
        """Split the text of one tag into its type, name and attributes.

        s -- the complete tag text, starting with '<'

        Returns (ttype, tname, tattr):
          ttype -- 'end', 'begin', 'single' (closed with '/>'), 'single_ext'
                   (closed with ' />'), or the type that SPECIAL_HANDLING_TAGS
                   lists for the tag name
          tname -- the tag name in lower case ('!DOCTYPE' keeps its case)
          tattr -- dict mapping lower-cased attribute names to their values;
                   for a special tag it holds the raw tag content under the
                   key 'special' instead

        Attributes are only parsed for tags that are neither end tags nor
        special tags.  All scans stop at the end of s, so a malformed tag
        cannot make the parser run past it.
        """
        n = len(s)
        p = 1
        ttype = None
        tattr = {}
        # get the tag name
        while s[p:p+1] == ' ':
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ':
                p += 1
        b = p
        while p < n and s[p] not in ('>', '/', ' ', '"', "'", "\r", "\n"):
            p += 1
        tname = s[b:p].lower()
        if tname == '!doctype':
            tname = '!DOCTYPE'
        # special cases
        # NOTE(review): a comment with no whitespace after '<!--' yields a
        # longer name here and is therefore not recognised as special
        if tname in SPECIAL_HANDLING_TAGS:
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]
        if ttype is None:
            # parse any attributes: everything up to the next '=' is the name
            eq = s.find('=', p)
            while eq != -1:
                while s[p:p+1] == ' ':
                    p += 1
                aname = s[p:eq].lower().rstrip(' ')
                p = eq + 1
                while s[p:p+1] == ' ':
                    p += 1
                quote = s[p:p+1]
                if quote in ('"', "'"):
                    b = p + 1
                    # prefer the matching quote so values may contain the
                    # other quote character
                    end = s.find(quote, b)
                    if end == -1:
                        # mismatched quotes: settle for the other quote style
                        end = s.find("'" if quote == '"' else '"', b)
                    if end == -1:
                        # unterminated value: take everything up to the
                        # closing '>' of the tag (or the end of the text)
                        end = s.rfind('>', b)
                        if end == -1:
                            end = n
                    val = s[b:end]
                    p = min(end + 1, n)
                else:
                    b = p
                    while p < n and s[p] not in ('>', '/', ' '):
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
                eq = s.find('=', p)
        # label beginning and single tags
        if ttype is None:
            ttype = 'begin'
            if s.find(' /', p) >= 0:
                ttype = 'single_ext'
            elif s.find('/', p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr

    # main routine to convert from mobi markup language to html
    def processml(self):
        """Convert the Mobi ML in self.wipml to XHTML and collect its CSS.

        Page breaks are replaced and the markup cleaned up first.  The result
        is then consumed piece by piece with parseml/parsetag and every tag is
        rewritten by processtag.  self.path records the open tags so simple
        nesting errors can be repaired; font tags are tracked separately in
        self.font_history and un-nested.

        Returns a tuple (htmlstr, css, cssname): the XHTML document, the
        stylesheet text (base rules followed by every generated class rule)
        and self.cssname.
        """
        # only the first html, head and body start tags get their extras
        html_done = False
        head_done = False
        body_done = False

        start_types = ('begin', 'single', 'single_ext')

        out = []
        self.replace_page_breaks()
        self.cleanup_html()

        # now parse the cleaned up ml into standard xhtml
        while True:

            r = self.parseml()
            if not r:
                break

            text, tag = r

            if text:
                out.append(text)

            if not tag:
                continue

            ttype, tname, tattr = self.parsetag(tag)

            # comments are copied through unchanged wherever they appear
            if ttype == 'comment':
                out.append(tag)
                continue

            # If we run into a DTD or xml declarations inside the body ... bail.
            if tname in SPECIAL_HANDLING_TAGS and body_done:
                out.append('\n</body></html>')
                break

            # make sure self-closing tags actually self-close
            if ttype == 'begin' and tname in SELF_CLOSING_TAGS:
                ttype = 'single'

            # make sure any end tags of self-closing tags are discarded
            if ttype == 'end' and tname in SELF_CLOSING_TAGS:
                continue

            # remove embedded guide and references from old mobis
            if tname in ('guide', 'ncx', 'reference') and ttype in start_types:
                tname = 'removeme:{0}'.format(tname)
                tattr = None
            if tname in ('guide', 'ncx', 'reference', 'font', 'span') and ttype == 'end':
                # the path may be empty when an end tag has no start tag
                if self.path and self.path[-1] == 'removeme:{0}'.format(tname):
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of font tags that only have a color attribute.
            if tname == 'font' and ttype in start_types:
                if 'color' in tattr and len(tattr) == 1:
                    tname = 'removeme:{0}'.format(tname)
                    tattr = None

            # Get rid of empty spans in the markup.
            if tname == 'span' and ttype in start_types and not len(tattr):
                tname = 'removeme:{0}'.format(tname)

            # need to handle fonts outside of the normal methods
            # so fonts tags won't be added to the self.path since we keep track
            # of font tags separately with self.font_history
            if tname == 'font' and ttype == 'begin':
                # check for nested font start tags
                if self.font_history:
                    # inject a font end tag
                    out.append(self.processtag(('end', 'font', None)))
                self.font_history.append((ttype, tname, tattr))
                # handle the current font start tag
                out.append(self.processtag((ttype, tname, tattr)))
                continue

            # check for nested font tags and unnest them
            if tname == 'font' and ttype == 'end':
                if not self.font_history:
                    # stray end tag with no open font: nothing to close
                    continue
                self.font_history.pop()
                # handle this font end tag
                out.append(self.processtag(('end', 'font', None)))
                # check if we were nested
                if self.font_history:
                    # inject a copy of the most recent font start tag from history
                    out.append(self.processtag(self.font_history[-1]))
                continue

            # keep track of nesting path
            if ttype == 'begin':
                self.path.append(tname)
            elif ttype == 'end':
                if not self.path or tname != self.path[-1]:
                    print('improper nesting: ', self.path, tname, ttype)
                    if tname not in self.path:
                        # handle case of end tag with no beginning by injecting empty begin tag
                        out.append(self.processtag(('begin', tname, None)))
                        print("     - fixed by injecting empty start tag ", tname)
                        self.path.append(tname)
                    elif len(self.path) > 1 and tname == self.path[-2]:
                        # handle case of dangling missing end
                        out.append(self.processtag(('end', self.path[-1], None)))
                        print("     - fixed by injecting end tag ", self.path[-1])
                        self.path.pop()
                self.path.pop()

            # processtag returns '' for 'removeme:' tags, so they vanish here.
            # NOTE(review): earlier code also meant to skip the text inside
            # removed tags, but its test (tname == 'removeme:' + tname) could
            # never be true, so text was always kept.  That is preserved:
            # skipping would also drop the text inside colour-only font tags
            # and attribute-less spans.
            out.append(self.processtag((ttype, tname, tattr)))

            # handle potential issue of multiple html, head, and body sections
            if tname == 'html' and ttype == 'begin' and not html_done:
                out.append('\n')
                html_done = True

            if tname == 'head' and ttype == 'begin' and not head_done:
                out.append('\n')
                out.append('<title></title>\n')
                # also add in metadata and style link tags
                out.append(self.meta)
                out.append('<link href="styles.css" rel="stylesheet" type="text/css" />\n')
                head_done = True

            if tname == 'body' and ttype == 'begin' and not body_done:
                out.append('\n')
                body_done = True

        htmlstr = ''.join(out)

        # handle issue of possibly missing html, head, and body tags
        # I have not seen this but the original did something like this so ...
        if not body_done:
            htmlstr = '<body>\n' + htmlstr + '</body>\n'
        if not head_done:
            headstr = '<head>\n'
            # the title belongs inside the synthesised head, not after the body
            headstr += '<title></title>\n'
            headstr += self.meta
            headstr += '<link href="styles.css" rel="stylesheet" type="text/css" />\n'
            headstr += '</head>\n'
            htmlstr = headstr + htmlstr
        if not html_done:
            htmlstr = '<html>\n' + htmlstr + '</html>\n'

        # finally add DOCTYPE info
        htmlstr = DOCTYPE_DECL + htmlstr

        css = self.base_css_rules + ''.join(
            '.%s { %s }\n' % (cls, rule) for cls, rule in self.tag_css_rules.items())

        return (htmlstr, css, self.cssname)

    def ensure_unit(self, raw, unit='px'):
        """Return raw with unit appended when raw matches r'\\d+$', else raw unchanged."""
        if re.search(r'\d+$', raw) is None:
            # already carries a unit (or is not numeric at its end)
            return raw
        return raw + unit

    # flatten possibly modified tag back to string
    def taginfo_tostring(self, taginfo):
        """Render a (ttype, tname, tattr) triple back into tag text.

        Returns '' when the type or name is None and '</name>' for an end
        tag.  A special tag (xml header, doctype, comment) is rebuilt from the
        raw content kept under tattr['special'].  Any other tag is written as
        '<name key="value" ...' closed with '/>' for 'single', ' />' for
        'single_ext' and '>' otherwise.  Attribute values are written as they
        are, without escaping.
        """
        (ttype, tname, tattr) = taginfo
        if ttype is None or tname is None:
            return ''
        if ttype == 'end':
            return '</%s>' % tname
        if tattr is not None and 'special' in tattr and ttype in SPECIAL_HANDLING_TYPES:
            info = tattr['special']
            # both values must be passed as one tuple to the % operator
            if ttype == 'comment':
                return '<%s %s-->' % (tname, info)
            return '<%s %s>' % (tname, info)
        res = ['<%s' % tname]
        if tattr is not None:
            for key in tattr:
                res.append(' %s="%s"' % (key, tattr[key]))
        if ttype == 'single':
            res.append('/>')
        elif ttype == 'single_ext':
            res.append(' />')
        else:
            res.append('>')
        return "".join(res)

    # routines to convert from mobi ml tag attributes to xhtml attributes and styles
    def processtag(self, taginfo):
        """Translate one Mobi ML tag into its XHTML text.

        taginfo -- a (ttype, tname, tattr) triple as produced by parsetag;
                   tattr may be None

        Presentation attributes (style, align, height, width, and the face
        and size of font tags) are turned into CSS declarations.  The joined
        declarations are registered in self.tag_css_rules, reusing an existing
        class when the same rule text is already present, and the class name
        is added to the tag.  font tags become span tags, filepos-id becomes
        id and filepos becomes an href.

        Returns the tag text built by taginfo_tostring, or '' when the tag
        name is None or starts with 'removeme'.

        Side effects: a non-empty tattr is modified in place, and
        self.tag_css_rules, self.tag_css_rule_cnt and self.current_font_size
        may be updated.
        """
        # Converting mobi font sizes to numerics
        size_map = {
            'xx-small': '1',
            'x-small': '2',
            'small': '3',
            'medium': '4',
            'large': '5',
            'x-large': '6',
            'xx-large': '7',
            'normal': '3',
            }

        size_to_em_map = {
            '1': '.65em',
            '2': '.75em',
            '3': '1em',
            '4': '1.125em',
            '5': '1.25em',
            '6': '1.5em',
            '7': '2em',
            }

        # current tag to work on
        (ttype, tname, tattr) = taginfo
        if not tattr:
            tattr = {}

        styles = []

        if tname is None or tname.startswith('removeme'):
            return ''

        if tname == 'html' and not tattr:
            tattr['xmlns'] = XML_NS

        # have not seen an example of this yet so keep it here to be safe
        # until this is better understood
        if tname in ('country-region', 'place', 'placetype', 'placename',
                'state', 'city', 'street', 'address', 'content'):
            tname = 'div' if tname == 'content' else 'span'
            # drop every attribute; clear() avoids changing the dict while
            # iterating over it, which raises RuntimeError on Python 3
            tattr.clear()

        # handle general case of style, height, width, bgcolor in any tag
        if 'style' in tattr:
            style = tattr.pop('style').strip()
            if style:
                styles.append(style)

        if 'align' in tattr:
            align = tattr.pop('align').strip()
            if align:
                if tname in ('table', 'td', 'tr'):
                    pass
                else:
                    styles.append('text-align: %s' % align)

        if 'height' in tattr:
            height = tattr.pop('height').strip()
            if height and '<' not in height and '>' not in height and re.search(r'\d+', height):
                if tname in ('table', 'td', 'tr'):
                    pass
                elif tname == 'img':
                    tattr['height'] = height
                else:
                    styles.append('margin-top: %s' % self.ensure_unit(height))

        if 'width' in tattr:
            width = tattr.pop('width').strip()
            if width and re.search(r'\d+', width):
                if tname in ('table', 'td', 'tr'):
                    pass
                elif tname == 'img':
                    tattr['width'] = width
                else:
                    styles.append('text-indent: %s' % self.ensure_unit(width))
                    if width.startswith('-'):
                        styles.append('margin-left: %s' % self.ensure_unit(width[1:]))

        if 'bgcolor' in tattr:
            # no proprietary html allowed
            if tname == 'div':
                del tattr['bgcolor']

        # The font and img handling must not depend on the bgcolor test above:
        # a font start tag carrying bgcolor would otherwise stay a font tag
        # while its end tag is turned into </span>.
        if tname == 'font':
            # Change font tags to span tags
            tname = 'span'
            if ttype in ('begin', 'single', 'single_ext'):
                # move the face attribute to css font-family
                if 'face' in tattr:
                    face = tattr.pop('face').strip()
                    styles.append('font-family: "%s"' % face)

                # Monitor the constantly changing font sizes, change them to ems and move
                # them to css. The following will work for 'flat' font tags, but nested font tags
                # will cause things to go wonky. Need to revert to the parent font tag's size
                # when a closing tag is encountered.
                if 'size' in tattr:
                    sz = tattr.pop('size').strip().lower()
                    if sz in size_map:
                        level = int(size_map[sz])
                    else:
                        try:
                            level = float(sz)
                            if sz.startswith(('-', '+')):
                                # signed sizes are relative to the current size
                                level += self.current_font_size
                            level = int(level)
                        except (ValueError, OverflowError):
                            # not a usable number: fall back to the default size
                            level = 3
                        # keep within the range size_to_em_map knows about
                        level = max(1, min(7, level))
                    styles.append('font-size: %s' % size_to_em_map[str(level)])
                    self.current_font_size = level

        elif tname == 'img':
            for attr in ('width', 'height'):
                if attr in tattr:
                    val = tattr[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72)  # Assume this was set using the Kindle profile
                            tattr[attr] = "%dpx" % int(nval)
                        except Exception:
                            del tattr[attr]
                    elif val.lower().endswith('%'):
                        del tattr[attr]

        # convert the anchor tags
        if 'filepos-id' in tattr:
            tattr['id'] = tattr.pop('filepos-id')
            if 'name' in tattr and tattr['name'] != tattr['id']:
                tattr['name'] = tattr['id']

        if 'filepos' in tattr:
            filepos = tattr.pop('filepos')
            try:
                tattr['href'] = "#filepos%d" % int(filepos)
            except ValueError:
                # not a number: the attribute is dropped without an href
                pass

        if styles:
            ncls = None
            rule = '; '.join(styles)
            # reuse the class of an identical rule when there is one
            for sel, srule in self.tag_css_rules.items():
                if srule == rule:
                    ncls = sel
                    break
            if ncls is None:
                self.tag_css_rule_cnt += 1
                ncls = 'rule_%d' % self.tag_css_rule_cnt
                self.tag_css_rules[ncls] = rule
            cls = tattr.get('class', '')
            cls = cls + (' ' if cls else '') + ncls
            tattr['class'] = cls

        # convert updated tag back to string representation
        if len(tattr) == 0:
            tattr = None
        taginfo = (ttype, tname, tattr)
        return self.taginfo_tostring(taginfo)