In [1]:
import fitz
import sys
import os

# Open the source PDF once; later cells index into this shared handle
# (doc[34], doc.load_page(i), doc[10]).
doc = fitz.open("raw.pdf")

Begin function definitions and global variables

In [2]:
from enum import Enum
class textType(Enum):
    """Category used to decide which output a piece of page text is written to."""
    # Text routed to the lore output.
    Lore = 0
    # Text routed to the stat-block output.
    StatBlock = 1
    # Default state: no category has been determined yet.
    Other = 2

# NOTE(review): senseFlag is not referenced by any code visible in this notebook.
senseFlag = False

class textProperty(Enum):
    """Formatting property of a single span, as reported by checkForProperty."""
    # Span set in the 'GoudyTextMT' font (section titles).
    Title = 0
    # Span whose color is 0x811d2e (red text).
    Red = 1
    # Span in a bold Bookman font, or a bullet ('• ').
    Break = 2
    # Anything else.
    Other = 3

# Red heading text that marks a lore subsection. NOTE(review): the truncated
# entries ('ore', 'alvage') presumably cover headings whose first letter is
# extracted as a separate span -- confirm against the PDF.
loreWords = ['ore', 'lore', 'salvage', 'alvage', 'Lore', 'Salvage']
# NOTE(review): statWords is not referenced by any code visible in this notebook.
statWords = ['ction', 'action', 'eaction', 'reaction', 'egendary actions', 'legendary actions']
# Words that capitalizeStringList copies through unchanged instead of capitalizing.
notNounList = ['in', 'the', 'of']
# Labels that writeStatBlock writes followed by a space instead of a newline,
# so the text written next continues on the same output line.
attributeList = ['Armor', 'Class', 'Armor Class', 'Hit Points', 'Speed', 
                 'Senses', 'Languages', 'Challenge', 'Proficiency Bonus', 
                 'Damage Immunities', 'Condition Immunities', 'passive']
# Ability-score header; writeStatBlock searches lines for this exact substring.
statList = 'str dex con int wis cha'

In [3]:
def capitalizeStringList(stringList):
    """Capitalize each word in ``stringList`` except those in ``notNounList``.

    Words found in the module-level ``notNounList`` are copied through
    unchanged. A word that starts with ``(`` keeps the parenthesis and has the
    text after it capitalized. Every other word goes through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).

    Args:
        stringList: iterable of word strings.

    Returns:
        A new list with one entry per input word, in the same order.
    """
    capitalList = []
    for string in stringList:
        if string in notNounList:
            capitalList.append(string)
        elif string.startswith('('):
            # Skip the leading parenthesis so the first letter gets capitalized.
            capitalList.append('(' + string[1:].capitalize())
        else:
            # Also covers words with "(" somewhere after the first character;
            # treating those like a leading "(" would drop their first letter.
            capitalList.append(string.capitalize())
    return capitalList

def dashedString(stringList):
    """Capitalize every part of the hyphenated words in ``stringList``.

    A word containing ``-`` is split on the hyphens, each part is run through
    ``capitalizeStringList``, and the parts are joined back with ``-``. Words
    without a hyphen are returned unchanged.

    Returns:
        A new list with one entry per input word, in the same order.
    """
    return [
        '-'.join(capitalizeStringList(word.split('-'))) if '-' in word else word
        for word in stringList
    ]

def formatName(name):
    """Return ``name`` with its words capitalized and joined by single spaces.

    The name is split on whitespace and each word is capitalized with
    ``capitalizeStringList``. When the name contains a hyphen, the words are
    additionally passed through ``dashedString`` so that each hyphen-separated
    part is capitalized too.

    Args:
        name: the raw name string.

    Returns:
        The formatted name. An empty or whitespace-only ``name`` yields ``''``
        (building the result with ``join`` avoids referencing an unassigned
        local when there are no words).
    """
    words = capitalizeStringList(name.split())
    if '-' in name:
        words = dashedString(words)
    return ' '.join(words)

def checkTextType(s):
    """Classify one text span as lore, stat block, or other.

    Args:
        s: span dict; the ``'font'``, ``'text'`` and ``'color'`` keys are read.

    Returns:
        ``textType.Lore`` for a 'BauerBodoniLT-Bold-SC700' span whose
        lower-cased text is in ``loreWords``, for any 'GoudyTextMT' span, and
        for a red (0x811d2e) span reading 'GM Advice:';
        ``textType.StatBlock`` for any other 'BauerBodoniLT-Bold-SC700' span;
        ``textType.Other`` otherwise.
    """
    font = s['font']
    if font == 'BauerBodoniLT-Bold-SC700':
        isLoreHeading = s["text"].lower() in loreWords
        return textType.Lore if isLoreHeading else textType.StatBlock
    if font == 'GoudyTextMT':
        return textType.Lore
    if s["color"] == 0x811d2e and s["text"] == 'GM Advice:':
        return textType.Lore
    return textType.Other

In [4]:
def flags_decomposer(flags):
    """Turn a font-flag bit field into a readable, comma-separated description.

    Bits 0, 1 and 4 add "superscript", "italic" and "bold" when set. Bit 2
    selects "serifed" over "sans" and bit 3 selects "monospaced" over
    "proportional", so those two positions always contribute a label.
    """
    labels = []
    if flags & 1:
        labels.append("superscript")
    if flags & 2:
        labels.append("italic")
    labels.append("serifed" if flags & 4 else "sans")
    labels.append("monospaced" if flags & 8 else "proportional")
    if flags & 16:
        labels.append("bold")
    return ", ".join(labels)

def parseMonsterName(page, output = sys.stdout, descriptive = False):
    """Write the monster names found on ``page`` to ``output``, one per line.

    Only red spans (color 0x811d2e) are considered. Red text of size 16 or 11
    is lower-cased and accumulated; if the accumulated text equals an entry of
    ``loreWords`` it is discarded. When a red ``' STR '`` span of size 9 is
    reached, whatever has been accumulated is written to ``output`` followed
    by a newline and the accumulator is reset.

    Args:
        page: object whose ``get_text("dict", flags=11)`` returns a dict with a
            ``"blocks"`` list of blocks -> ``"lines"`` -> ``"spans"``.
        output: writable text stream receiving the names (default: stdout).
            The names are written here rather than to a notebook-level global.
        descriptive: when true, also print each red span's text and font
            properties to stdout.
    """
    text = ""
    blocks = page.get_text("dict", flags=11)["blocks"]
    for b in blocks:  # iterate through the text blocks
        for l in b["lines"]:  # iterate through the text lines
            for s in l["spans"]:  # iterate through the text spans
                if s["color"] != 0x811d2e:
                    continue
                size = int(s["size"])
                if s["text"] == ' STR ' and size == 9:
                    # Start of a stat table: the red text gathered so far is the name.
                    output.write(text + '\n')
                    text = ""
                if descriptive:
                    print("")
                    font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                        s["font"],  # font name
                        flags_decomposer(s["flags"]),  # readable font flags
                        s["size"],  # font size
                        s["color"],  # font color
                    )
                    print("Text: '%s'" % s["text"])  # simple print of text
                    print(font_properties)
                # The ' STR ' span itself is size 9, so it never reaches this branch.
                if size == 16 or size == 11:
                    text = text + s["text"].lower()
                    if text in loreWords:
                        text = ""

def parsePages(document, output = sys.stdout, pageNum = 0, descriptive = False):
    """Run ``parseMonsterName`` over the whole document or over one page.

    Args:
        document: iterable of pages that also supports integer indexing.
        output: stream handed through to ``parseMonsterName``.
        pageNum: 0 processes every page; a positive value processes only that
            1-based page (``document[pageNum-1]``); a negative value processes
            nothing.
        descriptive: handed through to ``parseMonsterName``.
    """
    if pageNum == 0:
        selectedPages = document
    elif pageNum > 0:
        selectedPages = [document[pageNum-1]]
    else:
        selectedPages = []
    for page in selectedPages:
        parseMonsterName(page, output, descriptive)

Parse the PDF for monster names and write them to a file, one name per line

In [5]:
# out = open('output.txt', 'w', encoding='utf-8')

# parsePages(doc)

# out.close()

Create a reasonably clean raw-text dump of the entire PDF

In [6]:
# out = open('goodRaw.txt', 'w', encoding='utf-8')
# for page in doc:
#     text = page.get_text("text", sort=False)
#     out.write(text)

# out.close()

Look at text properties

In [13]:
# Debug cell: print every span on one page together with its font properties
# and the category checkTextType assigns to it. Red heading text (size 16 or
# 11) is gathered along the way and printed when the ' STR ' header is reached.
statBlockFlag = False
text = ""
page = doc[34] #pagenum - 1
blocks = page.get_text("dict", flags=11)["blocks"]
for b in blocks:  # text blocks on the page
    for l in b["lines"]:  # lines inside the block
        for s in l["spans"]:  # spans inside the line
            print("")
            font_properties = (
                f"Font: '{s['font']}' ({flags_decomposer(s['flags'])}), "
                f"size {s['size']:g}, color #{s['color']:06x}"
            )
            print("Text: '%s'" % s["text"])
            print(font_properties)
            print(checkTextType(s))
            if s["color"] != 0x811d2e:
                # Only red spans take part in the name tracking below.
                continue
            if s["text"] == ' STR ' and int(s["size"]) == 9:
                # Stat table header: show the red text gathered before it.
                print(text)
                statBlockFlag = True
                text = ""
            if statBlockFlag:
                statBlockFlag = False
            elif int(s["size"]) in (16, 11):
                text = text + s["text"].lower()
                if text in loreWords:
                    text = ""



Text: '31'
Font: 'BauerBodoniLT-Bold' (serifed, proportional, bold), size 8.4208, color #000000
textType.Other

Text: 'G'
Font: 'BauerBodoniLT-Black-SC70' (serifed, proportional), size 7.5, color #000000
textType.Other

Text: 'rim'
Font: 'BauerBodoniLT-Black-SC70' (serifed, proportional), size 5.25, color #000000
textType.Other

Text: ' H'
Font: 'BauerBodoniLT-Black-SC70' (serifed, proportional), size 7.5, color #000000
textType.Other

Text: 'ollow'
Font: 'BauerBodoniLT-Black-SC70' (serifed, proportional), size 5.25, color #000000
textType.Other

Text: ' '
Font: 'BodoniSvtyTwoSCITCTT-Boo' (serifed, proportional), size 7.5, color #000000
textType.Other

Text: 'm'
Font: 'BauerBodoniLT-Black-SC70' (serifed, proportional), size 7.5, color #000000
textType.Other

Text: 'onster'
Font: 'BauerBodoniLT-Black-SC70' (serifed, proportional), size 5.25, color #000000
textType.Other

Text: ' G'
Font: 'BauerBodoniLT-Black-SC70' (serifed, proportional), size 7.5, color #000000
textType.Other

Text: '

Old implementation

In [8]:
# lore = open('lore.txt', 'w', encoding='utf-8')
# statBlocks = open('statBlocks.txt', 'w', encoding='utf-8')
# textTypeFlag = textType.Other
# redFlag = False
# buffer = ""
# if s['font'] == 'GoudyTextMT':
#     textTypeFlag = textType.Lore
#     #print line to lore
#     lore.write(s['text'] + '\n')
# elif s["color"] == 0x811d2e:
#     if s['text'] == 'GM Advice:':
#         textTypeFlag = textType.Lore
#         #print line to lore
#         lore.write(s['text'] + '\n')
#     elif not redFlag:
#         redFlag = True
#         buffer = s['text'].lower()
#     elif redFlag and (s['text'] in loreWords):
#         buffer = buffer + ['text'].lower()
#         #print buffer to lore
#         lore.write(buffer + '\n')
#         textTypeFlag = textType.Lore
#         redFlag = False
#     elif redFlag and not (s['text'] in loreWords):
#         buffer = buffer + ['text'].lower()
#         #print buffer to monsters
#         statBlocks.write(buffer + '\n')
#         textTypeFlag = textType.StatBlock
#         redFlag = False
#     else:
#         #print according to textTypeFlag
#         if textTypeFlag == textType.Lore:
#             lore.write(s['text'] + '\n')
#         elif: textTypeFlag == textType.StatBlock:
#             statBlocks.write(s['text'] + '\n')
# lore.close()
# statBlocks.close()

In [7]:
# Known monster names, one per line of MonsterNameList.txt, with the trailing
# newline removed. writeStatBlock looks lower-cased lines up in this list.
# NOTE(review): no encoding is passed here, unlike the other open() calls in
# this notebook -- confirm the platform default is right for this file.
nameList = []
with open('MonsterNameList.txt') as names:
    nameList.extend(rawName.strip('\n') for rawName in names)

def writeStatBlock(line, out):
    """Write one line of stat-block text to ``out`` with light formatting.

    The line is stripped of surrounding whitespace, then handled by the first
    matching rule:

    * its lower-cased form is in ``nameList``: write a ``===`` separator line
      followed by the name formatted with ``formatName``;
    * it contains the ability header ``statList`` (case-insensitively): write
      the text up to the end of the header on one line and the rest on the next;
    * it is in ``attributeList``, contains "passive" without "Perception", or
      contains ``:``: write it followed by a space instead of a newline, so the
      text written next continues on the same line;
    * otherwise: write it followed by a newline.

    Args:
        line: the text to write.
        out: writable text stream.
    """
    line = line.strip('\n').strip()
    lowered = line.lower()
    if lowered in nameList:
        # Monster name: start a new record.
        out.write('===\n')
        out.write(str(formatName(lowered)) + '\n')
        return

    # Locate the header in the same lower-cased text it was detected in, and cut
    # right after the whole header; searching the raw line for 'cha' misses
    # upper-case headers and can match an earlier "cha" substring.
    headerStart = lowered.find(statList)
    if headerStart != -1:
        split = headerStart + len(statList)
        out.write(line[:split] + '\n')
        out.write(line[split:] + '\n')
    elif (line in attributeList) or ("passive" in line and not "Perception" in line) or (':' in line):
        out.write(line + " ")
    else:
        # All other lines get printed as-is; works well with paragraphs of text.
        out.write(line + '\n')

def parsePageText(page, loreOutput = sys.stdout, statBlocksOutput = sys.stdout, 
                  textTypeFlag = textType.Other, redFlag = False, buffer = "", sectionTitle = ""):
    """Route the text spans of one page to the lore or the stat-block output.

    The parsing state is passed in through the parameters and handed back in
    the return value, so a caller can carry it from one page to the next.

    Per span (spans whose text is a single space are skipped):

    * 'GoudyTextMT' font: written to ``loreOutput``, remembered as the section
      title, and the current category becomes Lore.
    * red text (color 0x811d2e): 'GM Advice:' is written to ``loreOutput`` and
      switches the category to Lore; any other red text is lower-cased and
      accumulated in ``buffer`` because red phrases are often split across spans.
    * anything else: a pending red ``buffer`` is resolved first -- it goes to
      ``loreOutput`` if it is in ``loreWords``, otherwise to ``writeStatBlock``
      -- and sets the category accordingly. The span itself is then written
      according to the current category (nothing is written while the category
      is still ``textType.Other``).

    Args:
        page: object whose ``get_text("dict", flags=11)`` returns a dict with a
            ``"blocks"`` list of blocks -> ``"lines"`` -> ``"spans"``.
        loreOutput: writable text stream for lore text.
        statBlocksOutput: writable text stream for stat-block text.
        textTypeFlag: category in effect when the page starts.
        redFlag: whether ``buffer`` holds red text not yet written.
        buffer: accumulated red text.
        sectionTitle: most recent section title.

    Returns:
        The tuple ``(textTypeFlag, redFlag, buffer, sectionTitle)`` as it
        stands after the last span. ``buffer`` is not cleared when it is
        written out; ``redFlag`` tells whether it is still pending.
    """
    blocks = page.get_text("dict", flags=11)["blocks"]
    for b in blocks:  # iterate through the text blocks
        for l in b["lines"]:  # iterate through the text lines
            for s in l["spans"]:  # iterate through the text spans
                if s['text'] == ' ':
                    continue
                if s['font'] == 'GoudyTextMT':  # Section titles use this font
                    textTypeFlag = textType.Lore
                    loreOutput.write(s['text'] + '\n')
                    sectionTitle = s['text']
                elif s["color"] == 0x811d2e:  # Red text
                    if s['text'] == 'GM Advice:':  # Common red lore text
                        textTypeFlag = textType.Lore
                        loreOutput.write(s['text'] + '\n')
                    elif not redFlag:
                        # First piece of a red phrase: start a fresh buffer.
                        redFlag = True
                        buffer = s['text'].lower()
                    else:
                        buffer = buffer + s['text'].lower()
                else:
                    # Non-title, non-red text: any buffered red phrase is now
                    # complete and decides which output the following text uses.
                    if redFlag:
                        if buffer in loreWords:  # lore-related subsection
                            loreOutput.write(buffer + '\n')
                            textTypeFlag = textType.Lore
                        else:
                            # All other red text is assumed to belong to a stat
                            # block, though there are edge cases.
                            writeStatBlock(buffer, statBlocksOutput)
                            textTypeFlag = textType.StatBlock
                        redFlag = False
                    if textTypeFlag == textType.Lore:
                        loreOutput.write(s['text'] + '\n')
                    elif textTypeFlag == textType.StatBlock:
                        # Written once, through writeStatBlock only; also writing
                        # the raw span would duplicate every stat-block line.
                        writeStatBlock(s['text'], statBlocksOutput)
    return (textTypeFlag, redFlag, buffer, sectionTitle)

In [19]:
def recoverpix(doc, item):
    """Return the image described by ``item`` as a dict with its raw bytes.

    Args:
        doc: the document object; ``extract_image`` and ``xref_object`` are
            called on it.
        item: sequence whose element 0 is the image xref and element 1 is the
            xref of its mask (0 when there is none).

    Returns:
        A dict with at least the keys ``"ext"`` and ``"image"``:

        * with a mask: the base image combined with the mask, encoded as
          ``"pam"`` when the base pixmap has more than 3 components and as
          ``"png"`` otherwise; if combining fails, the unmasked base image is
          encoded instead;
        * with a ``/ColorSpace`` entry in the object definition: the image
          converted to RGB and encoded as PNG;
        * otherwise: whatever ``doc.extract_image(xref)`` returns.
    """
    xref = item[0]  # xref of PDF image
    smask = item[1]  # xref of its /SMask

    # special case: /SMask or /Mask exists
    if smask > 0:
        pix0 = fitz.Pixmap(doc.extract_image(xref)["image"])
        if pix0.alpha:  # catch irregular situation
            pix0 = fitz.Pixmap(pix0, 0)  # remove alpha channel
        mask = fitz.Pixmap(doc.extract_image(smask)["image"])

        try:
            pix = fitz.Pixmap(pix0, mask)
        except Exception:
            # Fall back to the original base image in case of problems. Only
            # ordinary errors are caught, so KeyboardInterrupt and SystemExit
            # still propagate.
            pix = fitz.Pixmap(doc.extract_image(xref)["image"])

        if pix0.n > 3:
            ext = "pam"
        else:
            ext = "png"

        return {  # create dictionary expected by caller
            "ext": ext,
            "colorspace": pix.colorspace.n,
            "image": pix.tobytes(ext),
        }

    # special case: /ColorSpace definition exists
    # to be sure, we convert these cases to RGB PNG images
    if "/ColorSpace" in doc.xref_object(xref, compressed=True):
        pix = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(fitz.csRGB, pix)
        return {  # create dictionary expected by caller
            "ext": "png",
            "colorspace": 3,
            "image": pix.tobytes("png"),
        }
    return doc.extract_image(xref)

def parsePageImages(document, page, section_title, pageNum):
    """Save the large, non-background images of ``page`` under ``namedImages``.

    Each kept image is written to
    ``namedImages/<section_title>_pg<pageNum>_<n>.png`` where ``n`` counts the
    images saved from this page, starting at 0. The directory is created if it
    does not exist yet.

    Skipped images:

    * 1293x1688 and 1298x1688 images (page backgrounds);
    * images whose extracted data is 204800 bytes (200 KiB) or smaller.

    Args:
        document: passed through to ``recoverpix``.
        page: object whose ``get_images()`` yields items with the width at
            index 2 and the height at index 3.
        section_title: used as the file-name prefix.
            NOTE(review): not sanitized -- confirm titles never contain
            characters that are invalid in file names.
        pageNum: page number used in the file name.

    NOTE(review): files always get a ``.png`` suffix even though ``recoverpix``
    reports its own ``"ext"`` -- confirm this is intended.
    """
    workdir = 'namedImages'
    os.makedirs(workdir, exist_ok=True)
    imageCounter = 0
    for i in page.get_images():
        width = i[2]
        height = i[3]
        # Cheap dimension test first, so page backgrounds are never extracted.
        if width in (1293, 1298) and height == 1688:
            continue
        image = recoverpix(document, i)
        if len(image["image"]) <= 204800:
            continue
        fileName = f"{section_title}_pg{pageNum}_{imageCounter}.png"
        # os.path.join keeps the path portable and avoids the backslash escape.
        with open(os.path.join(workdir, fileName), "wb") as imgFile:
            imgFile.write(image["image"])
        imageCounter = imageCounter + 1


In [11]:
# Split each page in the range into lore text and stat-block text, writing one
# file per page under lore/ and statBlocks/. The parsing state returned by
# parsePageText is fed back in so it carries over from one page to the next.
os.makedirs('lore', exist_ok=True)
os.makedirs('statBlocks', exist_ok=True)
text_type = textType.Other
red_text = False
buffer = ""
section_title = ""
# page = doc[9] #pagenum - 1
# NOTE(review): range(9, 407) passes indices 9..406 to doc.load_page -- confirm
# against the intended "pages 9-407".
for i in range(9, 407): #pages 9-407
    lorePath = os.path.join('lore', 'page%d.txt' % (i-3))
    statBlocksPath = os.path.join('statBlocks', 'page%d.txt' % (i-3))
    # Both files are opened per page and closed by the with-statement, even if
    # parsing raises.
    with open(lorePath, 'w', encoding='utf-8') as lore, \
         open(statBlocksPath, 'w', encoding='utf-8') as statBlocks:
        page = doc.load_page(i)
        text_type, red_text, buffer, section_title = parsePageText(page, lore, statBlocks, text_type, red_text, buffer, section_title)

        #parsePageImages(doc, page, section_title, i)

In [12]:
# images = page.get_images()
# imageCounter = 0
# workdir = 'namedImages'
# print(images)
# for i in images:
#     image = recoverpix(doc, i)
#     imgDir = open(f"namedImages\{section_title}_{str(imageCounter)}.png", "wb")
#     imgDir.write(image["image"])
#     imgDir.close()
#     imageCounter = imageCounter + 1
    

In [9]:
def checkForProperty(span):
    """Report which formatting property a span carries.

    Checks are made in this order and the first match wins:

    * font 'GoudyTextMT' (section titles)            -> ``textProperty.Title``
    * color 0x811d2e (red text)                       -> ``textProperty.Red``
    * a bold Bookman font, or the text '• ' (bullet)  -> ``textProperty.Break``
    * anything else                                   -> ``textProperty.Other``
    """
    if span['font'] == 'GoudyTextMT':
        return textProperty.Title
    if span["color"] == 0x811d2e:
        return textProperty.Red
    boldFonts = ('BookmanOldStyle-Bold', 'BookmanOldStyle-BoldItal')
    if span['font'] in boldFonts or span['text'] == '• ':
        return textProperty.Break
    return textProperty.Other

In [11]:
# Experimental cell: walk the spans of one page, clump consecutive spans into
# `buffer`, and flush the clump into a lore or a stat-block string depending on
# the property (checkForProperty) of the previous span.
page = doc[10] #Blightscale: 34, angel of empyreus: 13
blocks = page.get_text("dict", flags=11)["blocks"]

# Index into outputBuffer of the output currently being written.
# NOTE(review): -1 indexes the last element of the two-element outputBuffer,
# so text flushed before any category is chosen goes to the statblock string
# -- confirm this is intended.
activeOutput = -1
lastProperty = textProperty.Other
buffer = ""
outputBuffer = ["",""] #0 for lore, 1 for statblock
# NOTE(review): same call as a few lines above; `blocks` is fetched twice.
blocks = page.get_text("dict", flags=11)["blocks"]
for b in blocks:  # iterate through the text blocks
    for l in b["lines"]:  # iterate through the text lines
        for s in l["spans"]:  # iterate through the text spans
            # Each branch looks at the PREVIOUS span's property and decides
            # whether the text clumped so far must be flushed before the
            # current span is appended.
            if lastProperty == textProperty.Title:
                #Write and clear buffer
                activeOutput = 0
                outputBuffer[activeOutput] = outputBuffer[activeOutput] + buffer + '\n'
                buffer = "" 
            elif lastProperty == textProperty.Red:
                # Red text keeps accumulating until the first non-red span.
                if not s["color"] == 0x811d2e:
                    if buffer == 'GM Advice:':
                        activeOutput = 0
                    else:
                        buffer = formatName(buffer.lower())
                        if buffer.lower() in loreWords:
                            activeOutput = 0
                        elif buffer.lower() in nameList:
                            # A monster name starts a new stat-block record.
                            activeOutput = 1
                            outputBuffer[activeOutput] = outputBuffer[activeOutput] + "===\n"
                        else:
                            #Need to add splitting stat lines
                            activeOutput = 1
                    outputBuffer[activeOutput] = outputBuffer[activeOutput] + buffer + '\n'
                    buffer = ""
            elif lastProperty == textProperty.Other:
                # Plain text is flushed as soon as a span with any other property follows.
                if checkForProperty(s) != textProperty.Other:
                    outputBuffer[activeOutput] = outputBuffer[activeOutput] + buffer + '\n'
                    buffer = "" 
                
            # There is no branch for textProperty.Break: after a Break span the
            # current span is appended without flushing.
            buffer = buffer + s['text']                
            lastProperty = checkForProperty(s)
#             print(buffer)
#             print(lastProperty)
# Flush whatever is still clumped after the last span.
if buffer != "":
    outputBuffer[activeOutput] = outputBuffer[activeOutput] + buffer + '\n'
    
print("LORE\n" + outputBuffer[0])
print("STATBLOCK\n" + outputBuffer[1])

LORE
Lore
DC 10 Intelligence (History): Aberrant horrors are the results of failed arcane experiments that twist the body into unnatural forms.
DC 15 Intelligence (Arcana): Each aberrant horror is different. Some grow extra limbs, others dissolve into oozelike creatures, and others swell to massive proportions. They contain eldritch energy that can fuel necromancy and transmutation.
GM Advice:
 The arcane origins of aberrant horrors give you a great deal of room to challenge your players with unusual traits. For example, one horror might only take damage when it’s under the effects of a bane or bless spell, but ferreting out this information requires the characters to face it once and succeed on a DC 10 Intelligence (Arcana) check. The secret to pulling this off is to make the odd traits easy to learn and possible to accomplish without overly taxing the characters.

STATBLOCK
7Grim Hollow monster GrimoireA
===
Amorphous Horror
Medium aberration, any alignment
Armor 
Class 14 (natural a