Skip to content

Commit

Permalink
resolve hyphen issue on a basic level
Browse files Browse the repository at this point in the history
  • Loading branch information
emareg committed Jul 29, 2020
1 parent 17d0f72 commit d839993
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 47 deletions.
46 changes: 5 additions & 41 deletions papercheck/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,45 +85,7 @@
######################################################################################


def readInputFile(fileName):
    """Read a document from disk and return its content as text.

    Supported extensions (case-insensitive): ``pdf`` (converted by running
    the external ``pdftotext`` program) and ``txt`` / ``tex`` / ``md``
    (read and decoded as UTF-8).  After reading, words that were split
    across a line break with a hyphen are joined again.

    Args:
        fileName: path of the file to read; a leading ``~`` is expanded.

    Returns:
        The text of the file as a ``str``.

    Raises:
        ValueError: if the extension is not one of the supported ones.
        OSError: if the file cannot be opened.
    """
    import os

    ext = fileName.lower().split(".")[-1]
    fileName = os.path.expanduser(fileName)

    # Opening the file first keeps the "missing file" error ahead of every
    # other check; the ``with`` block guarantees the handle is closed on all
    # paths, including the PDF branch and the error branch.
    with open(fileName, "rb") as inFileHandler:
        if ext == "pdf":
            import subprocess

            args = [
                "pdftotext",
                "-enc",
                "UTF-8",
                # An absolute path works regardless of how the caller
                # spelled the file name.
                os.path.abspath(fileName),
                "-",  # write the extracted text to stdout
            ]
            res = subprocess.run(
                args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            text = res.stdout.decode("utf-8")
            # remove lines that consist of one or two word characters only
            text = re.sub(r"(?<=\n)\w\w?(?=\n)", "", text)
            text = text.replace("\f", "")  # remove page breaks
            # Replace the single-code-point ligatures by plain letters.
            text = text.replace("\ufb01", "fi")  # U+FB01 ligature fi
            text = text.replace("\ufb00", "ff")  # U+FB00 ligature ff

        elif ext in ["txt", "tex", "md"]:
            text = inFileHandler.read().decode("utf-8")

        else:
            raise ValueError("unknown extension: " + ext)

    # Resolve hyphenation: "exam-\nple " becomes "example\n".  The repetition
    # must be written {2,7}; "{2:7}" is not a quantifier and would be matched
    # as literal characters, so nothing would ever be joined.
    text = re.sub(r"\s(\w{2,7})-\n(\w{2,7})\s", r" \1\2\n", text)

    return text


def writeOutputFile(fileName, text):
Expand Down Expand Up @@ -252,8 +214,9 @@ def createHTMLreport(lines, linenums=[[], [], []], stats=""):

def parseFile(fileName, args):
fileBaseName, ext = os.path.splitext(fileName)
fileBaseName = os.path.basename(fileBaseName)

text = readInputFile(fileName)
text = readTextFromFile(fileName)

global outputLines
global G_filename
Expand Down Expand Up @@ -313,8 +276,9 @@ def parseFile(fileName, args):
output = createHTMLreport(
outputLines, [grammar_linenums, style_linenums, spell_linenums], stats
)
# with open(fileBaseName+'_check_report.html', "w+") as f:
with open(Path(args.filename).absolute(), "w+") as f:
with open(fileBaseName+'_papercheck.html', "w+") as f:
print(fileBaseName+'_papercheck.html')
# with open(Path(args.filename).absolute(), "w+") as f:
f.write(output)


Expand Down
76 changes: 70 additions & 6 deletions papercheck/lib/stripper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@


import re
import os
from pathlib import Path


def readTextFromFile(fileName):
import os

ext = fileName.lower().split(".")[-1]
fileName = os.path.expanduser(fileName)
Expand All @@ -28,23 +29,86 @@ def readTextFromFile(fileName):
]
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
text = res.stdout.decode("utf-8")
text = re.sub(r"(?<=\n)\w\w?(?=\n)", "", text) # remove lines with single word
text = re.sub(r"\f", "", text) # remove page breaks
text = re.sub(r"fi", "fi", text) # fi Ligature fi
text = re.sub(r"ff", "ff", text) # ff Ligature ff
text = stripPDFtoText(text)

elif ext in ["txt", "tex", "md"]:
text = inFileHandler.read().decode("utf-8")
inFileHandler.close()

elif ext == "html":
text = inFileHandler.read().decode("utf-8")
text = stripHTML(text)

else:
raise ValueError("unknown extension: " + ext)

text = re.sub(r"\s(\w{2:7})-\n(\w{2:7})\s", r" \1\2\n", text) # resolve hyphen
text = resolveHyphen(text)

return text



# Second halves that belong to genuine hyphenated compounds ("rule-based",
# "well-defined", ...); a hyphen in front of one of these is kept.
_HYPHEN_KEEP_SUFFIXES = frozenset(
    ["based", "constrained", "defined", "case", "related", "critical", "level"]
)


def resolveHyphen(text):
    """Join words that were hyphenated across a line break.

    A fragment of 2-7 word characters followed by ``-`` and a newline, then
    another fragment of 2-7 word characters and one whitespace character, is
    rewritten as the joined word followed by a newline, e.g.
    ``"an exam-\\nple here"`` becomes ``"an example\\nhere"``.

    The hyphen is left in place when the first part is ``self`` or the second
    part is one of the known compound suffixes.

    Args:
        text: the text to process.

    Returns:
        The text with line-break hyphenation resolved.
    """
    # note: pdftotext already removes hyphens
    # TODO: undo this: 1. figure out average line length, check if a line is
    # almost double the length, check if the word is misspelled.

    def _join(match):
        head, tail = match.group(1), match.group(2)
        if head == "self" or tail in _HYPHEN_KEEP_SUFFIXES:
            return match.group(0)  # genuine compound: keep as is
        return head + tail + "\n"

    # The whitespace in front of the word is only looked at, not consumed:
    # it therefore stays in the output (otherwise the previous word would be
    # glued to the joined one) and may also end the preceding match.
    return re.sub(r"(?<=\s)(\w{2,7})-\n(\w{2,7})\s", _join, text)



def _medianLineLength(lines):
    """Return the median length of the strings in *lines* as an int."""
    lengths = sorted(len(x) for x in lines)
    mid = len(lengths) // 2
    if len(lengths) % 2:
        return lengths[mid]
    return int((lengths[mid - 1] + lengths[mid]) / 2.0)


def stripPDFtoText(text):
    """Clean up text that was extracted from a PDF.

    Steps, in order:
      1. remove lines that consist of a single word,
      2. remove lines of at most 12 characters unless they start with a
         capital letter or a section number,
      3. collapse runs of four or more newlines to three,
      4. remove form feeds and replace the fi / ff ligature characters,
      5. re-break lines that are longer than 1.6 times the median line
         length.

    Args:
        text: raw text as produced by the PDF extraction.

    Returns:
        The cleaned text; empty input is returned unchanged.
    """
    text = re.sub(r"(?<=\n)\s?\w\w*?\s?(?=\n)", "", text)  # remove lines with single word
    # remove short lines that are not headings
    # NOTE(review): the "." in "\d.\d" is unescaped and matches any
    # character; presumably "\d\.\d" was meant - confirm before changing.
    text = re.sub(r"(?<=\n)(?!\d\.\s|\d.\d\d?\.|[A-Z]).{,12}(?=\n)", "", text)
    text = re.sub(r"\n{4,}", "\n\n\n", text)  # remove empty parts
    text = text.replace("\f", "")  # remove page breaks
    # Replace the single-code-point ligatures by plain letters.
    text = text.replace("\ufb01", "fi")  # U+FB01 ligature fi
    text = text.replace("\ufb00", "ff")  # U+FB00 ligature ff

    lines = text.splitlines(True)
    if not lines:
        # Nothing to measure: without lines there is no median length.
        return text

    medlen = _medianLineLength(lines)
    minl = int(medlen * 0.85)

    # The patterns depend only on the median, so build them once.
    # Re-insert "-\n" in front of a known compound suffix ...
    compoundSuffix = re.compile(
        rf"(.{{{minl},{medlen}}}\s\w{{4,10}})(based|constrained|defined|case|related|critical|level)"
    )
    # ... and after the prefix "self".
    selfPrefix = re.compile(
        rf"([^\n]{{{minl},{medlen}}}\s(?:self))(\w{{3,10}})"
    )
    # Break a line at a whitespace near the median length.  The lower bound
    # of the remainder is clamped at 0: a negative number would not form a
    # quantifier ("{-3,}" is matched as literal text) and the pattern could
    # never match.
    overlong = re.compile(
        rf"([^\n]{{{minl + 5},{medlen + 5}}})\s([^\n]{{{max(minl - 10, 0)},}})"
    )

    # find lines that are much longer than the median and split them
    for idx, line in enumerate(lines):
        if len(line) > 1.6 * medlen:
            line = compoundSuffix.sub(r"\1-\n\2", line)
            line = selfPrefix.sub(r"\1-\n\2", line)
            line = overlong.sub(r"\1\n\2", line)
            lines[idx] = line
    return "".join(lines)



# is this necessary? Will produce nice output. Maybe only insert spaces before hX and <p>HERE Start of a Sentence</p>
# provide line-offset?

def stripHTML(text):
    """Reduce an HTML document to the text content of its body.

    Everything up to and including the opening ``<body ...>`` tag and
    everything from ``</body>`` onwards is dropped.  Heading, paragraph,
    ``strong``, ``code`` and ``em`` elements are then replaced by their
    inner content, and ``&nbsp;`` entities are removed.

    An element is only unwrapped when its opening and closing tag are on the
    same line, because ``.`` does not match a newline in these patterns.

    Args:
        text: the HTML source.

    Returns:
        The stripped text.
    """
    # Cut away the document head and tail around the body.
    withoutHead = re.sub(r"^(?:.|\n)*?<body.*?>", "", text)
    content = re.sub(r"</body>(?:.|\n)*", "", withoutHead)

    # Applied in this order: headings, paragraphs, strong, code, em.
    unwrapPatterns = (
        r'<h\d[^>]*>((?:(?!</h\d>).)*?)</h\d>',
        r'<p[^>]*>((?:(?!</p>).)*?)</p>',
        r'<strong[^>]*>((?:(?!</strong>).)*?)</strong>',
        r'<code[^>]*>((?:(?!</code>).)*?)</code>',
        r'<em[^>]*>((?:(?!</em>).)*?)</em>',
    )
    for pattern in unwrapPatterns:
        content = re.sub(pattern, r'\1', content)

    return content.replace('&nbsp;', '')



def stripTeX(text, preserveLines=False):
linenum = 1
if preserveLines and "\\begin{" + "document}" in text:
Expand Down

0 comments on commit d839993

Please sign in to comment.