resolve hyphen issue on a basic level

emareg · Jul 29, 2020 · d839993 · d839993
1 parent 17d0f72
commit d839993
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 47 deletions.
diff --git a/papercheck/__main__.py b/papercheck/__main__.py
@@ -85,45 +85,7 @@
 ######################################################################################
 
 
-def readInputFile(fileName):
-    import os
 
-    ext = fileName.lower().split(".")[-1]
-    fileName = os.path.expanduser(fileName)
-    inFileHandler = open(fileName, "rb")
-
-    if ext == "pdf":
-        import subprocess
-
-        SCRIPT_DIR = os.getcwd()
-
-        if Path(fileName).is_absolute():
-            fileName = Path(os.path.relpath(Path(fileName), SCRIPT_DIR))
-
-        args = [
-            "pdftotext",
-            "-enc",
-            "UTF-8",
-            "{}/{}".format(SCRIPT_DIR, fileName),
-            "-",
-        ]
-        res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        text = res.stdout.decode("utf-8")
-        text = re.sub(r"(?<=\n)\w\w?(?=\n)", "", text)  # remove lines with single word
-        text = re.sub(r"\f", "", text)  # remove page breaks
-        text = re.sub(r"ﬁ", "fi", text)  # fi Ligature ﬁ
-        text = re.sub(r"ﬀ", "ff", text)  # ff Ligature ﬀ
-
-    elif ext in ["txt", "tex", "md"]:
-        text = inFileHandler.read().decode("utf-8")
-        inFileHandler.close()
-
-    else:
-        raise ValueError("unknown extension: " + ext)
-
-    text = re.sub(r"\s(\w{2:7})-\n(\w{2:7})\s", r" \1\2\n", text)  # resolve hyphen
-
-    return text
 
 
 def writeOutputFile(fileName, text):
@@ -252,8 +214,9 @@ def createHTMLreport(lines, linenums=[[], [], []], stats=""):
 
 def parseFile(fileName, args):
     fileBaseName, ext = os.path.splitext(fileName)
+    fileBaseName = os.path.basename(fileBaseName)
 
-    text = readInputFile(fileName)
+    text = readTextFromFile(fileName)
 
     global outputLines
     global G_filename
@@ -313,8 +276,9 @@ def parseFile(fileName, args):
         output = createHTMLreport(
             outputLines, [grammar_linenums, style_linenums, spell_linenums], stats
         )
-        # with open(fileBaseName+'_check_report.html', "w+") as f:
-        with open(Path(args.filename).absolute(), "w+") as f:
+        with open(fileBaseName+'_papercheck.html', "w+") as f:
+            print(fileBaseName+'_papercheck.html')
+        # with open(Path(args.filename).absolute(), "w+") as f:
             f.write(output)
 
 

diff --git a/papercheck/lib/stripper.py b/papercheck/lib/stripper.py
@@ -2,10 +2,11 @@
 
 
 import re
+import os
+from pathlib import Path
 
 
 def readTextFromFile(fileName):
-    import os
 
     ext = fileName.lower().split(".")[-1]
     fileName = os.path.expanduser(fileName)
@@ -28,23 +29,86 @@ def readTextFromFile(fileName):
         ]
         res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         text = res.stdout.decode("utf-8")
-        text = re.sub(r"(?<=\n)\w\w?(?=\n)", "", text)  # remove lines with single word
-        text = re.sub(r"\f", "", text)  # remove page breaks
-        text = re.sub(r"ﬁ", "fi", text)  # fi Ligature ﬁ
-        text = re.sub(r"ﬀ", "ff", text)  # ff Ligature ﬀ
+        text = stripPDFtoText(text)
 
     elif ext in ["txt", "tex", "md"]:
         text = inFileHandler.read().decode("utf-8")
         inFileHandler.close()
 
+    elif ext == "html":
+        text = inFileHandler.read().decode("utf-8")
+        text = stripHTML(text)
+
     else:
         raise ValueError("unknown extension: " + ext)
 
-    text = re.sub(r"\s(\w{2:7})-\n(\w{2:7})\s", r" \1\2\n", text)  # resolve hyphen
+    text = resolveHyphen(text)
+
+    return text
+
+
+
+def resolveHyphen(text):
+    """Re-join words that were hyphenated across a line break.
+
+    A word of 2-7 word characters that ends a line with ``-`` is merged with
+    the 2-7 character word that starts the next line, and the line break is
+    moved behind the joined word. Splits that look like genuine compounds are
+    left untouched: a first part equal to ``self`` or a second part listed in
+    ``keep_suffixes`` (e.g. ``rule-based``).
+
+    Args:
+        text: The text to clean up.
+
+    Returns:
+        The text with the matching line-break hyphenations removed.
+    """
+    # note: pdftotext already removes hyphens
+    # TODO: undo this: 1. figure out the average line length, 2. check if a
+    #       line is almost double that length, 3. check if a word is misspelled.
+    keep_suffixes = ("based", "constrained", "defined", "case", "related", "critical", "level")
+
+    def join(match):
+        first, second = match.group(1), match.group(2)
+        if first == "self" or second in keep_suffixes:
+            return match.group(0)  # genuine compound: keep hyphen and break
+        return first + second + "\n"
+
+    # The whitespace in front of the word is only asserted (look-behind), not
+    # consumed, so it stays in the text; consuming and dropping it would glue
+    # the joined word onto the preceding word. The whitespace behind the word
+    # is consumed and replaced by the relocated line break.
+    return re.sub(r"(?<=\s)(\w{2,7})-\n(\w{2,7})\s", join, text)
+
+
 
+def stripPDFtoText(text):
+    """Clean up text that was extracted from a PDF.
+
+    Steps, in order:
+      1. drop lines that consist of a single word,
+      2. drop lines of at most 12 characters unless they look like a heading
+         ("1. ", "1.2." / "1.12." or a leading capital letter),
+      3. collapse runs of four or more newlines to three,
+      4. remove page breaks and expand the "fi" / "ff" ligatures,
+      5. re-split lines that are much longer than the median line length.
+
+    Args:
+        text: The extracted text.
+
+    Returns:
+        The cleaned text. Empty input is returned unchanged.
+    """
+    text = re.sub(r"(?<=\n)\s?\w\w*?\s?(?=\n)", "", text)  # remove lines with single word
+    # remove short lines that are not headings
+    text = re.sub(r"(?<=\n)(?!\d\.\s|\d\.\d\d?\.|[A-Z]).{,12}(?=\n)", "", text)
+    text = re.sub(r"\n{4,}", "\n\n\n", text)  # remove empty parts
+    text = text.replace("\f", "")  # remove page breaks
+    text = text.replace("ﬁ", "fi")  # fi ligature
+    text = text.replace("ﬀ", "ff")  # ff ligature
+
+    lines = text.splitlines(True)
+    if not lines:
+        return text  # no lines, so there is no median to measure against
+
+    # median line length (line endings included), truncated to an int
+    lengths = sorted(len(line) for line in lines)
+    mid = len(lengths) // 2
+    if len(lengths) % 2:
+        medlen = lengths[mid]
+    else:
+        medlen = (lengths[mid - 1] + lengths[mid]) // 2
+    minl = int(medlen * 0.85)
+    # clamp: a negative bound would not form a valid "{n,}" quantifier
+    tail_min = max(minl - 10, 0)
+
+    # the patterns only depend on the median, so build them once
+    compound_suffix = re.compile(
+        rf"(.{{{minl},{medlen}}}\s\w{{4,10}})(based|constrained|defined|case|related|critical|level)"
+    )
+    self_prefix = re.compile(rf"([^\n]{{{minl},{medlen}}}\s(?:self))(\w{{3,10}})")
+    overlong = re.compile(rf"([^\n]{{{minl+5},{medlen+5}}})\s([^\n]{{{tail_min},}})")
+
+    ## find lines longer than 1.6 times the median and split them again
+    for idx, line in enumerate(lines):
+        if len(line) > 1.6 * medlen:
+            # put "-\n" back in front of a compound suffix / behind "self"
+            line = compound_suffix.sub(r"\1-\n\2", line)
+            line = self_prefix.sub(r"\1-\n\2", line)
+            # break the line at whitespace close to the median length
+            line = overlong.sub(r"\1\n\2", line)
+            lines[idx] = line
+    text = ''.join(lines)
     return text
 
 
+
+# is this necessary? Will produce nice output. Maybe only insert spaces before hX and <p>HERE Start of a Sentence</p>
+# provide line-offset?
+
+def stripHTML(text):
+    """Reduce an HTML document to the text inside its body.
+
+    Everything up to and including the opening ``<body ...>`` tag and
+    everything from ``</body>`` onwards is discarded. Heading, paragraph,
+    ``strong``, ``code`` and ``em`` elements are then replaced by their
+    content (only when the element's content contains no line break, since
+    ``.`` does not match a newline here), and ``&nbsp;`` entities are removed.
+    """
+    # cut off the document head and whatever follows the body
+    without_head = re.sub(r"^(?:.|\n)*?<body.*?>", "", text)
+    body = re.sub(r"</body>(?:.|\n)*", "", without_head)
+
+    # one pattern per element kind, applied in this order; group 1 is the
+    # element's content
+    unwrap_patterns = (
+        r'<h\d[^>]*>((?:(?!</h\d>).)*?)</h\d>',
+        r'<p[^>]*>((?:(?!</p>).)*?)</p>',
+        r'<strong[^>]*>((?:(?!</strong>).)*?)</strong>',
+        r'<code[^>]*>((?:(?!</code>).)*?)</code>',
+        r'<em[^>]*>((?:(?!</em>).)*?)</em>',
+    )
+    for pattern in unwrap_patterns:
+        body = re.sub(pattern, r'\1', body)
+
+    return body.replace('&nbsp;', '')
+
+
+
 def stripTeX(text, preserveLines=False):
     linenum = 1
     if preserveLines and "\\begin{" + "document}" in text: