Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add features of multi-cpu processing #40

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 51 additions & 35 deletions errant/commands/parallel_to_m2.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,24 @@
import argparse
from contextlib import ExitStack
import errant
from multiprocessing import Pool
from tqdm import tqdm

# Load the Errant annotator once at module level: both main() and
# extract_edits() refer to `annotator` as a module-global name.
print("Loading resources...")
# Load Errant
annotator = errant.load("en")

def main():
    """Convert parallel original/corrected files to M2 using a process pool.

    Reads ``args.orig`` and every file in ``args.cor`` in lockstep, hands each
    tuple of aligned lines to ``extract_edits`` in a pool of ``args.worker``
    processes, and writes every non-empty result to ``args.out``.
    """
    print("Processing parallel files...")
    # Process an arbitrary number of files line by line simultaneously. Python 3.3+
    # See https://tinyurl.com/y4cj4gth . Also opens the output m2 file.
    with ExitStack() as stack, open(args.out, "w") as out_m2:
        in_files = [stack.enter_context(open(i)) for i in [args.orig]+args.cor]
        with Pool(args.worker) as pool:
            # imap yields results in the same order as the input lines, so the
            # output m2 file keeps the sentence order of the input files.
            results = pool.imap(extract_edits, zip(*in_files), chunksize=512)
            # Wrap the results (not the input) in tqdm: the pool reads its
            # input ahead of the workers, so only the result stream reflects
            # how many lines have actually been processed.
            for res in tqdm(results):
                # extract_edits returns "" for lines whose orig text is empty
                if res:
                    out_m2.write(res)

# Parse command line args
def parse_args():
Expand Down Expand Up @@ -81,10 +57,50 @@ def parse_args():
"all-equal: Merge adjacent same-type non-matches: MSSDI -> M, SS, D, I",
choices=["rules", "all-split", "all-merge", "all-equal"],
default="rules")
parser.add_argument(
"-worker",
help="The number of multi-processing workers.",
type=int,
default=16,
)
args=parser.parse_args()
return args

# Parse command line args at module level: main() and extract_edits() both
# read `args` as a module-global name instead of receiving it as a parameter.
args = parse_args()

def extract_edits(line):
    """Build the M2 text for one tuple of aligned input lines.

    Args:
        line: A sequence of strings; ``line[0]`` is the original sentence and
            the remaining items are its corrections. The position of each
            correction is used as its coder id.

    Returns:
        The M2 text for this sentence: an ``S`` line, the edit lines of every
        correction, and a terminating blank line. Returns ``""`` when the
        original sentence is empty after stripping.
    """
    # Get the original and all the corrected texts
    orig = line[0].strip()
    cors = line[1:]
    # Skip the line if orig is empty
    if not orig:
        return ""
    # Parse orig with spacy
    orig = annotator.parse(orig, args.tok)
    # Collect the output lines and join them once at the end rather than
    # growing a string with repeated +=
    out_lines = [" ".join(["S"]+[token.text for token in orig])]
    # Loop through the corrected texts
    for cor_id, cor in enumerate(cors):
        cor = cor.strip()
        # If the texts are the same, emit a noop edit
        if orig.text.strip() == cor:
            out_lines.append(noop_edit(cor_id))
        # Otherwise, do extra processing
        else:
            # Parse cor with spacy
            cor = annotator.parse(cor, args.tok)
            # Align the texts and extract and classify the edits
            edits = annotator.annotate(orig, cor, args.lev, args.merge)
            # One m2 line per edit
            out_lines.extend(edit.to_m2(cor_id) for edit in edits)
    # An empty final entry yields the blank line that ends each sentence block
    out_lines.append("")
    return "\n".join(out_lines) + "\n"

# Input: A coder id
# Output: A noop edit; i.e. text contains no edits
def noop_edit(id=0):
    """Return the M2 "noop" edit line for the given coder id.

    Args:
        id: The coder id appended as the last ``|||``-separated field;
            converted with ``str``. Defaults to 0.

    Returns:
        The noop edit line, without a trailing newline.
    """
    return "A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||"+str(id)