In [None]:
import os
import csv
import pickle
import json
import argparse
import time
import pythoncom
from win32com.client import Dispatch
from tqdm.auto import tqdm
import spacy
import logging
import random
import re

In [None]:
def reverse_index_conversion(x, jump_points):
    """
    Shift the index ``x`` forward by one for every jump point it has reached.

    Jump points are visited in the order given. Each one is compared against
    ``x`` after subtracting the shift accumulated so far; as soon as ``x``
    lies before a (shift-adjusted) jump point, the scan stops.
    NOTE(review): this early stop presumably assumes ``jump_points`` is
    sorted ascending -- confirm against the CheckJumps macro output.

    Returns ``x`` plus the number of jump points counted.
    """
    shift = 0
    for jump in jump_points:
        # First jump point that lies beyond x ends the scan.
        if x < jump - shift:
            break
        shift += 1
    return x + shift

def get_sentence_pairs(path):
    """
    Extract lists of the original and revised sentences
    for the Word document indicated by the supplied path.

    Relies on module-level globals rather than parameters:
    ``myWord`` (the Word COM application), ``tokenizer`` ("nltk" or "spacy")
    and the matching ``sentence_tokenizer`` / ``nlp`` object. The VBA macros
    CheckJumps, CheckBoundaryRevisions and ExtractSentences are run after
    opening ``macro.docm`` from the current working directory.

    Side effects: all tables are deleted and all fields unlinked in the
    document, and that state is SAVED back to ``path``; equations are
    removed afterwards (not saved here).
    NOTE(review): the document is left open in Word when the function
    returns -- the macros are not visible from here, so it is unclear whether
    closing it would be safe; confirm and close it if so.

    Returns a tuple ``(sentences, n_spans)``: the JSON-decoded result of the
    ExtractSentences macro and the number of sentence spans after joining.

    Raises ValueError if the global ``tokenizer`` is not "nltk" or "spacy".
    """
    # Fail fast, before Word modifies and saves the file: previously an
    # unrecognised tokenizer only surfaced later as an unbound `spans`.
    if tokenizer not in ("nltk", "spacy"):
        raise ValueError(
            "Unsupported tokenizer {!r}; expected 'nltk' or 'spacy'".format(tokenizer)
        )

    macro_path = os.path.abspath("macro.docm")

    myWord.Visible = False
    doc = myWord.Documents.Open(path)
    doc.TrackRevisions = False
    # 2 is presumably wdRevisionsMarkupAll -- TODO confirm against the Word enum.
    doc.ActiveWindow.View.RevisionsFilter.Markup = 2

    # Delete by descending index: removing items while iterating the live
    # COM collection can skip elements, and deleting from the end keeps the
    # remaining (lower) indices valid.
    for table_index in range(doc.Tables.Count, 0, -1):
        doc.Tables(table_index).Delete()
    doc.Fields.Unlink()
    # NOTE(review): this overwrites the input file with tables/fields stripped.
    doc.Save()
    for omath_index in range(doc.OMaths.Count, 0, -1):
        doc.OMaths(omath_index).Remove()

    #Generate sentence spans
    text = doc.Content.Text

    if tokenizer == "nltk":
        spans = list(sentence_tokenizer.span_tokenize(text))
    else:  # "spacy" (validated above)
        spans = [(s.start_char, s.end_char) for s in nlp(text).sents]

    #Calculate jump points in the Doc indices
    # macro.docm is (re-)opened before every Run call, exactly as before;
    # NOTE(review): probably only the first open is required.
    myWord.Documents.Open(macro_path)
    jumps = json.loads(myWord.Application.Run("CheckJumps", path))

    #Check for sentences with revised boundaries
    span_ends_string = json.dumps([reverse_index_conversion(e, jumps) for _, e in spans])
    myWord.Documents.Open(macro_path)
    joins = json.loads(myWord.Application.Run("CheckBoundaryRevisions",
                                              span_ends_string,
                                              path))

    # The last span has no successor to be merged with.
    if joins and joins[-1] == len(spans) - 1:
        joins.pop()

    #Join sentences with revised boundaries
    # Walk backwards so earlier join indices stay valid after each merge.
    for join in joins[::-1]:
        spans[join:join + 2] = [(spans[join][0], spans[join + 1][1])]

    #Convert sentence spans to Doc indices and extract sentence pairs
    # Both span lists are handed to the macro in reverse order.
    reversed_spans = spans[::-1]
    converted_spans = [(reverse_index_conversion(s, jumps),
                        reverse_index_conversion(e, jumps)) for s, e in reversed_spans]
    myWord.Visible = False
    myWord.Documents.Open(macro_path)
    result = myWord.Application.Run("ExtractSentences",
                                    json.dumps(converted_spans),
                                    json.dumps(reversed_spans),
                                    text,
                                    path)
    sentences = json.loads(result)

    return sentences, len(spans)

In [None]:
# Globals read by get_sentence_pairs (it takes only the document path).
# COM automation object for Microsoft Word.
myWord = Dispatch("Word.Application")
# Selects the sentence splitter: "spacy" uses `nlp`; "nltk" would require a
# `sentence_tokenizer` object, which is not defined in the cells shown here.
tokenizer = "spacy"
nlp = spacy.load("en_core_web_sm")

In [None]:
# Raw string: the Windows path contains "\p" and "\O", which are invalid
# escape sequences in a normal string literal (warning today, error in future
# Python versions). The resulting path value is unchanged.
# NOTE(review): hard-coded absolute local path -- consider a configurable constant.
sentences, total_num_sents = get_sentence_pairs(r"D:\proofread\O-2016-000061-翻譯105.1.4--edited final___.docx")

In [None]:
# Sanity check: number of sentence spans found by the tokenizer (after joins)
# next to the number of original sentences returned by the macro.
print(total_num_sents, len(sentences["OriginalSentences"]))

In [None]:
# Spot-check one original/revised pair (displayed as a tuple).
# Index 200 is hard-coded; presumably these are lists, so a document yielding
# fewer than 201 sentences would raise IndexError here -- verify.
sentences["OriginalSentences"][200], sentences["RevisedSentences"][200]