In [7]:
# This script extracts the text of all tracked insertions from a Word document and writes it to a new Word file.
# Input file: a DOCX file with tracked changes
# Output file: a DOCX file containing ONLY the plain text of the tracked insertions found in the input file

In [None]:
pip install python-docx

In [None]:
import zipfile
from lxml import etree
from docx import Document

In [2]:
def extract_insertions_from_docx(docx_path, verbose=True):
    """Collect the text of every tracked insertion in a DOCX file's main document part.

    The DOCX is opened as a ZIP archive and ``word/document.xml`` is searched
    for ``<w:ins>`` elements. For each one, only the text of its ``<w:t>``
    descendants is kept, so text held in other elements (for example
    ``<w:delText>`` from a deletion nested inside the insertion) and any
    whitespace between XML elements are not included in the result.

    Note: a ``<w:ins>`` nested inside another ``<w:ins>`` is reported both on
    its own and as part of its ancestor.

    Args:
        docx_path: Path to the DOCX file, or any other input accepted by
            ``zipfile.ZipFile``.
        verbose: When true (the default), print each insertion as it is found.

    Returns:
        A list of strings, one per ``<w:ins>`` element whose text is not
        blank, in the order the elements are found.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Namespace dictionary to resolve the "w:" prefix used in the XPath queries
    namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    # Read the main document part, then release the archive before parsing
    with zipfile.ZipFile(docx_path, 'r') as docx:
        xml_content = docx.read('word/document.xml')
    tree = etree.XML(xml_content)

    insertions = []
    for ins in tree.findall('.//w:ins', namespaces):
        # Join only <w:t> runs; an empty <w:t/> has text None, hence the `or ''`
        ins_text = ''.join(
            text_node.text or ''
            for text_node in ins.iterfind('.//w:t', namespaces)
        )
        if ins_text.strip():  # Only capture non-empty insertions
            insertions.append(ins_text)
            if verbose:
                print(f"Found insertion: {ins_text}")

    return insertions

In [None]:
def save_insertions_to_docx(insertions, output_path):
    """Write each insertion as its own paragraph in a new DOCX file.

    Args:
        insertions: Iterable of strings; each becomes one paragraph.
        output_path: Destination passed to the new document's ``save``.
    """
    output_document = Document()
    for paragraph_text in insertions:
        output_document.add_paragraph(paragraph_text)
    output_document.save(output_path)

# Input and output locations: edit these to point at your own files
doc_path = 'C:/Users/User/uploaded_file.docx'
output_doc_path = 'insertions_only.docx'

# Pull the tracked insertions out of the input file, then write them to the output file
insertions = extract_insertions_from_docx(doc_path)
save_insertions_to_docx(insertions, output_doc_path)

print(f"Insertions have been extracted and saved to '{output_doc_path}'")