In [None]:
from zipfile import ZipFile
from lxml import etree

# Input and output paths -- change these to match your setup.
# FILE_TO_READ is the .docx file passed to extract_comments_with_context().
FILE_TO_READ = "p1-transcript-gr.docx"
# FILE_TO_WRITE is the Markdown file that receives the comments table.
FILE_TO_WRITE = "p1-comments.md"

def extract_comments_with_context(docx_path):
    """Extract every comment in a .docx together with the text it annotates.

    Comment bodies are read from ``word/comments.xml``; the annotated text is
    recovered from ``word/document.xml`` by collecting the ``w:t`` runs that
    lie between each ``w:commentRangeStart`` / ``w:commentRangeEnd`` pair.
    Overlapping and nested comment ranges are supported: a run of text is
    credited to every range that is open when it is encountered.

    Args:
        docx_path: Path to the .docx file (anything ``ZipFile`` accepts).

    Returns:
        A list of dicts, one per comment in the order they appear in
        ``comments.xml``, each with the keys ``"author"``, ``"date"``,
        ``"text"`` (the comment body) and ``"commented_text"`` (the annotated
        document text, ``""`` when the comment has no closed range).
        An empty list is returned when the archive has no comments part.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    ns_map = {"w": w_ns}
    id_attr = f"{{{w_ns}}}id"
    # Exact qualified names: a suffix test such as ``endswith("t")`` also
    # matches unrelated elements (w:highlight, w:instrText, w:delText, ...).
    text_tag = f"{{{w_ns}}}t"
    range_start_tag = f"{{{w_ns}}}commentRangeStart"
    range_end_tag = f"{{{w_ns}}}commentRangeEnd"

    comments = {}

    with ZipFile(docx_path) as docx:
        # 1. Load comments.xml (absent when the document has no comments)
        try:
            comments_xml = docx.read("word/comments.xml")
        except KeyError:
            return []
        comments_tree = etree.XML(comments_xml)

        for comment in comments_tree.findall(".//w:comment", namespaces=ns_map):
            cid = comment.get(id_attr)
            comments[cid] = {
                # Word writes these attributes in the w: namespace; fall back
                # to the bare name for files that omit the prefix.
                "author": comment.get(f"{{{w_ns}}}author", comment.get("author")),
                "date": comment.get(f"{{{w_ns}}}date", comment.get("date")),
                "text": "".join(comment.itertext()).strip(),
                "commented_text": "",
            }

        # 2. Load document.xml to find the commented text
        document_tree = etree.XML(docx.read("word/document.xml"))

        # Comment id -> text fragments collected so far, for every range that
        # has started but not yet ended.
        open_ranges = {}

        for elem in document_tree.iter():
            tag = elem.tag
            if tag == range_start_tag:
                open_ranges[elem.get(id_attr)] = []
            elif tag == range_end_tag:
                cid = elem.get(id_attr)
                fragments = open_ranges.pop(cid, None)
                if fragments is not None and cid in comments:
                    comments[cid]["commented_text"] = "".join(fragments).strip()
            elif tag == text_tag and open_ranges and elem.text:
                # ``elem.text`` is None for an empty <w:t/>; skipping it keeps
                # the final join from failing.
                for fragments in open_ranges.values():
                    fragments.append(elem.text)

    return list(comments.values())

# Example usage: extract the comments, save them as a Markdown table, and
# echo them to stdout as "''commented text'', code" lines.
comments = extract_comments_with_context(FILE_TO_READ)


def _md_cell(value):
    """Return *value* made safe for a single Markdown table cell.

    A literal pipe would end the cell early and a line break would end the
    row, so pipes are backslash-escaped and line breaks become spaces.
    """
    flattened = value.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    return flattened.replace("|", "\\|")


# Save to file as a Markdown table. The encoding is pinned to UTF-8 so that
# non-ASCII transcript text is written identically on every platform instead
# of depending on the locale's default encoding.
with open(FILE_TO_WRITE, "w", encoding="utf-8") as f:
    f.write("| Commented Text | Code |\n")
    f.write("|----------------|------|\n")
    f.writelines(
        f"| {_md_cell(c['commented_text'])} | {_md_cell(c['text'])} |\n"
        for c in comments
    )

print('text, code')
for c in comments:
    print(f"''{c['commented_text']}'', {c['text']}")

text, code
''I mostly write science fiction and fantasy stuff'', context: subject matter?
''I played a lot of music very seriously'', context: skills, background
''I depend on it now, but I'm really employed to teach'', context: (partial) source of income
''start from an interesting idea'', process: inspiration
''I try to do is spin that idea out. A character suggests themself or a scenario, a situation, and that builds outward'', process: drafting
''The novel's been hard, because I haven't felt it follow quite as straightforwardly as I feel like the short fiction sometimes does'', process: frictions
''It tends to take place largely within the confines of a document, like a Google doc or a Word document'', process: tools
''I try to keep most of it contained in a single document space'', process: structure?
''I'm often taking notes by hand when I'm stuck. Lately I've been planning scenes in a physical notebook--using all those prompts to figure out what could happen with this novel'', p