In [5]:
from docx import Document
import pandas as pd
import re
from pathlib import Path
import sys





In [6]:
# Make the project root importable so path_config can be loaded regardless of where
# this notebook lives. parents[2] is the directory three levels above the current
# working directory; adjust the depth if the notebook sits deeper or shallower.
project_root = str(Path.cwd().parents[2])
if project_root not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate sys.path entries
    sys.path.append(project_root)

try:
    from path_config import project_path, project_papers_path, project_data_exports_path
except ImportError as exc:
    # Chain the original error so the real cause (missing module vs. missing name) stays visible
    raise ImportError("Could not import path_config. Ensure path_config.py exists in the project root and sys.path is correctly set.") from exc

# Print paths for debugging
print(f'project_path: {project_path}')
print(f'project_papers_path: {project_papers_path}')
print(f'project_data_exports_path: {project_data_exports_path}')

project_path: /Users/chrisizenour/Library/CloudStorage/Dropbox/python/projects/football
project_papers_path: /Users/chrisizenour/Library/CloudStorage/Dropbox/python/projects/football/papers
project_data_exports_path: /Users/chrisizenour/Library/CloudStorage/Dropbox/python/projects/football/data/exports


In [7]:
# Caption text used when the caller does not supply table_captions. It keeps the
# caption the notebook originally emitted for its (single) table.
_DEFAULT_TABLE_CAPTIONS = ("Table 1: Summary of Cited Studies’ Characteristics and Limitations",)


def _paragraph_to_markdown(text, style_name):
    """Return the Markdown line for one non-empty paragraph, given its lower-cased style name."""
    heading = re.search(r'heading ([1-6])', style_name)
    if heading:
        # "heading N" -> N leading '#' characters (Markdown supports levels 1-6)
        return f"{'#' * int(heading.group(1))} {text}"
    return text


def _table_rows(table):
    """Return the table's cell texts as a list of rows, stripped and with '|' escaped."""
    return [
        # A literal '|' would be read as a column separator in a pipe table
        [cell.text.strip().replace("|", "\\|") for cell in row.cells]
        for row in table.rows
    ]


def docx_to_markdown(docx_path, output_md_path, table_captions=None):
    """
    Convert a .docx file to Markdown, handling headings, paragraphs, and tables.

    Paragraphs whose style name contains "heading 1" .. "heading 6" become Markdown
    headings of the matching level; every other non-empty paragraph is written as
    plain text. Each emitted paragraph is followed by a blank line.

    Tables are written after all paragraphs, in the order ``doc.tables`` yields them,
    not at their position in the document. The first row of each table is used as the
    header row. Tables without any rows (or with an empty first row) are skipped.

    Args:
        docx_path (Path or str): Path to the input .docx file.
        output_md_path (Path or str): Path to save the output .md file. Missing
            parent directories are created.
        table_captions (sequence of str, optional): Caption text for the rendered
            tables, in order; each is written in bold above its table. Defaults to
            the single original caption for the first table. Tables beyond the
            supplied captions are captioned "Table N".

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.

    Note:
        Table rendering uses ``DataFrame.to_markdown``, which needs the optional
        ``tabulate`` package.
    """
    # Convert paths to Path objects for robust handling
    docx_path = Path(docx_path)
    output_md_path = Path(output_md_path)

    # Check if input file exists
    if not docx_path.exists():
        raise FileNotFoundError(f"Input file not found: {docx_path}")

    # Ensure output directory exists
    output_md_path.parent.mkdir(parents=True, exist_ok=True)

    # Load the .docx file
    doc = Document(docx_path)

    # Accumulates output lines; joined with newlines when written
    markdown = []

    # Process each paragraph
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue

        # A style may have no name; treat that like a normal (non-heading) paragraph
        style_name = (para.style.name or "").lower()
        markdown.append(_paragraph_to_markdown(text, style_name))

        # Add a blank line after each paragraph for Markdown readability
        markdown.append("")

    # Process tables
    captions = _DEFAULT_TABLE_CAPTIONS if table_captions is None else list(table_captions)
    table_number = 0
    for table in doc.tables:
        rows = _table_rows(table)
        if not rows or not rows[0]:
            # Nothing to use as a header row
            continue

        # Only rendered tables are counted, so captions stay aligned with the output
        table_number += 1
        if table_number <= len(captions):
            caption = captions[table_number - 1]
        else:
            caption = f"Table {table_number}"

        # Convert to pandas DataFrame for easier Markdown table creation
        df = pd.DataFrame(rows[1:], columns=rows[0])

        markdown.append(f"**{caption}**")
        markdown.append("")
        markdown.append(df.to_markdown(index=False))
        markdown.append("")

    # Save to Markdown file
    output_md_path.write_text('\n'.join(markdown), encoding='utf-8')

    print(f"Markdown file saved to {output_md_path}")

In [8]:
# Example usage: convert the literature review stored in the project's papers folder
docx_filename = "nfl_salary_cap_literature_review_20250511.docx"
output_md_filename = "literature_review.md"

# Input document and Markdown export both live under project_papers_path (from path_config)
papers_dir = Path(project_papers_path)
docx_path = papers_dir / docx_filename
output_md_path = papers_dir / output_md_filename

# Run the conversion; any failure is reported as a one-line message rather than a traceback
try:
    docx_to_markdown(docx_path, output_md_path)
except Exception as e:
    print(f"Error during conversion: {e}")

Markdown file saved to /Users/chrisizenour/Library/CloudStorage/Dropbox/python/projects/football/papers/literature_review.md
