In [None]:
import google.generativeai as genai
import docx
import re
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from docx.oxml.ns import qn
from PIL import Image
from io import BytesIO
import os
import ast

In [None]:
def iter_block_items(doc):
    """Yield the body-level paragraphs and tables of ``doc`` in document order.

    Each child of ``doc.element.body`` whose tag ends in ``}p`` is wrapped in
    a ``Paragraph`` and each child whose tag ends in ``}tbl`` is wrapped in a
    ``Table``. Children with any other tag are skipped.

    Note: a later cell in this notebook defines ``iter_block_items`` again;
    once that cell has run, its definition replaces this one.
    """
    wrappers = (('}p', Paragraph), ('}tbl', Table))
    for child in doc.element.body:
        for suffix, wrap in wrappers:
            if child.tag.endswith(suffix):
                yield wrap(child, doc)
                break

In [None]:
def has_image(paragraph):
    """Return True if any element under ``paragraph`` has 'graphic' in its tag.

    The comparison is case-insensitive because the tag is lower-cased first.
    """
    for node in paragraph._element.iter():
        if 'graphic' in node.tag.lower():
            return True
    return False

def extract_images_from_paragraph(paragraph):
    """Collect the raw bytes of each image referenced inside ``paragraph``.

    Every descendant element whose tag ends in ``}blip`` is looked up through
    its ``r:embed`` relationship id in ``paragraph.part.related_parts`` and
    the related part's ``blob`` is collected, in document order.

    Returns:
        A list of ``blob`` values, one per blip element found.
    """
    blips = (node for node in paragraph._element.iter() if node.tag.endswith('}blip'))
    return [
        paragraph.part.related_parts[blip.get(qn('r:embed'))].blob
        for blip in blips
    ]

In [None]:
def iter_block_items(doc):
    """Yield every body-level block of ``doc`` in document order.

    The tag of each child of ``doc.element.body`` is lower-cased and then
    matched by suffix:

    * ``}p``      -> yielded wrapped in ``Paragraph``
    * ``}tbl``    -> yielded wrapped in ``Table``
    * ``}sectpr`` -> skipped
    * otherwise   -> the raw element itself is yielded
    """
    for child in doc.element.body:
        lowered = child.tag.lower()
        if lowered.endswith('}sectpr'):
            # Section properties are not reported.
            continue
        if lowered.endswith('}p'):
            yield Paragraph(child, doc)
        elif lowered.endswith('}tbl'):
            yield Table(child, doc)
        else:
            # Unrecognised block: hand the element back unchanged.
            yield child

def has_image(paragraph):
    """Tell whether ``paragraph`` contains an element whose tag mentions 'graphic'.

    Walks every element under ``paragraph._element`` and compares against the
    lower-cased tag, so the match is case-insensitive.
    """
    for node in paragraph._element.iter():
        if 'graphic' in node.tag.lower():
            return True
    return False

def extract_images_from_paragraph(paragraph):
    """Return the raw bytes of every embedded image referenced by ``paragraph``.

    Each descendant element whose tag ends in ``}blip`` is resolved through
    its ``r:embed`` relationship id in ``paragraph.part.related_parts``.

    Blips that cannot be resolved are skipped instead of raising ``KeyError``:
    a blip without an ``r:embed`` attribute (``get`` returns ``None``) or an
    id that is absent from ``related_parts``. Without this, one such picture
    would abort the caller's whole scan.

    Returns:
        A list with one ``blob`` per resolvable blip, in document order.
    """
    image_bytes_list = []
    for shape in paragraph._element.iter():
        if not shape.tag.endswith('}blip'):
            continue
        rId = shape.get(qn('r:embed'))
        try:
            # A missing attribute yields rId=None, which is also a KeyError here.
            image_part = paragraph.part.related_parts[rId]
        except KeyError:
            continue
        image_bytes_list.append(image_part.blob)
    return image_bytes_list

def contains_equation(paragraph):
    """Return True if any element under ``paragraph`` has a tag containing
    ``oMath`` or ``oMathPara`` (case-sensitive), otherwise False."""
    markers = ("oMath", "oMathPara")
    return any(
        marker in node.tag
        for node in paragraph._element.iter()
        for marker in markers
    )

def scan_document(path='documents/input.docx'):
    """Print a one-line summary of every body-level block in a .docx file.

    Paragraphs are reported as equation, heading (with level when the style
    name contains ``heading <n>``), caption, plain paragraph or empty
    paragraph, using at most the first 60 characters of their stripped text.
    Images found in a paragraph are opened with PIL and reported with a
    running number, format and size. Tables and unrecognised blocks are
    reported by kind.

    Args:
        path: Document to scan. Defaults to ``'documents/input.docx'``, the
            location that used to be hard-coded, so ``scan_document()`` keeps
            working unchanged.

    Side effects:
        Assigns the module-level ``input_path`` (even when the file is
        missing), because other cells of this notebook read it.

    Returns:
        None. If ``path`` does not exist a message is printed and the
        function returns early.
    """
    global input_path
    input_path = path
    if not os.path.exists(input_path):
        print(f"Input file not found: {input_path}")
        return

    doc = Document(input_path)
    print("Scanning Document...\n")

    image_counter = 1

    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            text = block.text.strip()[:60]
            # A style may have no name; treat that as an empty name rather
            # than crashing on None.lower().
            style = (block.style.name or '').lower()

            if contains_equation(block):
                print(f"Equation: {text}")
            elif 'heading' in style:
                match = re.search(r'heading (\d+)', style)
                if match:
                    print(f"Heading {match.group(1)}: {text}")
                else:
                    print(f"Heading (untyped): {text}")
            elif 'caption' in style:
                print(f"Caption: {text}")
            elif text:
                print(f"Paragraph: {text}")
            else:
                print("Empty Paragraph")

            if has_image(block):
                for img_bytes in extract_images_from_paragraph(block):
                    try:
                        img = Image.open(BytesIO(img_bytes))
                        print(f"Image {image_counter} — Format: {img.format}, Size: {img.size}")
                        image_counter += 1
                    except Exception as e:
                        # Best effort: an unreadable image is reported, not fatal.
                        print(f"Failed to load image: {e}")

        elif isinstance(block, Table):
            print("Table")

        else:
            raw_tag = block.tag if hasattr(block, 'tag') else str(type(block))
            print(f"Unknown Block — Tag: {raw_tag}")

In [None]:
# Run the scan. As a side effect this assigns the module-level `input_path`,
# which the paragraph-extraction cell reads, so this cell must run first.
scan_document()

In [None]:
def extract_paragraphs(file_path):
    """Return the non-empty text of the plain body paragraphs in a .docx file.

    Only paragraphs whose style name, lower-cased, is exactly ``normal`` or
    ``body text`` are considered. Their text is stripped and empty results
    are dropped.

    Args:
        file_path: Path handed to ``Document``.

    Returns:
        A list of stripped paragraph strings in document order.
    """
    plain_styles = ["normal", "body text"]
    stripped_texts = (
        para.text.strip()
        for para in Document(file_path).paragraphs
        if para.style.name.lower() in plain_styles
    )
    return [text for text in stripped_texts if text]

# Pull the plain paragraphs out of the document at `input_path`
# (assigned by scan_document()).
paragraph_list = extract_paragraphs(input_path)

# Echo every extracted paragraph with a 1-based index.
for number, body in enumerate(paragraph_list, start=1):
    print(f"Paragraph {number}: {body}")

In [None]:
def simple_sentence_split(text):
    """Break ``text`` into sentences with a single regular expression.

    A boundary is a run of whitespace that comes right after ``.``, ``?`` or
    ``!`` and right before an ASCII capital letter. The punctuation stays
    attached to the sentence it ends; the whitespace is discarded.

    Returns:
        A list of sentence strings (``['']`` for empty input).
    """
    boundary = re.compile(r'(?<=[\.\?\!])\s+(?=[A-Z])')
    return boundary.split(text)

# Requires `paragraph_list` (built by the paragraph-extraction cell) and
# `simple_sentence_split` to be defined already.
# `all_sentences` accumulates the sentences of every paragraph, in order.
all_sentences = []

for paragraph in paragraph_list:
    # NOTE: `sentences` is rebound on every iteration, so after the loop it
    # holds only the LAST paragraph's sentences (and is undefined if
    # `paragraph_list` is empty). Use `all_sentences` for the full list.
    sentences = simple_sentence_split(paragraph)
    all_sentences.extend(sentences)

print(f"Split into {len(all_sentences)} total sentences.\n")

# Print each sentence with a 1-based index, followed by a blank line.
for idx, sentence in enumerate(all_sentences, start=1):
    print(f"Sentence {idx}: {sentence}"+"\n")


In [None]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook, so the secret never lands in version control or shared copies.
# SECURITY: the key that was previously hard-coded in this cell has been
# exposed and should be revoked and replaced.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment that starts "
        "the notebook kernel before running this cell."
    )
genai.configure(api_key=GEMINI_API_KEY)

In [None]:
def read_docx(file_path):
    """Return the text of a .docx file as one newline-joined string.

    Each paragraph's text is stripped; paragraphs that are empty after
    stripping are left out.
    """
    stripped_lines = (para.text.strip() for para in docx.Document(file_path).paragraphs)
    return "\n".join(line for line in stripped_lines if line)

# Writing-rules document, given relative to the working directory; change the
# filename here if yours differs.
file_path = "Academic Research Writing Dos and Don.docx"

# Load the writing rules that the prompt cells are built from.
example_rules = read_docx(file_path)

# Show the complete extracted text (not a truncated preview) under a heading.
print("Extracted content:\n", example_rules, sep="\n")

In [None]:
# Two-step prompt chain:
#   1. ask the model to turn the writing rules into regex patterns;
#   2. ask it to review that list and return a cleaned-up version.
# Both steps reuse one model handle. The "system" text is simply prepended to
# the user text in a single prompt string.
model = genai.GenerativeModel('gemini-2.0-flash-lite')

# --- Step 1: generate regex patterns with descriptions ---
system_prompt = "You are a regex generator. Given writing rules or examples, output Python regex patterns along with their short descriptions to detect them."

user_prompt = f"""
Here are the writing rules or examples:

{example_rules}

Generate Python regex patterns and a short description for each, output as a Python list of dictionaries in the following format:
[
    {{"pattern": r'pattern_here', "description": "short description here"}},
    ...
]
"""

response = model.generate_content(f"{system_prompt}\n\n{user_prompt}")

generated_pattern_list = response.text

# --- Step 2: validate and optimize the generated patterns ---
validator_system_prompt = """
You are a regex validator and optimizer. Given a list of regex patterns and writing rules:
- Improve the regex patterns if needed.
- Ensure all patterns are case-insensitive by adding (?i) at the start of the pattern.
- Only return a valid Python list of dictionaries.
- Do not include explanations, code comments, or markdown formatting.
- Do not assign the list to any variable.

Return strictly this format:

[
    {"pattern": r'regex_here', "description": "short explanation"},
    ...
]
"""
# The user prompt no longer invites an optional explanation: that contradicted
# the "Do not include explanations" rule above, and any trailing prose that
# contains brackets breaks the list extraction done in the next cell.
validator_user_prompt = f"""
Writing Rules:

{example_rules}

Generated Regex Patterns:

{generated_pattern_list}

Please review and improve the patterns. Return the updated list in this format:

[
    {{"pattern": r'pattern_here', "description": "short description here"}},
    ...
]
"""

# Run Validator Prompt using Gemini
response = model.generate_content(f"{validator_system_prompt}\n\n{validator_user_prompt}")

validated_pattern_list = response.text
print("\n===  Validated & Improved Regex Pattern List (Gemini) ===\n")
print(validated_pattern_list)

In [None]:
# Turn the model's reply into `parsed_pattern_list`, a list of
# {"pattern": ..., "description": ...} dicts. The reply is untrusted text, so
# it is parsed with ast.literal_eval (literals only, no code execution) and
# the result is checked before later cells index into it.

# Extract the part between the first [ and last ]
list_match = re.search(r'(\[.*\])', validated_pattern_list, re.DOTALL)

parsed_pattern_list = []
if list_match:
    clean_list_str = list_match.group(1)
    try:
        candidate = ast.literal_eval(clean_list_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        print(" Failed to parse pattern list:", e)
    else:
        if isinstance(candidate, list):
            # Keep only entries that carry both keys the later cells read;
            # a malformed entry would otherwise raise KeyError/TypeError there.
            parsed_pattern_list = [
                entry for entry in candidate
                if isinstance(entry, dict) and 'pattern' in entry and 'description' in entry
            ]
            print(" Successfully parsed pattern list.")
            dropped = len(candidate) - len(parsed_pattern_list)
            if dropped:
                print(f" Ignored {dropped} malformed entries.")
        else:
            # e.g. "[...], [...]" evaluates to a tuple, not a list.
            print(" Failed to parse pattern list:", f"expected a list, got {type(candidate).__name__}")
else:
    print(" No valid list found in generated content.")

In [None]:
def analyze_sentence(sentence, pattern_list):
    """Ask the Gemini model which of the given patterns a sentence matches.

    Args:
        sentence: The sentence to analyse.
        pattern_list: Iterable of dicts with ``'description'`` and
            ``'pattern'`` keys; each becomes one rule line in the prompt.

    Returns:
        ``{"original": sentence, "analysis": text}``. ``text`` is whatever
        follows the first ``[Analysis]`` marker in the reply (stripped), or
        the whole stripped reply when the marker is absent. If the model call
        fails, the error is printed and ``text`` is
        ``"Error during LLM analysis."``.
    """
    template = """You are a professional text editor that follows strict formatting rules. Always:
1. Analyze the sentence for pattern matches using these rules:
{patterns_list}
2. List each pattern matched with example matches.
3. Specify required changes.

Respond in this exact structure:
[Analysis]
- List each pattern matched with example matches
- Specify required changes
"""

    rule_lines = (
        f"- {rule['description']} (Pattern: {rule['pattern']})" for rule in pattern_list
    )
    instructions = template.format(patterns_list="\n".join(rule_lines))
    prompt = f"{instructions}\n\nOriginal sentence:\n{sentence}\n"

    report = {"original": sentence}
    try:
        reply = genai.GenerativeModel('gemini-2.0-flash-lite').generate_content(prompt).text

        # Keep only the part after the [Analysis] marker when it is present.
        tagged = re.search(r'\[Analysis\](.*)', reply, re.DOTALL)
        report["analysis"] = (tagged.group(1) if tagged else reply).strip()
    except Exception as e:
        print(f"LLM error: {e}")
        report["analysis"] = "Error during LLM analysis."
    return report

# --- Main workflow ---
# Analyse every sentence once, keeping the reports in sentence order.
results = [analyze_sentence(text, parsed_pattern_list) for text in all_sentences]

# Show each sentence next to the analysis the model returned for it.
print("====Analysis for Each Sentence====")
for report in results:
    print(f"Original: {report['original']}")
    print(f"Analysis: {report['analysis']}\n")


In [None]:
def rewrite_sentence(sentence, pattern_list):
    """Ask the Gemini model to rewrite a sentence so it avoids the given patterns.

    Args:
        sentence: The sentence to rewrite.
        pattern_list: Iterable of dicts with ``'description'`` and
            ``'pattern'`` keys; each becomes one rule line in the prompt.

    Returns:
        A one-entry dict mapping the original sentence to the stripped model
        reply. If the model call fails, the error is printed and the sentence
        maps to itself.
    """
    template = """You are a professional text editor that follows strict formatting rules.
Rewrite the sentence to avoid ALL pattern matches according to these rules:
{patterns_list}
- Maintain original formatting where possible.
- Preserve the original meaning.
- validiate the rewritten output again with regex pattern

Respond with ONLY the rewritten sentence and nothing else."""

    rule_lines = (
        f"- {rule['description']} (Pattern: {rule['pattern']})" for rule in pattern_list
    )
    prompt = template.format(patterns_list="\n".join(rule_lines)) + f"\n\nOriginal sentence:\n{sentence}\n"

    try:
        reply = genai.GenerativeModel('gemini-2.0-flash-lite').generate_content(prompt)
        replacement = reply.text.strip()
    except Exception as e:
        print(f"LLM error: {e}")
        # Fall back to the unchanged sentence so callers always get a mapping.
        replacement = sentence
    return {sentence: replacement}

# --- Main workflow ---
# Rewrite every sentence of the document. This iterates `all_sentences`, the
# complete list; the name `sentences` is only a leftover loop variable from
# the splitting cell and holds just the last paragraph's sentences.
# `rewritten_sentences` maps each original sentence to its rewrite; identical
# sentences share one entry because the sentence text is the key.
rewritten_sentences = {}
for sentence in all_sentences:
    rewritten_sentences.update(rewrite_sentence(sentence, parsed_pattern_list))

print("====Original Text====")
for sentence in rewritten_sentences.keys():
    print(sentence)
print("====Rewritten sentences Text====")
for sentence in rewritten_sentences.values():
    print(sentence)