In [1]:
import ast
import json
import os
import re

import docx
import google.generativeai as genai

In [2]:
def read_docx(file_path):
    """Return the non-blank paragraphs of a .docx file as one string.

    Each paragraph's text is stripped of surrounding whitespace; paragraphs
    that are empty after stripping are omitted. The remaining paragraphs are
    joined with a single newline.
    """
    document = docx.Document(file_path)
    kept_lines = []
    for paragraph in document.paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            kept_lines.append(stripped)
    return "\n".join(kept_lines)

# Location of the writing-rules document, relative to the working directory.
rules_filename = "Academic Research Writing Dos and Don.docx"
file_path = os.path.join("rules", rules_filename)

# Load the rules text and echo it so the prompt input is visible in the notebook.
example_rules = read_docx(file_path)
print(example_rules)

Academic Research Writing Dos and Don’ts
Rules to Follow
General:
Do NOT use passive voice, pronouns, adverbs, or emotive language.
Never begin a sentence with an acronym.
Don’t use the same word twice in a sentence.
Do NOT use prepositional phrases {PP} (or passive words) to start/begin a sentence.
Do not write in first- or second person; always write in third person.
Do NOT begin sentences using the words : this, these, in, that, with, a, an, and, it, is, its, it is (both upper case and lower case applied)
Maximum sentence length is to be no longer than 22 words. Remove excessive words more than 22 words but still should have meaning
Spell out numbers less than 10 unless associated with the body of your work.
Don’t start sentence using a number.
description": "Use 'a' or 'an' only based on the following word's pronunciation (e.g., 'an ethical', 'a useful tool')",
"pattern": "\\b(a|an)\\b
ACADEMIC RESEARCH WRITING EXPECTATIONS (ARWE)
It includes both upper and lower case and combinati

In [3]:
# Read the API key from the environment instead of embedding it in the notebook:
# a key stored in the source is exposed to everyone who can read the file.
# NOTE(review): the key that was previously hardcoded here should be treated as
# leaked and revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=api_key)

# Temperature 0.0 asks the model for its least random output.
model = genai.GenerativeModel('gemini-1.5-flash-latest', generation_config={"temperature": 0.0})

In [4]:
# Step 1: ask the model to draft regex patterns (with descriptions) from the rules.
system_prompt = "You are a regex generator. Given writing rules or examples, output Python regex patterns along with their short descriptions to detect them."

user_prompt = f"""
Here are the writing rules or examples:

{example_rules}

Generate Python regex patterns and a short description for each, output as a Python list of dictionaries in the following format:
[
    {{"pattern": r'pattern_here', "description": "short description here"}},
    ...
]
"""

# Both prompt parts are sent as one string, separated by a blank line.
generation_request = "\n\n".join((system_prompt, user_prompt))

# This rebinds the module-level `model` to a different model for the drafting step.
model = genai.GenerativeModel('gemini-2.0-flash-lite')
response = model.generate_content(generation_request)

# Raw reply text; it is interpolated into the validation prompt below.
generated_pattern_list = response.text
# Step 2: ask a second model call to review and tighten the drafted patterns.
# The reply is parsed later as a Python list literal, so both prompts must ask
# for the bare list only. The user prompt previously ended with an invitation
# to "optionally explain" the changes, which contradicted the system prompt and
# encouraged trailing prose that the parser cannot handle; it has been removed.
validator_system_prompt = """
You are a regex validator and optimizer. Given a list of regex patterns and writing rules:
- Improve the regex patterns if needed.
- Ensure all patterns are case-insensitive by adding (?i) at the start of the pattern.
- Only return a valid Python list of dictionaries.
- Do not include explanations, code comments, or markdown formatting.
- Do not assign the list to any variable.

Return strictly this format:

[
    {"pattern": r'regex_here', "description": "short explanation"},
    ...
]
"""
validator_user_prompt = f"""
Writing Rules:

{example_rules}

Generated Regex Patterns:

{generated_pattern_list}

Please review and improve the patterns. Return the updated list in this format:

[
    {{"pattern": r'pattern_here', "description": "short description here"}},
    ...
]
"""

# Rebinds the module-level `model` for the validation step (temperature 0.0).
model = genai.GenerativeModel('gemini-1.5-flash-latest', generation_config={"temperature": 0.0})
response = model.generate_content(f"{validator_system_prompt}\n\n{validator_user_prompt}")

# Raw reply text; printed so the unparsed model output is visible in the notebook.
validated_pattern_list = response.text
print(validated_pattern_list)

```python
[
    {"pattern": r"(?i)\b(a|an)\b\s+\w+", "description": "Detects 'a' or 'an' followed by a word"},
    {"pattern": r"(?i)^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:\w+\s+){0,20}\w+", "description": "Checks if a sentence starts with an acronym"},
    {"pattern": r"(?i)\b(\w+)\s+\1\b", "description": "Detects repeated words within a sentence"},
    {"pattern": r"(?i)^(?:(?:(?:in|with|a|an|and|it|is|its|this|these|that)\s+)|(?:in|with|a|an|and|it|is|its|this|these|that)\b).*", "description": "Detects sentences starting with prohibited words"},
    {"pattern": r"(?i)^\s*(?:(?:this|these|in|that|with|a|an|and|it|is|its)\b|it is)\s+.*", "description": "Detects sentences starting with prohibited words"},
    {"pattern": r"(?i)\b(?:one|two|three|four|five|six|seven|eight|nine)\b", "description": "Detects numbers less than 10 (spelled out)"},
    {"pattern": r"(?i)^\d", "description": "Detects sentences starting with a number"},
    {"pattern": r"(?i)\b(Additionally|Many|Some|Better|Best|

In [5]:
# Extract the Python list literal embedded in the model reply.
#
# The reply may wrap the list in markdown fences or follow it with prose, so the
# list is located by its brackets. The regex patterns themselves contain "[" and
# "]" (character classes), so the widest span (first "[" to last "]") is tried
# first; if that does not parse, e.g. because trailing text contains a "]", the
# span is shortened to each earlier "]" in turn until one parses.
parsed_pattern_list = []

list_start = validated_pattern_list.find("[")
if list_start == -1:
    closing_positions = []
else:
    # Indices of every "]" after the opening bracket, rightmost first.
    closing_positions = [
        pos
        for pos in range(len(validated_pattern_list) - 1, list_start, -1)
        if validated_pattern_list[pos] == "]"
    ]

if not closing_positions:
    print(" No valid list found in generated content.")
else:
    first_error = None
    for list_end in closing_positions:
        clean_list_str = validated_pattern_list[list_start:list_end + 1]
        try:
            # literal_eval only evaluates literals, so model output is never executed.
            candidate = ast.literal_eval(clean_list_str)
        except Exception as e:
            # Keep the error from the widest span; it is the most informative one.
            if first_error is None:
                first_error = e
            continue

        # Only accept the shape later cells rely on: a list of dicts, each
        # carrying its regex as a string under "pattern".
        if isinstance(candidate, list) and all(
            isinstance(entry, dict) and isinstance(entry.get("pattern"), str)
            for entry in candidate
        ):
            parsed_pattern_list = candidate
            print(" Successfully parsed pattern list.")
            break

        if first_error is None:
            first_error = ValueError(
                "expected a list of dicts with a string 'pattern' entry"
            )
    else:
        # No candidate span produced a usable list.
        print(" Failed to parse pattern list:", first_error)

 Successfully parsed pattern list.


In [9]:
# Show each parsed entry followed by a blank line.
for pattern_entry in parsed_pattern_list:
    print(pattern_entry, end="\n\n")

{'pattern': '(?i)\\b(a|an)\\b\\s+\\w+', 'description': "Detects 'a' or 'an' followed by a word"}

{'pattern': '(?i)^[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*\\s+(?:\\w+\\s+){0,20}\\w+', 'description': 'Checks if a sentence starts with an acronym'}

{'pattern': '(?i)\\b(\\w+)\\s+\\1\\b', 'description': 'Detects repeated words within a sentence'}

{'pattern': '(?i)^(?:(?:(?:in|with|a|an|and|it|is|its|this|these|that)\\s+)|(?:in|with|a|an|and|it|is|its|this|these|that)\\b).*', 'description': 'Detects sentences starting with prohibited words'}

{'pattern': '(?i)^\\s*(?:(?:this|these|in|that|with|a|an|and|it|is|its)\\b|it is)\\s+.*', 'description': 'Detects sentences starting with prohibited words'}

{'pattern': '(?i)\\b(?:one|two|three|four|five|six|seven|eight|nine)\\b', 'description': 'Detects numbers less than 10 (spelled out)'}

{'pattern': '(?i)^\\d', 'description': 'Detects sentences starting with a number'}

{'pattern': '(?i)\\b(Additionally|Many|Some|Better|Best|Ah|May|Almost|Difficult|Might

In [10]:
# Serialize before opening the file: if an entry is not JSON-serializable the
# error is raised here, rather than after the output file has been truncated.
serialized_patterns = json.dumps(parsed_pattern_list, indent=2)

# An explicit encoding keeps the written file independent of the platform default.
with open("parsed_pattern_list.json", "w", encoding="utf-8") as f:
    f.write(serialized_patterns)