In [1]:
from docx import Document
import re
import time
import random
import json
import os
import zipfile
import shutil
from lxml import etree
from docx.oxml.shared import OxmlElement
from docx.oxml import ns
from docx.oxml.ns import qn

In [2]:
# Source document to analyse; it is copied and opened but never modified by the cells below.
input_path = "input/input.docx"
# Working copy of the source document; all paragraph removals operate on this copy.
temp_path = "temp.docx"
# Result file name. A stale copy is deleted at the start of the cleaning cell.
# NOTE(review): no cell visible in this notebook writes this file yet - confirm where it is saved.
output_path = "output_file.docx"

In [3]:
# Duplicate the input so the original file on disk stays untouched.
shutil.copy(input_path, temp_path)
# Pristine in-memory copy of the input.
# NOTE(review): `doc` is not referenced again in the cells visible here - confirm it is still needed.
doc = Document(input_path)
# Working document; the cleaning cells remove paragraphs from this object in place.
temp_doc = Document(temp_path)

In [4]:
# Matches a paragraph that consists only of a references heading:
# optional "list of", then "references" / "bibliography" / "works cited",
# then an optional colon (case-insensitive).
# NOTE(review): not referenced by any cell visible in this notebook;
# is_references_heading() uses its own, looser regex - confirm which one is intended.
REFERENCES_PATTERN = re.compile(
    r"^(list\s+of\s+)?(references|bibliography|works\s+cited)\s*:?\s*$", re.IGNORECASE
)

# Matches cover-page style fields ("Name: ...", "Course ...") - the field word must be
# followed by whitespace or a colon. Such lines are never treated as questions.
EXCLUDE_PREFIXES = re.compile(r"^(name|student name|date|course|section|professor|instructor)[\s:]", re.IGNORECASE)

# Words and phrases that can open a question or task instruction.
# Used both to build QUESTION_REGEX_PATTERN and for the keyword checks in is_question_line().
QUESTION_KEYWORDS = [
    "what", "why", "how", "when", "where", "which", "who", "whom", "whose", "is", "are", "can", "could", "should",
    "would", "will", "do", "does", "did", "explain", "define", "describe", "elaborate", "compare", "contrast",
    "discuss", "illustrate", "analyze", "interpret", "evaluate", "examine", "assess", "outline", "summarize",
    "state", "list", "enumerate", "identify", "classify", "relate", "demonstrate", "justify", "argue", "prove",
    "predict", "formulate", "construct", "design", "develop", "support", "review", "trace", "distinguish",
    "differentiate", "explore", "investigate", "highlight", "indicate", "determine", "specify", "reason",
    "elucidate", "delve", "clarify", "reveal", "uncover", "break down", "find", "guess", "present", "recommend",
    "suggest", "mention", "focus", "restate", "note", "point out", "consider", "reflect", "recall", "name",
    "show", "solve", "address", "assume", "depict", "respond", "resolve", "decide", "conclude", "extend",
    "simplify", "transform", "probe", "look into", "throw light on", "if", "how far", "to what extent",
    "in what ways", "examine how", "examine why", "analyze how", "assess whether", "critically evaluate",
    "debate whether", "explain whether", "why do you think", "what do you think", "is it possible", 
    "should we", "how would you", "can we say", "would you agree", "what are the reasons", 
    "how does", "why does", "what happens", "if you were", "can you explain", "why is it important",
    "do you believe", "why might", "what could", "what would happen", "how important", 
    "if so", "if not", "is that true", "how can", "what impact", "what role", "how much", 
    "why should", "should it be", "in your opinion", "what evidence", "how would", 
    "what kind", "which factors", "how effective", "how likely", "is there any", "does it make sense"
]

# Matches a line that starts with an optional item number ("1", "1.", "1)"),
# an optional "Q" label ("Q", "Q.", "Q:", "Q)"), then one of QUESTION_KEYWORDS
# ending at a word boundary and followed by at least one more character.
# Keywords are regex-escaped before being joined; matching is case-insensitive.
QUESTION_REGEX_PATTERN = re.compile(
    r"^(\d+[\).]?\s*)?((Q[\.:)]?)?\s*)?(" + "|".join(re.escape(k) for k in QUESTION_KEYWORDS) + r")\b.+", re.IGNORECASE
)

In [5]:
# Start from a clean slate: delete the output of a previous run, if any.
if os.path.exists(output_path):
    os.remove(output_path)

# Drop every paragraph whose text is empty or whitespace-only.
# The candidates are collected first so that removal never disturbs the scan.
# NOTE(review): a paragraph without any text (e.g. one that only holds a picture)
# is removed as well - confirm that this is intended.
blank_paragraphs = [p for p in temp_doc.paragraphs if not p.text.strip()]
for blank in blank_paragraphs:
    blank._element.getparent().remove(blank._element)

    
# Remove the reference list (its heading and everything after it) from the document.
def is_references_heading(text):
    """Return True when `text` begins like a references heading.

    After stripping surrounding whitespace, the text must start with an
    optional "list of" followed by "reference", "references", "ref" or
    "refs" as a whole word (case-insensitive). Anything may follow, so
    "Reference list" and "References (APA)" both count.
    """
    heading_re = re.compile(r'(?i)^\s*(list\s*of\s*)?(references?|refs?)\b.*$')
    return heading_re.match(text.strip()) is not None
# Find the first paragraph that looks like a references heading, then delete
# that paragraph and everything after it.
# The paragraph list is captured once: `temp_doc.paragraphs` builds a new list
# on every access, so re-reading it inside the removal loop was quadratic.
body_paragraphs = list(temp_doc.paragraphs)

cut_index = None
for i, para in enumerate(body_paragraphs):
    if is_references_heading(para.text):
        cut_index = i
        break

if cut_index is not None:
    # Remove from the end back to the heading, mirroring the original order.
    for para in reversed(body_paragraphs[cut_index:]):
        p = para._element
        p.getparent().remove(p)

# Keep only the answer paragraphs: question lines and everything above the first question are removed.
def is_question_line(para):
    """Heuristically decide whether a paragraph is a question/prompt line.

    The checks run in order and the first one that matches decides the result:
    exclusion rules come first (return False), inclusion rules follow
    (return True), and anything that matches no rule is not a question.

    Args:
        para: Paragraph-like object; only `.text` and `.runs` (each run's
            `.bold` and `.text`) are read.

    Returns:
        True when the paragraph looks like a question, False otherwise.
    """
    text = para.text.strip()
    lower_text = text.lower()
    words = text.split()

    #print(f"\n[CHECKING]: {text}")

    # --- Exclusion rules -------------------------------------------------
    # Fewer than three words can never be a question.
    if not text or len(words) < 3:
        #print("→ ❌ Excluded: too short or empty")
        return False

    # Cover-page fields such as "Name: ..." or "Course ...".
    if EXCLUDE_PREFIXES.match(text):
        #print("→ ❌ Excluded: matches EXCLUDE_PREFIXES")
        return False

    # Two or more capital letters, a hyphen or en dash, then digits (e.g. "CS-101").
    if re.match(r"^[A-Z]{2,}\s*[-–]\s*\d+", text):
        #print("→ ❌ Excluded: course code format")
        return False

    if lower_text.startswith(("writing assignment", "list of references")):
        #print("→ ❌ Excluded: writing assignment or references section")
        return False

    # A "Q1: References" style label is a heading, not a question.
    if re.match(r"^q\d+[:\.)]?\s*(references|list of references)$", lower_text):
        #print("→ ❌ Excluded: fake Q-label for references")
        return False

    # Purely alphabetic words written entirely in upper case.
    if all(w.isalpha() for w in words) and text.isupper():
        #print("→ ❌ Excluded: all uppercase heading")
        return False

    # Side headings labelled A-C (with or without parentheses) that end in a colon.
    if re.match(r"^\(?[A-Ca-c]\)?\s*[\w\s\-]{1,40}:\s*$", text):
        #print("→ ❌ Excluded: side heading like (A) Topic:")
        return False

    # Short capitalised heading that ends in a colon.
    if re.match(r"^[A-Z][\w\s\-]{2,40}:\s*$", text):
        #print("→ ❌ Excluded: short heading ending in colon")
        return False

    if lower_text.strip() in {"assignment", "worksheet", "homework", "references", "list of references", "bibliography"}:
        #print("→ ❌ Excluded: full paragraph is academic admin word")
        return False

    # --- Inclusion rules -------------------------------------------------
    # One or two digits, optional ")" or ".", then a common question word.
    if re.match(r"^\d{1,2}\s*[\).]?\s*(if|would|should|could|what|why|how|when|where|who|do|does|did|is|are|can|will)\b", lower_text):
        #print("→ ✅ Included: starts with number and question word")
        return True

    if text.endswith('?'):
        #print("→ ✅ Included: ends with question mark")
        return True

    if QUESTION_REGEX_PATTERN.match(text):
        #print("→ ✅ Included: matches QUESTION_REGEX_PATTERN")
        return True

    # Any list item ("1. ", "2) ", "- ", "* ") counts as a question line.
    if re.match(r"^(\d+[\).]|[-*])\s", text):
        #print("→ ✅ Included: starts with list-style numbering")
        return True

    # Keywords are joined unescaped here, unlike in QUESTION_REGEX_PATTERN.
    if re.match(r"^\d{1,2}\s*[\).]?\s*(%s)\b" % "|".join(QUESTION_KEYWORDS), lower_text):
        #print("→ ✅ Included: starts with numbered question keyword")
        return True

    # Bold text of six or more words that is not a colon heading, contains a
    # question keyword and carries no "(Author, 2020)" style citation.
    # NOTE(review): `kw in lower_text` is a substring test, so short keywords
    # such as "is" also match inside longer words like "this".
    if (
        any(run.bold for run in para.runs if run.text.strip()) and
        not text.endswith(':') and
        not re.match(r"^[A-Z][\w\s\-]{2,40}:\s*", text) and
        len(words) >= 6 and
        any(kw in lower_text for kw in QUESTION_KEYWORDS) and
        not re.search(r"\([\w\s&\-,]+,\s*\d{4}\)", text)  # contains citation? then it's answer
    ):
        #print("→ ✅ Included: bold with keyword, not subheading, no citation")
        return True

    #print("→ ❌ Not a question")
    return False

# True when at least one paragraph is classified as a question line.
question_exists = any(is_question_line(para) for para in temp_doc.paragraphs)

if question_exists:
    # Everything up to and including the first question is dropped; after
    # that point only further question lines are dropped, so answers remain.
    seen_first_question = False
    for para in list(temp_doc.paragraphs):  # snapshot: the body is edited while looping
        para_is_question = is_question_line(para)
        if para_is_question:
            seen_first_question = True
        if para_is_question or not seen_first_question:
            para._element.getparent().remove(para._element)


# Use the precomputed variable
if not question_exists:
    # No question lines were detected, so keep only paragraphs that look like answers.

    def is_valid_answer(text, min_words=11):
        """Return True when `text` looks like answer prose rather than a header line.

        Args:
            text: Paragraph text; surrounding whitespace is ignored.
            min_words: Minimum number of whitespace-separated words required.
                The default of 11 keeps the original "more than 10 words" rule.

        Returns:
            False for empty text, for text that is only a date
            (e.g. "12/25/2023" or "December 25, 2023") and for text with
            fewer than `min_words` words; True otherwise.
        """
        text = text.strip()

        if not text:
            return False

        # The date checks run before the length check. Previously they came
        # after it and could never fire, because a bare date has at most
        # three words and was already rejected as too short.
        if re.match(r'^\d{1,2}[-/]\d{1,2}[-/]\d{2,4}$', text):  # 12/25/2023
            return False
        if re.match(r'^[A-Za-z]+\s+\d{1,2},\s*\d{4}$', text):  # December 25, 2023
            return False

        return len(text.split()) >= min_words

    # Remove all non-answer lines (iterate over a snapshot while editing the body).
    for para in list(temp_doc.paragraphs):
        if not is_valid_answer(para.text):
            para._element.getparent().remove(para._element)

In [6]:
# Show what is left of the document, one paragraph per block with a blank line after each.
for para in temp_doc.paragraphs:
    paragraphs = para.text
    print(paragraphs, end="\n\n")

(A) Potential unethical:

Fred creates an ethical conflict by joining Phaust, risking misuse of Chemitoil's information from his earlier work. Phaust gives importance to profit over environmental protection, exploiting Mexico's less stringent regulations like no lining on ponds, causing toxic wastewater to harm resources and communities.

(B) Technical Problems:

The lack of safety testing on equipment from a new vendor caused catastrophic results. Using substandard controllers to cut costs due to the reduced budget (20% cut) caused technical problems, as the controller software had defects.

(C) Economic Problems:

Budget constraints and schedule pressure, to compete with Chemitoil, are the economic problems. Environmental pollution from untreated wastewater is an economic problem.

Fred, the chemical engineer, manages designing and approving the automated chemical processing plant for the new product. Wally, Fred’s supervisor, guides Fred throughout the process. Chuck, Vice president

In [7]:
# Abbreviations whose trailing period must not be treated as a sentence end.
abbreviations = [
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "St.",
    "vs.", "e.g.", "i.e.", "etc.", "Jan.", "Feb.", "Mar.", "Apr.",
    "Jun.", "Jul.", "Aug.", "Sep.", "Oct.", "Nov.", "Dec."
]
# Temporary stand-in for the periods inside abbreviations while splitting.
placeholder = "§§§"
# Split on whitespace that follows ., ? or ! and precedes a capital letter.
split_pattern = re.compile(r'(?<=[.?!])\s+(?=[A-Z])')


def _split_into_sentences(paragraph_text):
    """Split one non-empty paragraph into its stripped, non-empty sentences."""
    protected = paragraph_text
    for abbreviation in abbreviations:
        protected = protected.replace(abbreviation, abbreviation.replace('.', placeholder))

    restored = (piece.replace(placeholder, '.').strip() for piece in split_pattern.split(protected))
    return [piece for piece in restored if piece]


sentences = []
for para in temp_doc.paragraphs:
    paragraph_text = para.text.strip()
    if paragraph_text:
        sentences.extend(_split_into_sentences(paragraph_text))

for sentence in sentences:
    print(sentence, end="\n\n")

(A) Potential unethical:

Fred creates an ethical conflict by joining Phaust, risking misuse of Chemitoil's information from his earlier work.

Phaust gives importance to profit over environmental protection, exploiting Mexico's less stringent regulations like no lining on ponds, causing toxic wastewater to harm resources and communities.

(B) Technical Problems:

The lack of safety testing on equipment from a new vendor caused catastrophic results.

Using substandard controllers to cut costs due to the reduced budget (20% cut) caused technical problems, as the controller software had defects.

(C) Economic Problems:

Budget constraints and schedule pressure, to compete with Chemitoil, are the economic problems.

Environmental pollution from untreated wastewater is an economic problem.

Fred, the chemical engineer, manages designing and approving the automated chemical processing plant for the new product.

Wally, Fred’s supervisor, guides Fred throughout the process.

Chuck, Vice pres

In [8]:
import pickle

# Persist the split sentences so that other notebooks/scripts can load them.
with open('sentences.pkl', 'wb') as pickle_file:
    pickle.dump(sentences, pickle_file)

In [17]:
#!jupyter nbconvert --to script gemini_api_model.ipynb

In [18]:
#from gemini_api_model import parsed_pattern_list

In [19]:
import json

# Load the writing-rule list that analyze_sentence() turns into prompt lines.
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can fail on non-ASCII rule text, and the other JSON
# cells in this notebook already read and write UTF-8.
with open("parsed_pattern_list.json", "r", encoding="utf-8") as f:
    parsed_pattern_list = json.load(f)

In [20]:
# Dump the loaded rule list (dicts with 'pattern' and 'description' keys) for a quick visual check.
print(parsed_pattern_list)

[{'pattern': '(?i)\\b(a|an)\\b\\s+\\w+', 'description': "Detects 'a' or 'an' followed by a word"}, {'pattern': '(?i)^[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*\\s+(?:\\w+\\s+){0,20}\\w+', 'description': 'Checks if a sentence starts with an acronym'}, {'pattern': '(?i)\\b(\\w+)\\s+\\1\\b', 'description': 'Detects repeated words within a sentence'}, {'pattern': '(?i)^(?:(?:(?:in|with|a|an|and|it|is|its|this|these|that)\\s+)|(?:in|with|a|an|and|it|is|its|this|these|that)\\b).*', 'description': 'Detects sentences starting with prohibited words'}, {'pattern': '(?i)^\\s*(?:(?:this|these|in|that|with|a|an|and|it|is|its)\\b|it is)\\s+.*', 'description': 'Detects sentences starting with prohibited words'}, {'pattern': '(?i)\\b(?:one|two|three|four|five|six|seven|eight|nine)\\b', 'description': 'Detects numbers less than 10 (spelled out)'}, {'pattern': '(?i)^\\d', 'description': 'Detects sentences starting with a number'}, {'pattern': '(?i)\\b(Additionally|Many|Some|Better|Best|Ah|May|Almost|Difficult|Migh

In [21]:
import os

import google.generativeai as genai

# SECURITY: the API key is read from the environment instead of being stored
# in the notebook. The key that used to be hardcoded in this cell has been
# exposed to everyone with access to the file and should be revoked.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this notebook.")

genai.configure(api_key=_gemini_api_key)
# Temperature 0.0 is requested so that repeated runs give stable suggestions.
model = genai.GenerativeModel('gemini-1.5-flash-latest', generation_config={"temperature": 0.0})

In [22]:
def analyze_sentence(single_sentence, pattern_list):
    """Ask the LLM for rule-based edit suggestions for one sentence.

    Args:
        single_sentence: Sentence to check; it is also the key of the result.
        pattern_list: Iterable of dicts with 'description' and 'pattern' keys;
            each one becomes a bullet line in the prompt.

    Returns:
        A one-entry dict mapping the sentence to the stripped text that
        follows the "[Analysis]" marker in the reply (or to the whole stripped
        reply when the marker is missing). When anything in the request or
        parsing raises, the error is printed and the value is
        "Error during LLM analysis.".
    """
    prompt_template = """You are a professional academic writing assistant.

Analyze the sentence using these formatting rules:
{patterns_list}

Guidelines:
- Ignore headings or labels at the beginning (e.g., "(A) Ethical concern:").
- Use standard grammar rules (e.g., "an" before vowels).
- Suggest **up to 2 direct edits** only if rules are violated.
- Edits must be short actions (e.g., Remove "that", Replace "an" with "the").
- No explanations, rewording, or full rewrites.
- If no issues, return nothing.

Respond exactly like this:

[Analysis]
<edit1; edit2>  or leave blank
"""

    # One bullet per rule: human-readable description plus the raw pattern.
    rule_lines = []
    for rule in pattern_list:
        rule_lines.append(f"- {rule['description']} (Pattern: {rule['pattern']})")
    instructions = prompt_template.format(patterns_list="\n".join(rule_lines))
    full_prompt = f"{instructions}\n\nOriginal sentence:\n{single_sentence}\n"

    try:
        reply = model.generate_content(full_prompt).text

        # Keep only what follows the "[Analysis]" marker, if the model used it.
        marker = re.search(r'\[Analysis\](.*)', reply, re.DOTALL)
        if marker:
            suggestion = marker.group(1).strip()
        else:
            suggestion = reply.strip()

        return {single_sentence: suggestion}

    except Exception as e:
        print(f"LLM error: {e}")
        return {single_sentence: "Error during LLM analysis."}


In [None]:
# NOTE(review): `clean_sentences` is not defined by any cell visible in this
# notebook, so on a fresh kernel this cell failed with NameError. It is used
# when some other cell provides it; otherwise the `sentences` list built by
# the sentence-splitting cell is analysed instead.
try:
    sentences_to_analyze = clean_sentences
except NameError:
    sentences_to_analyze = sentences

MAX_SENTENCES = 10          # upper bound on LLM calls per run
REQUEST_DELAY_SECONDS = 5   # pause between calls - presumably for API rate limits; confirm

# Maps each analysed sentence to the suggested change ("" when there is none).
sentence_analysis_dict = {}

for count, sentence in enumerate(sentences_to_analyze):
    if count >= MAX_SENTENCES:
        break
    if count:
        # Wait between consecutive requests only; no pause after the last one.
        time.sleep(REQUEST_DELAY_SECONDS)

    # analyze_sentence() always returns a one-entry {sentence: change} dict.
    result = analyze_sentence(sentence, parsed_pattern_list)

    for sent, change in result.items():
        sentence_analysis_dict[sent] = change
        print(f"\nSentence: {sent.strip()}")
        print(f"Change: {change.strip() if change else 'No issues'}")


In [26]:
# Save the suggestions as readable UTF-8 JSON (non-ASCII characters kept as-is).
with open("sentence_analysis_dict.json", "w", encoding="utf-8") as json_file:
    json.dump(sentence_analysis_dict, json_file, ensure_ascii=False, indent=2)

In [21]:
import json

# Reload the saved suggestions; this replaces any in-memory `sentence_analysis_dict`.
with open("sentence_analysis_dict.json", "r", encoding="utf-8") as json_file:
    sentence_analysis_dict = json.load(json_file)

In [22]:
def print_sentence_analysis_table(analysis_dict, width=60):
    """Print a two-column text table of sentences and their suggested changes.

    Args:
        analysis_dict: Mapping of sentence -> suggested change; an empty or
            None change is shown as "No issues".
        width: Width of the sentence column; longer fragments are truncated.

    Each sentence is split on ". " into fragments. The first fragment shares
    its row with the change text; later fragments get a row of their own with
    an empty right-hand column. Blank fragments are skipped.
    """
    rule_length = width + 3 + 60  # sentence column + " | " + room for the change text
    row_with_change = "{:<{w}} | {}"
    row_continuation = "{:<{w}} |"

    print("\n" + row_with_change.format("Sentence", "Suggested Change", w=width))
    print("=" * rule_length)

    for sentence, change in analysis_dict.items():
        fragments = sentence.strip().replace("\n", " ").split(". ")
        suggestion = change.strip() if change else "No issues"

        for position, fragment in enumerate(fragments):
            fragment = fragment.strip()
            if fragment == "":
                continue
            clipped = fragment[:width]
            if position == 0:
                print(row_with_change.format(clipped, suggestion, w=width))
            else:
                print(row_continuation.format(clipped, w=width))
        print("-" * rule_length)

# Render the loaded suggestions as a table; each sentence fragment is cut to 70 characters.
print_sentence_analysis_table(sentence_analysis_dict, width=70)


Sentence                                                               | Suggested Change
Potential unethical: Fred creates an ethical conflict by joining Phaus | Remove "Potential"; Replace "an" with "a"
-------------------------------------------------------------------------------------------------------------------------------------
Phaust gives importance to profit over environmental protection, explo | Remove "less"; Replace "like" with "such as"
-------------------------------------------------------------------------------------------------------------------------------------
Technical Problems: The lack of safety testing on equipment from a new | Remove "a";
-------------------------------------------------------------------------------------------------------------------------------------
Using substandard controllers to cut costs due to the reduced budget ( | Remove "as"; Replace "the" with "a"
--------------------------------------------------------------------------------