In [None]:
from openai import OpenAI
import json
import os


client = OpenAI()

def openai_full_rubric_eval(text, architect_name, debug=False):
    """Ask the chat model to grade a submission against the six-category rubric.

    Note: this notebook defines a function with this name in more than one
    cell; when the cells run top to bottom, the last definition is the one
    in effect.

    Args:
        text: Text of the student's submission; sent verbatim as the user message.
        architect_name: Architect the submission is about; interpolated into the
            system prompt.
        debug: When True, print a progress line before calling the API.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string if the API response carries no text content.
    """
    if debug:
        print("🧠 ChatGPT evaluating extended rubric")

    # NOTE(review): the rubric list in the prompt says "Organization & Document Setup"
    # while the response template says "Organization & Doc Setup" — confirm which
    # label is intended before parsing replies by category name.
    system_prompt = f"""
You are an architecture professor grading a student's final submission about {architect_name}.
The submission includes:
- A 750-word biography of the architect
- A discussion and visual documentation of at least 10 buildings
- MLA-formatted citations
- Image captions with proper attribution
- A student bio and photo
- A visually polished and clearly structured document

Grade the submission across the following rubric categories, each out of 5:
1. Architect Selection & Scope
2. Organization & Document Setup
3. Image Citation & Attribution
4. Coverage of 10 Buildings
5. Student Bio & Photo
6. Presentation Polish

You must include and score **all 6 rubric categories** regardless of whether the content is present. 
If a section is missing or inadequate, explain briefly and assign a lower score accordingly.

Return your response in this format:
```
Here's a grading of the student's submission based on the provided rubric:

* **1. Architect Selection & Scope (X/5):** ...
* **2. Organization & Doc Setup (X/5):** ...
* **3. Image Citation & Attribution (X/5):** ...
* **4. Coverage of 10 Buildings (X/5):** ...
* **5. Student Bio & Photo (X/5):** ...
* **6. Presentation Polish (X/5):** ...

**Overall Comments:**
<paragraph summarizing the work>
```
Only provide the formatted text shown above. Do not omit any of the six rubric categories, even if evidence is missing.
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
    )

    # `message.content` is Optional in the OpenAI SDK (for example on a refusal);
    # fall back to "" rather than raising AttributeError on `None.strip()`.
    reply = response.choices[0].message.content
    return (reply or "").strip()


COGS 160 Auto-Grader Notebook for Architect Assignments

Imports

In [2]:
import re
import json
import fitz  
from PIL import Image
from io import BytesIO
from urllib.parse import urlparse
import spacy
import openai
from dotenv import load_dotenv
from IPython.display import display
nlp = spacy.load("en_core_web_sm")
load_dotenv()

False

Rubric

In [3]:
# Maximum points available for each rubric item, keyed by item name.
# The values below sum to 110.
rubric = {
    "architect_chosen": 5,
    "bio_750_words": 10,  # read by evaluate_biography (word-count score)
    "bio_structure": 10,  # read by evaluate_biography (section-coverage score)
    "bio_references": 10,  # read by evaluate_references
    "10_buildings_with_images": 15,
    "image_quality": 10,  # read by evaluate_image_quality
    "image_citations": 10,
    "personal_bio_photo": 5,
    "doc_and_slides": 5,
    "image_relevance": 10,  # read by evaluate_image_relevance
    "presentation_polish": 20,
}

 Extract text from PDF
 

In [4]:
# Path of the submission PDF that the RESULTS cell passes to run_autograder.
# NOTE(review): absolute, machine-specific path whose filename appears to contain
# a student name/ID — consider a configurable location before sharing the notebook.
pdf_path = "/Users/heather/Desktop/Work/XR Lab/A1 Submissions/davidmatthew_LATE_134808_14949557_COGS 160_ A1.pdf"

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
    """
    print(f"🔍 Extracting text from: {pdf_path}")
    # The context manager closes the document (and its file handle) even if
    # reading a page raises; the original left the document open.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)
    print("✔ Extracted text from PDF")
    return text

 Extract images from PDF

In [6]:

def extract_images_from_pdf(pdf_path, min_width=200, save_folder=None):
    """Save the images embedded in a PDF as PNG files and return their paths.

    Args:
        pdf_path: Path of the PDF file to scan.
        min_width: Images narrower than this many pixels are skipped.
        save_folder: Directory to write the PNG files into. When None, a folder
            named ``<pdf base name>_images`` next to the PDF is used. The
            directory is created if it does not exist.

    Returns:
        List of file paths of the saved images, in page order. Files are named
        ``page<N>_img<M>.png`` using 1-based page and per-page image numbers.
    """
    import os
    import fitz  # PyMuPDF

    image_data = []
    # Context manager guarantees the document is closed when extraction ends or fails.
    with fitz.open(pdf_path) as doc:
        # Use a new folder for saving images
        if save_folder is None:
            base_name = os.path.splitext(os.path.basename(pdf_path))[0]
            save_folder = os.path.join(os.path.dirname(pdf_path), f"{base_name}_images")

        # exist_ok avoids the check-then-create race of `exists()` + `makedirs()`.
        os.makedirs(save_folder, exist_ok=True)

        for page_index, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                if pix.width < min_width:
                    continue
                img_path = os.path.join(save_folder, f"page{page_index+1}_img{img_index+1}.png")
                # Subtract the alpha channel before testing the component count:
                # a CMYK pixmap without alpha has n == 4 and cannot be written as
                # PNG, so anything with 4+ colour components is converted to RGB.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save(img_path)
                image_data.append(img_path)

    return image_data


Evaluate biography structure & word count

In [7]:
def evaluate_biography(text):
    """Score a text on word count and on presence of required section phrases.

    Args:
        text: Text to evaluate; it is tokenised with the module-level spaCy
            pipeline ``nlp``.

    Returns:
        Dict with:
          - "word_count": number of alphabetic tokens found by spaCy.
          - "structure_score": share of the required phrases found in the text
            (case-insensitive substring match), scaled to rubric["bio_structure"]
            and truncated to an int.
          - "score": rubric["bio_750_words"] when word_count is at least 700,
            otherwise word_count / 750 scaled to that maximum and truncated.

    NOTE(review): full marks start at 700 words while partial credit is scaled
    against 750 — confirm the tolerance is intentional.
    """
    print(" Evaluating biography: checking word count and required sections")

    parsed = nlp(text)
    word_count = sum(1 for token in parsed if token.is_alpha)

    required_sections = ["who they are", "studied", "first building", "significance", "influence"]
    lowered_text = text.lower()
    section_hits = 0
    for phrase in required_sections:
        if phrase.lower() in lowered_text:
            section_hits += 1
    structure_score = int((section_hits / len(required_sections)) * rubric["bio_structure"])

    if word_count >= 700:
        length_score = rubric["bio_750_words"]
    else:
        length_score = int((word_count / 750) * rubric["bio_750_words"])

    return {
        "word_count": word_count,
        "structure_score": structure_score,
        "score": length_score,
    }

Bio Evaluation

In [8]:
def openai_bio_score(text, architect_name, debug=False):
    """Grade a biography with the chat model, then ask it to reconsider the grade.

    Two API calls are made. The first grades ``text`` against the criteria in
    the grading prompt. The second continues the same conversation (criteria,
    biography, and first feedback) and asks whether the scoring was too harsh.

    Args:
        text: Biography text to grade; sent as the user message.
        architect_name: Architect the biography is about; interpolated into the
            grading prompt.
        debug: When True, print progress and both rounds of feedback.

    Returns:
        The reconsidered feedback text with surrounding whitespace stripped
        (empty string if the API response carries no text content).
    """
    if debug:
        print(f"🧠 Sending biography text to ChatGPT for evaluation of {architect_name}")

    # Initial grading prompt
    prompt = f"""
You are grading a student's biography of the architect {architect_name}.
Evaluate:
- Who they are
- What they’re famous for
- Where they studied
- Significance in architecture
- Influence of buildings
- Types of buildings
- First building attributed
Give a score out of 10 and a 1-paragraph feedback.
"""

    # First GPT evaluation
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
    )

    # `message.content` is Optional in the OpenAI SDK; avoid `None.strip()`.
    first_feedback = (response.choices[0].message.content or "").strip()

    if debug:
        print("💬 Initial GPT Feedback:\n", first_feedback)

    # Reconsideration prompt
    retry_prompt = """
Was this scoring too harsh? Re-evaluate the student’s biography with more weight on effort and alignment with the assignment instructions. 
Still provide a score out of 10 and a paragraph explanation.
"""

    # GPT reconsideration. The full conversation is replayed so the model can
    # actually see the biography and the grading criteria; the original call
    # sent only the first feedback, leaving nothing to re-evaluate.
    second_response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
            {"role": "assistant", "content": first_feedback},
            {"role": "user", "content": retry_prompt},
        ],
    )

    reconsidered_feedback = (second_response.choices[0].message.content or "").strip()

    if debug:
        print("✅ Reconsidered GPT Feedback:\n", reconsidered_feedback)

    return reconsidered_feedback

Extract references from text

In [9]:
def extract_references_from_text(text):
    """Collect lines of ``text`` that look like reference entries.

    A line is kept when it contains a four-digit number in parentheses
    (e.g. "(2020)") and, case-insensitively, at least one of the markers
    "doi", "archdaily", "e-architect" or "https://".

    Args:
        text: Text to scan; it is split on newline characters.

    Returns:
        List of matching lines with surrounding whitespace stripped, in their
        original order.
    """
    print("🔍 Extracting references from text")
    source_markers = ("doi", "archdaily", "e-architect", "https://")

    def _looks_like_reference(candidate):
        # Require the parenthesised year first, then any one source marker.
        if not re.search(r"\(\d{4}\)", candidate):
            return False
        lowered = candidate.lower()
        return any(marker in lowered for marker in source_markers)

    return [entry.strip() for entry in text.split("\n") if _looks_like_reference(entry)]

Score references

In [10]:
def evaluate_references(ref_list):
    """Score a list of extracted references by asking the chat model.

    Args:
        ref_list: Reference strings, e.g. as returned by
            extract_references_from_text.

    Returns:
        Dict with "valid_references" (the number of entries in ``ref_list``) and
        "score". An empty or missing list scores 0 without calling the API.
        Otherwise the score is the first "N/10" value found in the model's
        reply, or the number of references when no such value is present; in
        both cases it is capped at rubric["bio_references"].
    """
    print(" Evaluating references")
    if not ref_list:
        return {"valid_references": 0, "score": 0}

    joined_refs = "\n".join(ref_list)
    # NOTE(review): this prompt grades against APA style, while the rubric prompt in
    # openai_full_rubric_eval describes MLA-formatted citations — confirm which applies.
    prompt = f"""
You are an academic writing assistant.
Below is a list of references extracted from a student's architecture assignment:

{joined_refs}

Evaluate the overall quality of these references based on the following:
- Are they properly formatted in APA style?
- Are they from credible sources (e.g., books, peer-reviewed journals, respected architecture websites)?
- Are there enough academic references (minimum of 5 is ideal)?

Give a score out of 10 for reference quality, and provide a short justification.
"""
    # Use the same chat-completions client as the other evaluators. The previous
    # `openai.ChatCompletion.create([prompt])` / `response.text` combination does
    # not match the API of the SDK version this notebook uses elsewhere.
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
    )
    # `message.content` is Optional in the OpenAI SDK; avoid `None.strip()`.
    feedback = (response.choices[0].message.content or "").strip()
    print("📚 Openai Reference Evaluation:\n", feedback)

    score_match = re.search(r"(\d{1,2})/10", feedback)
    raw_score = int(score_match.group(1)) if score_match else len(ref_list)
    # Cap at the rubric maximum so a malformed reply cannot exceed it.
    score = min(raw_score, rubric["bio_references"])
    return {"valid_references": len(ref_list), "score": score}

Score image resolution

In [11]:
def evaluate_image_quality(image_data):
    """Score a set of image files by the share that are high resolution.

    An image counts as high resolution when both its width and its height are
    at least 1000 pixels. Files that cannot be opened are reported and do not
    count as high resolution, but still count towards the total.

    Args:
        image_data: Iterable of image file paths that PIL can open.

    Returns:
        Dict with "high_res_count" and "score", where score is the high-res
        share scaled to rubric["image_quality"] and truncated to an int
        (0 when ``image_data`` is empty).
    """
    print("🔍 Evaluating image resolution")
    min_side = 1000  # arbitrary high-res threshold
    high_res_count = 0

    for img_path in image_data:
        try:
            with Image.open(img_path) as picture:
                pixel_width, pixel_height = picture.size
                if min(pixel_width, pixel_height) >= min_side:
                    high_res_count += 1
        except Exception as e:
            print(f"⚠️ Error loading image {img_path}: {e}")

    # An empty input would divide by zero; treat the total as 1 in that case.
    total_images = len(image_data) or 1
    score = int((high_res_count / total_images) * rubric["image_quality"])
    return {"high_res_count": high_res_count, "score": score}


OpenAI: score image relevance

In [12]:
def evaluate_image_relevance(image_data, architect_name, debug=False, model="gpt-4o"):
    """Score how relevant each extracted image is to the architect, via a vision model.

    Each image file is base64-encoded and sent with the grading prompt to the
    chat-completions API. The first "N/10" value in the reply is used as that
    image's score (capped at 10). If the call fails or no score can be parsed,
    the image gets a neutral 5 and, for failures, a warning is printed.

    Args:
        image_data: Iterable of image file paths (run_autograder passes the
            paths returned by extract_images_from_pdf).
        architect_name: Architect the project is about; interpolated into the prompt.
        debug: When True, print the model's feedback for every image.
        model: Name of the chat model to use; it must accept image input.

    Returns:
        Dict with "avg_score" (mean per-image score out of 10, 0 when there are
        no images) and "score" (that mean scaled to rubric["image_relevance"]
        and truncated to an int).
    """
    import base64
    import mimetypes

    print("🔍 Evaluating image relevance using Openai")

    # The prompt does not depend on the individual image, so build it once.
    prompt = f"""
You are evaluating whether this image is relevant to a project on the architect {architect_name}.
1. Does this image depict a building by {architect_name}? If yes, say which building if you can.
2. Is this an interior or exterior shot?
3. Is this a high-quality academic image that clearly shows architectural features (composition, lighting, layout)?
Give a score out of 10 for academic relevance with a brief justification.
"""

    relevance_scores = []
    for img_path in image_data:
        try:
            with open(img_path, "rb") as image_file:
                encoded = base64.b64encode(image_file.read()).decode("ascii")
            mime_type = mimetypes.guess_type(str(img_path))[0] or "image/png"

            response = client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
                            },
                        ],
                    }
                ],
            )
            # `message.content` is Optional in the OpenAI SDK; avoid `None.strip()`.
            feedback = (response.choices[0].message.content or "").strip()
            if debug:
                print(f"📷 Openai Vision Feedback ({img_path}):", feedback)
            match = re.search(r"(\d{1,2})/10", feedback)
            score = min(int(match.group(1)), 10) if match else 5
        except Exception as exc:
            # Keep the best-effort fallback, but report the failure: the previous
            # bare `except:` hid the fact that every image was scored 5.
            print(f"⚠️ Could not score image {img_path}: {exc}")
            score = 5
        relevance_scores.append(score)

    avg_score = sum(relevance_scores) / max(1, len(relevance_scores))
    return {"avg_score": avg_score, "score": int((avg_score / 10) * rubric["image_relevance"])}

OpenAI: score remaining rubric items

In [13]:

from openai import OpenAI
import json

client = OpenAI()

def openai_full_rubric_eval(text, architect_name, debug=False):
    """Ask the chat model to grade a submission against the six-category rubric.

    Note: this notebook defines a function with this name in more than one
    cell; when the cells run top to bottom, the last definition is the one
    in effect.

    Args:
        text: Text of the student's submission; sent verbatim as the user message.
        architect_name: Architect the submission is about; interpolated into the
            system prompt.
        debug: When True, print a progress line before calling the API.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string if the API response carries no text content.
    """
    if debug:
        print("🧠 ChatGPT evaluating extended rubric")

    # NOTE(review): the rubric list in the prompt says "Organization & Document Setup"
    # while the response template says "Organization & Doc Setup" — confirm which
    # label is intended before parsing replies by category name.
    system_prompt = f"""
You are an architecture professor grading a student's final submission about {architect_name}.
The submission includes:
- A 750-word biography of the architect
- A discussion and visual documentation of at least 10 buildings
- MLA-formatted citations
- Image captions with proper attribution
- A student bio and photo
- A visually polished and clearly structured document

Grade the submission across the following rubric categories, each out of 5:
1. Architect Selection & Scope
2. Organization & Document Setup
3. Image Citation & Attribution
4. Coverage of 10 Buildings
5. Student Bio & Photo
6. Presentation Polish

You must include and score **all 6 rubric categories** regardless of whether the content is present. 
If a section is missing or inadequate, explain briefly and assign a lower score accordingly.

Return your response in this format:
```
Here's a grading of the student's submission based on the provided rubric:

* **1. Architect Selection & Scope (X/5):** ...
* **2. Organization & Doc Setup (X/5):** ...
* **3. Image Citation & Attribution (X/5):** ...
* **4. Coverage of 10 Buildings (X/5):** ...
* **5. Student Bio & Photo (X/5):** ...
* **6. Presentation Polish (X/5):** ...

**Overall Comments:**
<paragraph summarizing the work>
```
Only provide the formatted text shown above. Do not omit any of the six rubric categories, even if evidence is missing.
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
    )

    # `message.content` is Optional in the OpenAI SDK (for example on a refusal);
    # fall back to "" rather than raising AttributeError on `None.strip()`.
    reply = response.choices[0].message.content
    return (reply or "").strip()


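Using chain of thought (TODO: the cell below currently repeats the rubric evaluator from the previous cell; no chain-of-thought prompting has been added yet)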

In [14]:

from openai import OpenAI
import json

client = OpenAI()

def openai_full_rubric_eval(text, architect_name, debug=False):
    """Ask the chat model to grade a submission against the six-category rubric.

    Note: this notebook defines a function with this name in more than one
    cell; when the cells run top to bottom, the last definition is the one
    in effect.

    Args:
        text: Text of the student's submission; sent verbatim as the user message.
        architect_name: Architect the submission is about; interpolated into the
            system prompt.
        debug: When True, print a progress line before calling the API.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string if the API response carries no text content.
    """
    if debug:
        print("🧠 ChatGPT evaluating extended rubric")

    # NOTE(review): the rubric list in the prompt says "Organization & Document Setup"
    # while the response template says "Organization & Doc Setup" — confirm which
    # label is intended before parsing replies by category name.
    system_prompt = f"""
You are an architecture professor grading a student's final submission about {architect_name}.
The submission includes:
- A 750-word biography of the architect
- A discussion and visual documentation of at least 10 buildings
- MLA-formatted citations
- Image captions with proper attribution
- A student bio and photo
- A visually polished and clearly structured document

Grade the submission across the following rubric categories, each out of 5:
1. Architect Selection & Scope
2. Organization & Document Setup
3. Image Citation & Attribution
4. Coverage of 10 Buildings
5. Student Bio & Photo
6. Presentation Polish

You must include and score **all 6 rubric categories** regardless of whether the content is present. 
If a section is missing or inadequate, explain briefly and assign a lower score accordingly.

Return your response in this format:
```
Here's a grading of the student's submission based on the provided rubric:

* **1. Architect Selection & Scope (X/5):** ...
* **2. Organization & Doc Setup (X/5):** ...
* **3. Image Citation & Attribution (X/5):** ...
* **4. Coverage of 10 Buildings (X/5):** ...
* **5. Student Bio & Photo (X/5):** ...
* **6. Presentation Polish (X/5):** ...

**Overall Comments:**
<paragraph summarizing the work>
```
Only provide the formatted text shown above. Do not omit any of the six rubric categories, even if evidence is missing.
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
    )

    # `message.content` is Optional in the OpenAI SDK (for example on a refusal);
    # fall back to "" rather than raising AttributeError on `None.strip()`.
    reply = response.choices[0].message.content
    return (reply or "").strip()


Score aggregation

In [15]:
def generate_scorecard(scores):
    """Summarise per-item results into a scorecard with a total and a letter grade.

    Args:
        scores: Mapping of rubric item name to a dict that has a "score" entry.

    Returns:
        Dict with:
          - "scorecard": item name -> its "score" value.
          - "final_score": sum of all "score" values.
          - "grade": "A" for a total of at least 90, "B" for at least 80,
            "C" for at least 70, otherwise "D".
          - "details": the ``scores`` mapping that was passed in.
    """
    print(" Generating scorecard")

    per_item = {}
    final_score = 0
    for item_name, item_result in scores.items():
        points = item_result["score"]
        per_item[item_name] = points
        final_score += points

    # Walk the cut-offs from highest to lowest; anything below the last one is a "D".
    grade = "D"
    for cutoff, letter in ((90, "A"), (80, "B"), (70, "C")):
        if final_score >= cutoff:
            grade = letter
            break

    return {
        "scorecard": per_item,
        "final_score": final_score,
        "grade": grade,
        "details": scores,
    }

Main pipeline

In [16]:
def run_autograder(pdf_path, architect_name):
    """Run the grading pipeline on one submission PDF.

    Extracts text, images and references from the PDF, computes the automatic
    scores, and prints the model's rubric feedback and biography feedback.

    Args:
        pdf_path: Path of the submission PDF.
        architect_name: Architect the submission is about; passed to the
            model-based evaluators.

    Returns:
        Dict keyed by rubric item ("bio_750_words", "bio_structure",
        "bio_references", "image_quality", "image_relevance"), each value a
        dict containing at least a "score" entry. The rubric and biography
        feedback texts are printed only, not returned.
    """
    print(" Starting pipeline")
    doc_text = extract_text_from_pdf(pdf_path)
    images = extract_images_from_pdf(pdf_path)
    references = extract_references_from_text(doc_text)

    # Evaluate the biography once and reuse the result for both rubric items;
    # calling it per item ran the spaCy pipeline twice over the same text.
    bio_eval = evaluate_biography(doc_text)

    scores = {
        "bio_750_words": {"score": bio_eval["score"]},
        "bio_structure": {"score": bio_eval["structure_score"]},
        "bio_references": evaluate_references(references),
        "image_quality": evaluate_image_quality(images),
        "image_relevance": evaluate_image_relevance(images, architect_name)
    }

    rubric_feedback = openai_full_rubric_eval(doc_text, architect_name)
    print("\n📋 GPT Rubric Feedback:\n", rubric_feedback)

    bio_feedback = openai_bio_score(doc_text, architect_name)
    print("\n🧠 GPT Bio Score:\n", bio_feedback)

    print(" Evaluation complete.")
    return scores


RESULTS

In [17]:
# Run the full pipeline on the configured PDF for the architect "Kazuyo Sejima".
result = run_autograder(pdf_path, "Kazuyo Sejima")  
# Pretty-print the returned per-item scores as JSON.
print(json.dumps(result, indent=2))

 Starting pipeline
🔍 Extracting text from: /Users/heather/Desktop/Work/XR Lab/A1 Submissions/davidmatthew_LATE_134808_14949557_COGS 160_ A1.pdf
✔ Extracted text from PDF
🔍 Extracting references from text
 Evaluating biography: checking word count and required sections
 Evaluating biography: checking word count and required sections
 Evaluating references
🔍 Evaluating image resolution
🔍 Evaluating image relevance using Openai

📋 GPT Rubric Feedback:
 Here's a grading of the student's submission based on the provided rubric:

* **1. Architect Selection & Scope (5/5):** Good choice of architect, with a clear focus on her notable works. The report covers the full range of Sejima's work from small residential models to large public projects. All aspects of her career are explored, including her partnership with Ryue Nishizawa and the establishment of SANAA.
  
* **2. Organization & Document Setup (5/5):** The report is very well-organized, with a rational structure that includes biography, 