# Notebook Setup

In [None]:
%pip install -q -U google-genai
%pip install PyPDF2==3.0.1
%pip install rdflib
%pip install openai
%pip install pandas
%pip install asyncio
%pip install nest_asyncio
%pip install toml
%pip install tabulate
%pip install "C:\Users\lucas\Developer\plantuml-ontology-validator"

In [None]:
from google import genai
from google.genai import types
from google.auth import default
from openai import OpenAI
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
import os
import toml
import json
import base64
import requests
import PyPDF2
import pathlib
import pandas
import asyncio
import nest_asyncio 
from ontoProbeLib.ontoProbe import validate

########################################################################################################################
#                                                  PDF Handling
########################################################################################################################

def split_pdf(pdf_path, num_chunks):
  """Splits a PDF into consecutive chunks and saves them as new files.

  Chunk files are written to "/content/<pdf name>/<pdf name>_chunk_<i>.pdf".
  Pages are spread as evenly as possible: when the page count is not a
  multiple of num_chunks, the first chunks receive one extra page each, so no
  trailing page is left out.

  Args:
    pdf_path: Path of the PDF file to split.
    num_chunks: Number of chunk files to write; must be a positive integer.

  Raises:
    ValueError: If num_chunks is not positive.
  """
  if num_chunks <= 0:
    raise ValueError(f"num_chunks must be a positive integer, got {num_chunks}")

  base_filename = os.path.splitext(os.path.basename(pdf_path))[0]
  folder_path = "/content/" + base_filename
  os.makedirs(folder_path, exist_ok=True)

  with open(pdf_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    # extra_pages is the remainder that plain floor division would discard.
    base_size, extra_pages = divmod(num_pages, num_chunks)

    start_page = 0
    for i in range(num_chunks):
      end_page = start_page + base_size + (1 if i < extra_pages else 0)

      pdf_writer = PyPDF2.PdfWriter()
      for page_num in range(start_page, end_page):
        pdf_writer.add_page(pdf_reader.pages[page_num])

      chunk_filename = os.path.join(folder_path, f"{base_filename}_chunk_{i + 1}.pdf")
      with open(chunk_filename, 'wb') as chunk_file:
        pdf_writer.write(chunk_file)

      start_page = end_page


def read_pdf_bytes(pdf_path):
  """Returns the raw content of the file at pdf_path as a bytes object."""
  with open(pdf_path, 'rb') as pdf_file:
    return pdf_file.read()


def pdf_to_base64(pdf_path):
  """Returns the content of a PDF file encoded as a base64 text string."""
  raw_bytes = read_pdf_bytes(pdf_path)
  encoded_bytes = base64.b64encode(raw_bytes)
  return encoded_bytes.decode('utf-8')


def read_pdf_text(pdf_path):
  """Extracts the text of every page of a PDF file.

  Returns the concatenated page texts, or None (after printing an error
  message) when the file does not exist or the extraction fails.
  """
  try:
    with open(pdf_path, 'rb') as pdf_file:
      pages = PyPDF2.PdfReader(pdf_file).pages
      page_texts = [pages[index].extract_text() for index in range(len(pages))]
      extracted = "".join(page_texts)
  except FileNotFoundError:
    print(f"Error: PDF file not found at {pdf_path}")
    return None
  except Exception as e:
    print(f"Error extracting text from PDF: {e}")
    return None
  return extracted

########################################################################################################################
#                                                   Ontology Handling
########################################################################################################################

def get_ontology_classes(rdf_data, format='ttl'):
  """Returns the set of all classes declared in the ontology.

  Only subjects explicitly typed as rdfs:Class are collected.

  Args:
    rdf_data: Serialized ontology text.
    format: rdflib parser format of rdf_data (Turtle by default).

  Returns:
    A set with the class nodes; empty when the ontology cannot be parsed
    (the error is printed instead of raised).
  """
  classes = set()
  try:
    g = Graph().parse(data=rdf_data, format=format)
    for s, p, o in g.triples((None, RDF.type, RDFS.Class)):
      classes.add(s)
  except Exception as e:
    # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    print(f"Error parsing ontology: {e}")
  return classes


def get_ontology_properties(rdf_data, format='ttl'):
  """Returns the set of all properties declared in the ontology.

  Only subjects explicitly typed as rdf:Property are collected.

  Args:
    rdf_data: Serialized ontology text.
    format: rdflib parser format of rdf_data (Turtle by default).

  Returns:
    A set with the property nodes; empty when the ontology cannot be parsed
    (the error is printed instead of raised).
  """
  properties = set()
  try:
    g = Graph().parse(data=rdf_data, format=format)
    for s, p, o in g.triples((None, RDF.type, RDF.Property)):
      properties.add(s)
  except Exception as e:
    # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    print(f"Error parsing ontology: {e}")
  return properties


def get_ontology_instances(rdf_data, format='ttl'):
  """Returns the set of all instances in the ontology.

  An instance is any subject whose rdf:type is one of the classes declared in
  the same ontology (subjects typed as rdfs:Class).

  Args:
    rdf_data: Serialized ontology text.
    format: rdflib parser format of rdf_data (Turtle by default).

  Returns:
    A set with the instance nodes; empty when the ontology cannot be parsed
    (the error is printed instead of raised).
  """
  instances = set()
  try:
    # Parse once and reuse the graph for both the class and the instance scan.
    g = Graph().parse(data=rdf_data, format=format)
    classes = {s for s, p, o in g.triples((None, RDF.type, RDFS.Class))}
    for s, p, o in g.triples((None, RDF.type, None)):
      if o in classes:
        instances.add(s)
  except Exception as e:
    # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    print(f"Error parsing ontology: {e}")
  return instances

########################################################################################################################
#                                                  Global Configuration
########################################################################################################################

# Apply the nest_asyncio patch before any coroutine is run in the notebook.
nest_asyncio.apply()

# CONFIG holds the parsed content of config.toml (API keys, spreadsheet URLs).
with open("./config.toml", 'r') as config_file:
  CONFIG = toml.load(config_file)

# Remove pandas' display limits so result tables are shown in full.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
  pandas.set_option(display_option, None)

# ***Ontology Generation via LLMs for Technical Documents***

## Tools

- Testing and Automation: Python
	- PDF parsing: PyPDF
	- Grammar Parsing: Textx
	- Ontology Handling: rdflib
- Ontology Format Validation: http://ttl.summerofcode.be/
- Ontology Visualizer: https://webprotege.stanford.edu/
- LLM - Google AI Studio: https://aistudio.google.com/
- LLM - OpenRouter: https://openrouter.ai/

## Used Models

Data taken from:
- https://lmarena.ai/ as of 2025-02-19
- https://llm-stats.com/ as of 2025-02-19

| Name | Rank | License | Context |
|-|-|-|-|
| Gemini-2.0-Flash-Thinking-Exp-01-21 | 2 | Proprietary | 1.000.000 |
| Gemini-2.0-Flash-001 | 5 | Proprietary | 1.048.576 |
| Grok 2 | 18 | Proprietary | 128.000 |
| GPT 4o | 2 | Proprietary | 128.000 |

# 1 **Blind-Generation**



## 1.2 Entry

### 1.2.1 Documents

In [3]:
# Paths of the subject-guide PDFs available as input documents for the
# blind-generation tests (the visible test cells use doc_1).
doc_1 = "./docs/subject-guide/upm-gpr.pdf"
doc_2 = "./docs/subject-guide/upm-cs.pdf"

### 1.2.2 Prompts

In [None]:
# Prompt texts for the blind-generation tests. Path.read_text() closes each
# file after reading, unlike a bare open(...).read().
ppt_1 = pathlib.Path("./blind-gen/ppt-1.txt").read_text()
ppt_2 = pathlib.Path("./blind-gen/ppt-2.txt").read_text()
ppt_3 = pathlib.Path("./blind-gen/ppt-3.txt").read_text()

### 1.3 Definition of "Important Information"


- I1 Course Name
- I2 Course Code
- I3 Course University Name
- I4 Course Career Name
- I5 Course Career Code
- I6 Course Career Center
- I7 Course Academic Year
- I8 Course Course
- I9 Course Semester
- I10 Course Credits
- I11 Course Mandatory
- I12 Course Professors
  - I12.1 Course Professor Name
  - I12.2 Course Professor Email
  - I12.3 Course Professor Tutor Hours
  - I12.4 Course Professor's Office
- I13 Recommended Previous Coursed Courses
- I14 Recommended Other Previous Knowledge
- I15 Course Competencies
  - I15.1 Course Competency Code
  - I15.2 Course Competency Description
- I16 Course Learning Results
  - I16.1 Course Learning Results Code
  - I16.2 Course Learning Results Description
- I17 Course Description
- I18 Course Topics
- I19 Course Activities
- I20 Course Evaluation Criteria
- I21 Course Didactic Resources

## 1.6 Tests


### 1.6.1 Initial Tests [1.x.x]

#### 1.6.1.1 With Gemini 2.0 Flash [1.1.x]

##### **Implementation**

In [None]:
prompt = ppt_1
document = doc_1
path_results = "/content/gemini-2.0-flash-result.ttl"

# Send the prompt followed by the text extracted from the PDF to Gemini.
client = genai.Client(api_key=CONFIG["keys"]["google-ai-api"])
response = client.models.generate_content(
  model="gemini-2.0-flash",
  contents=[
    prompt + " Pdf: " + read_pdf_text(document)
  ]
)

# Write as UTF-8 explicitly (like the guided-generation save helpers) so the
# result does not depend on the platform's default encoding.
with open(path_results, 'w', encoding="utf-8") as f:
  f.write(response.text)

#### 1.6.1.2 Gemini 2.0 Flash Thinking Experimental Free [1.2.x]

##### **Implementation**

In [None]:
prompt = ppt_2
document = doc_1
path_results = "/content/gemini-2.0-flash-thinking-experimental-free-result.ttl"

# Using Open Router
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=CONFIG["keys"]["open-router-api"],
)

response = client.chat.completions.create(
  model="google/gemini-2.0-flash-thinking-exp:free",
  messages=[
    {
      "role": 'user',
      "content": prompt + " Pdf: " +  read_pdf_text(document)
    }
  ]
)

try:
  # Extract the generated text before opening the output file, so a response
  # without choices does not leave an empty result file behind.
  ontology_text = response.choices[0].message.content
  with open(path_results, 'w', encoding="utf-8") as f:
    f.write(ontology_text)
except Exception as e:
  # Show the failure together with the raw response for diagnosis.
  print(e)
  print(response)



#### 1.6.1.3 Grok 2 [1.3.x]

##### **Implementation**

Using https://grok.com/ feed ppt-1 and doc-1 to the LLM.

##### **Results**

In [None]:
# Displays every value of worksheet 221064902 of the blind-gen validation spreadsheet as a DataFrame.
# NOTE(review): `gc` is not defined in the visible part of this notebook — presumably an
# authorized Google Sheets client created elsewhere; confirm before running this cell.
pandas.DataFrame.from_records(gc.open_by_url(CONFIG["blind-gen"]["validation"]["url"]).get_worksheet_by_id(221064902).get_all_values())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,Test Number,Prompt,Document,Result,I1 - Course Name,I2 - Course Code,I3 - Course University Name,I4 - Course Career Name,I5 - Course Career Code,I6 - Course Career Center,I7 - Course Academic Year,I8 - Course Course,I9 - Course Semester,I10 - Course Credits,I11 - Course Mandatory,I12 - Course Professors,I12.1 - Course Professor Name,I12.2 - Course Professor Email,I12.3 - Course Professor Tutor Hours,I12.4 - Course Professor's Office,I13 - Recommended Previous Coursed Courses,I14 - Recommended Other Previous Knowledge,I15 - Course Competencies,I15.1 - Course Competency Code,I15.2 - Course Competency Description,I16 - Course Learning Results,I16.1 - Course Learning Results Code,I16.2 - Course Learning Results Description,I17 - Course Description,I18 - Course Topics,I19 - Course Activities,I20 - Course Evaluation Criteria,I21 - Course Didactic Resources,Valid,Comprehensiveness
1,1,ppt-1,doc-1,blind-gen_test-1.3.1,,,,,,,,Error in the Value,,,,,,,,,,,,,,,,,,N7A,,,,YES,03793103448
2,2,ppt-1,doc-1,blind-gen_test-1.3.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NO,
3,3,ppt-1,doc-1,blind-gen_test-1.3.3,,,,,,,,,,,,Not related to Course,,,,,,,Not related to Course,,,Not related to Course,,,,,,,Not related to Course,YES,03793103448


##### **Observations**

- This model does not seem fit to generate complete ontologies.

#### 1.6.1.4 GPT 4o Reasoning [1.4.x]

##### **Implementation**

Using https://chatgpt.com/ feed ppt-1 and doc-1 to the LLM.

##### **Results**

In [None]:
# Displays every value of worksheet 1658648548 of the blind-gen validation spreadsheet as a DataFrame.
# NOTE(review): `gc` is not defined in the visible part of this notebook — presumably an
# authorized Google Sheets client created elsewhere; confirm before running this cell.
pandas.DataFrame.from_records(gc.open_by_url(CONFIG["blind-gen"]["validation"]["url"]).get_worksheet_by_id(1658648548).get_all_values())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,Test Number,Prompt,Document,Result,I1 - Course Name,I2 - Course Code,I3 - Course University Name,I4 - Course Career Name,I5 - Course Career Code,I6 - Course Career Center,I7 - Course Academic Year,I8 - Course Course,I9 - Course Semester,I10 - Course Credits,I11 - Course Mandatory,I12 - Course Professors,I12.1 - Course Professor Name,I12.2 - Course Professor Email,I12.3 - Course Professor Tutor Hours,I12.4 - Course Professor's Office,I13 - Recommended Previous Coursed Courses,I14 - Recommended Other Previous Knowledge,I15 - Course Competencies,I15.1 - Course Competency Code,I15.2 - Course Competency Description,I16 - Course Learning Results,I16.1 - Course Learning Results Code,I16.2 - Course Learning Results Description,I17 - Course Description,I18 - Course Topics,I19 - Course Activities,I20 - Course Evaluation Criteria,I21 - Course Didactic Resources,Valid,Comprehensiveness
1,1,ppt-1,doc-1,blind-gen_test-1.4.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,YES,03103448276
2,2,ppt-1,doc-1,blind-gen_test-1.4.2,,Concatenated with I3,Concatenated with I2,Concatenated with I5,Concatenated with I4,Concatenated with I7,Concatenated with I6,,,,,,,,,,,,,,,,,,,Does not contain Sub-Topics,,,,YES,06206896552


### 1.6.2 Detailed Prompt and usage of Low Temperature  [2.x.x]

##### **Reasoning**

Defining a good prompt is essential to obtain a good result from an LLM. We will refine the first prompt to include the LLM's role and high-level generation rules that can be applied to any document.

Additionally, low temperatures allow for more consistent results, while high temperatures generate more creative results. In our case, we want the ontology to contain only the information defined in our documents, so a lower temperature is more appropriate.

##### **Generation**

1. File text extraction.
2. LLM feeding with extracted text, prompt and temperature.
3. Resulting Ontology

#### 1.6.2.1 With Gemini 2.0 Flash [2.1.x]

##### **Implementation**

In [None]:
prompt = ppt_3
document = doc_1
path_results = "/content/gemini-2.0-flash-result-t2.ttl"

# Using Google AI Studio
client = genai.Client(api_key=CONFIG["keys"]["google-ai-api"])
response = client.models.generate_content(
  model="gemini-2.0-flash",
  contents=[
    prompt + "\nTemperature: 0.6 " + "\nPdf: " + read_pdf_text(document)
  ],
  # Text inside the prompt cannot change the sampling temperature; it has to
  # be passed as a generation parameter. The prompt text is left untouched.
  config=types.GenerateContentConfig(temperature=0.6)
)

ontology = response.text
# Write as UTF-8 explicitly so the result does not depend on the platform's
# default encoding.
with open(path_results, 'w', encoding="utf-8") as f:
  f.write(ontology)

##### **Observations**

- There are more disconnected classes in low temperature values.
- There are more concatenation problems in low temperature values.
- Comprehensiveness seems to be lower in low temperatures.
- Overall, the results seem better than the ones in *1.1.x*.

#### 1.6.2.2 With Gemini 2.0 Flash Thinking Experimental Free [2.2.x]

##### **Implementation**

In [None]:
prompt = ppt_3
document = doc_1
# Own output file: reusing the 2.1.x path would overwrite that test's result.
path_results = "/content/gemini-2.0-flash-thinking-experimental-free-result-t2.ttl"

# Using Open Router
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=CONFIG["keys"]["open-router-api"],
)

response = client.chat.completions.create(
  model="google/gemini-2.0-flash-thinking-exp:free",
  # Text inside the prompt cannot change the sampling temperature; it has to
  # be passed as a request parameter. The prompt text is left untouched.
  temperature=0.6,
  messages=[
    {
      "role": 'user',
      "content": prompt + "\nTemperature: 0.6 " + "\nPdf: " +  read_pdf_text(document)
    }
  ]
)

try:
  # Extract the generated text before opening the output file, so a response
  # without choices does not leave an empty result file behind.
  ontology = response.choices[0].message.content
  with open(path_results, 'w', encoding="utf-8") as f:
    f.write(ontology)
except Exception as e:
  # Show the failure together with the raw response for diagnosis.
  print(e)
  print(response)

##### **Observations**

- The results seem better than the ones in *1.2.x*.
- In test *2.2.5* the LLM was capable of representing *I14*, something that had not been achieved until now. This means the LLM could understand and represent the absence of data (since I14 is defined in the document but has no data).
- This model seems fit to generate ontologies, although more fine-tuning is required.

# 2 **Guided-Generation**

## 2.1 Entry

- PDF
- PUML
- PROMPT

## 2.2 Tests

In [14]:
# google
## gemini-2.0-flash-thinking-exp-1219 (gemini-2.0-flash-thinking-exp)

# openrouter
## google/gemini-2.0-flash-thinking-exp:free (gemini-2.0-flash-thinking-exp)
## deepseek/deepseek-r1:free (deepseek-r1)
## google/gemini-2.5-pro-exp-03-25:free (gemini-2.5-pro-exp-03-25)

# Subject-guide PDFs every model is paired with. The order matters: the
# test_quantities tuples below follow it position by position.
_SUBJECT_GUIDE_PDFS = (
    "./docs/subject-guide/upm-cs.pdf",
    "./docs/subject-guide/upm-gpr.pdf",
    "./docs/subject-guide/sevius-fp.pdf",
    "./docs/subject-guide/sevius-so.pdf",
)


def _model_entries(service, llm_model, llm_name, test_quantities):
    """Builds one test entry per subject-guide PDF for a single model.

    Args:
        service: Provider key ("google", "openrouter" or "groq").
        llm_model: Model identifier sent to the provider.
        llm_name: Short model name stored with the entry.
        test_quantities: One "test_quantity" value per PDF, in the order of
            _SUBJECT_GUIDE_PDFS.
    """
    return [
        {
            "pdf_path": pdf_path,
            "puml_path": "./guided-gen/puml/subject-guide.puml",
            "prompt_path": "./guided-gen/prompts/ppt-2.txt",
            "service": service,
            "llm_model": llm_model,
            "llm_name": llm_name,
            "test_quantity": test_quantity,
        }
        for pdf_path, test_quantity in zip(_SUBJECT_GUIDE_PDFS, test_quantities)
    ]


# Same 28 entries, in the same order, as the former hand-written list. The only
# value changed is the last llama-4-maverick model id, which used "-free"
# instead of the ":free" suffix of its three siblings.
entries = [
    *_model_entries("google", "gemini-2.0-flash-thinking-exp-1219", "gemini-2.0-flash-thinking-exp", (5, 0, 0, 0)),
    *_model_entries("openrouter", "deepseek/deepseek-r1:free", "deepseek-r1", (0, 0, 0, 0)),
    *_model_entries("openrouter", "google/gemini-2.5-pro-exp-03-25:free", "gemini-2.5-pro-exp-03-25", (0, 0, 0, 0)),
    *_model_entries("openrouter", "google/gemini-2.0-flash-exp:free", "gemini-2.0-flash-exp", (0, 0, 0, 0)),
    *_model_entries("groq", "meta-llama/llama-4-scout-17b-16e-instruct", "llama-4-scout-17b-16e", (0, 0, 0, 0)),
    *_model_entries("openrouter", "meta-llama/llama-4-maverick:free", "llama-4-maverick", (0, 0, 0, 0)),
    *_model_entries("google", "gemini-2.5-pro-preview-06-05", "gemini-2.5-pro-preview-06-05", (10, 10, 10, 10)),
]

## 2.4 Implementation

In [3]:
from openai import AsyncOpenAI
import time
import json
import os

# ============================================ Utility ============================================
def read_plantuml(plantuml_file_path: str) -> str:
    """Returns the full text of the PlantUML file at plantuml_file_path."""
    source_file = open(plantuml_file_path, "r")
    try:
        puml_text = source_file.read()
    finally:
        source_file.close()
    return puml_text
    
def create_client_openrouter_async():
    """Returns an AsyncOpenAI client pointed at the OpenRouter API."""
    openrouter_client = AsyncOpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=CONFIG["keys"]["open-router-api"],
    )
    return openrouter_client

def create_client_googleai():
    """Returns an AsyncOpenAI client pointed at Google's OpenAI-compatible endpoint."""
    google_client = AsyncOpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=CONFIG["keys"]["google-ai-api"],
    )
    return google_client

def create_client_groq():
    """Returns an AsyncOpenAI client pointed at the Groq API."""
    groq_client = AsyncOpenAI(
        base_url="https://api.groq.com/openai/v1",
        api_key=CONFIG["keys"]["groq-api"],
    )
    return groq_client

### 2.4.1 Generate Ontology

In [4]:
async def bulk_generate(pdf_file_path: str, plantuml_file_path: str, prompt_file_path: str, n: int, service:str, model: str):
    pdf_text = read_pdf_text(pdf_file_path)
    plantuml_text = read_plantuml(plantuml_file_path)
    prompt = open(prompt_file_path, "r").read()

    match service:
        case "openrouter": client = create_client_openrouter_async()
        case "google": client = create_client_googleai()
        case "groq": client = create_client_groq()
    
    request_tasks = []
    for i in range(n):
        request_tasks.append(
            asyncio.create_task(
                generate_ontology(i, pdf_text, plantuml_text, prompt, model, client)
            )
        )
    responses = await asyncio.gather(*request_tasks)

    return [
        {
            "ontology": responses[i]["ontology"],
            "responseTime": responses[i]["responseTime"]
        } for i in range(0, n) if responses[i] != None
    ]

def _extract_out_block(content: str) -> str:
    """Returns the text between the first "<out>" and the last "</out>".

    A missing tag falls back to the corresponding end of the content, so an
    untagged answer is returned whole instead of being sliced at arbitrary
    offsets.
    """
    open_tag = "<out>"
    start = content.find(open_tag)
    start = 0 if start == -1 else start + len(open_tag)
    end = content.rfind("</out>")
    if end < start:  # also covers end == -1 (closing tag missing)
        end = len(content)
    return content[start:end]


async def generate_ontology(i, pdf_text: str, plantuml_text: str, prompt: str, model: str, client):
    """Requests one ontology from the model and extracts it from the answer.

    Args:
        i: Request index, used only to label the progress messages.
        pdf_text: Text extracted from the source PDF.
        plantuml_text: PlantUML text sent along with the PDF text.
        prompt: Prompt text.
        model: Model identifier passed to the provider.
        client: Async client exposing chat.completions.create(...).

    Returns:
        {"ontology": <text inside <out>...</out>>, "responseTime": <whole
        seconds>} on success, or None when the request or the response
        handling fails (the error is printed).
    """
    print(f"{i} >> Requesting Ontology...")

    # Monotonic clock: elapsed time is not affected by system clock changes.
    sent_time = time.monotonic()

    try:
        response = await client.chat.completions.create(
            model=model,
            # Text inside the prompt cannot change the sampling temperature;
            # it has to be passed as a request parameter. The prompt text is
            # left untouched.
            temperature=0.6,
            messages=[
                {
                    "role": 'user',
                    "content": "Temperature: 0.6" + prompt + " Pdf: " + pdf_text + " Plant UML: " + plantuml_text
                }
            ]
        )
    except Exception as e:
        print(f"{i} << [ERROR] [Request Failed] {e}")
        return None

    response_time = int(time.monotonic() - sent_time)
    print(f"{i} << Ontology Returned")

    try:
        content = _extract_out_block(response.choices[0].message.content)
    except Exception as e:
        print(f"{i} << [Error] [Response Failed] {e}")
        return None

    return {
        "ontology": content,
        "responseTime": response_time
    }

### 2.4.2 Validate Ontology

In [5]:
async def bulk_validate(plantuml_file_path: str, generation_results: list):
    """Validates every generated ontology against the PlantUML model.

    Args:
        plantuml_file_path: PlantUML file the ontologies are checked against.
        generation_results: List of dicts holding the ontology text under the
            "ontology" key.

    Returns:
        A list with the decoded validation result of each ontology, in the
        same order as generation_results.
    """
    plantuml_text = read_plantuml(plantuml_file_path)

    validation_tasks = [
        asyncio.create_task(
            validate_ontology(plantuml_text, result["ontology"])
        )
        for result in generation_results
    ]

    validations = await asyncio.gather(*validation_tasks)
    # validate_ontology returns JSON text; decode it back into Python objects.
    return [json.loads(validation) for validation in validations]

async def validate_ontology(plantuml_text: str, ontology_text: str) -> str:
    """Probes one ontology against the PlantUML model and returns the outcome as JSON text.

    When the probe raises, the JSON text holds {"error": <message>} instead.
    """
    print(">> Validating Ontology...")
    try:
        outcome = validate.probe_string("http://example.org/ontology/", plantuml_text, ontology_text)
    except Exception as error:
        outcome = { "error": str(error) }
    serialized = json.dumps(outcome)
    print("<< Ontology Validated")
    return serialized

### 2.4.3 Save Results

In [6]:
def save_generation_results(results: list, output_name: str, last_id):
    """Writes each generated ontology to "./<output_name>/<id>-ontology.ttl".

    Args:
        results: List of dicts holding the ontology text under "ontology".
        output_name: Output directory, relative to the working directory; it
            is created when missing.
        last_id: Last id already in use; files are numbered from last_id + 1.
    """
    print(">> Saving Generation Results")
    output_dir = f"./{output_name}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for file_id, result in enumerate(results, start=last_id + 1):
        with open(f"{output_dir}/{file_id}-ontology.ttl", "w", encoding="utf-8") as output_file:
            output_file.write(result["ontology"])
    print("<< Saving Generation Results")

def save_validation_results(results: list, output_name: str, last_id):
    """Writes each validation result to "./<output_name>/<id>-validation.json".

    Args:
        results: List of JSON-serializable validation results.
        output_name: Output directory, relative to the working directory; it
            is created when missing.
        last_id: Last id already in use; files are numbered from last_id + 1.
    """
    print(">> Saving Validation Results")
    output_dir = f"./{output_name}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for file_id, result in enumerate(results, start=last_id + 1):
        with open(f"{output_dir}/{file_id}-validation.json", "w", encoding="utf-8") as output_file:
            output_file.write(json.dumps(result))
    print("<< Saving Validation Results")

### 2.4.4 Generate DataFrame

In [7]:
import pandas as pd

def _first_result_value(query, key, cast):
    """Casts the `key` value of the query's first result row; 0 when there are no rows."""
    rows = query["result"]
    return cast(rows[0][key] if rows else 0)


def _total_presence(query):
    """Sums the "presence" value of every result row of the query."""
    return sum(int(instance["presence"]) for instance in query["result"])


def _class_metrics(class_name, class_value):
    """Flattens the validation data of one class into "<Class>[.<member>].<Metric>" columns."""
    metrics = {}
    metrics[f"{class_name}.Existence"] = class_value["queries"]["existence"]["result"]
    metrics[f"{class_name}.Presence"] = _first_result_value(class_value["queries"]["presence"], "presence", int)

    for field_name, field_value in class_value["fields"].items():
        queries = field_value["queries"]
        metrics[f"{class_name}.{field_name}.Existence"] = queries["existence"]["result"]
        metrics[f"{class_name}.{field_name}.Usage"] = _first_result_value(queries["usage"], "usage", float)
        metrics[f"{class_name}.{field_name}.Presence"] = _total_presence(queries["presence_per_instance"])

    for rel_name, rel_value in class_value["relationships"].items():
        queries = rel_value["queries"]
        metrics[f"{class_name}.{rel_name}.Existence"] = queries["existence"]["result"]
        # Relationships only carry usage / presence queries in some cases.
        if "usage" in queries:
            metrics[f"{class_name}.{rel_name}.Usage"] = _first_result_value(queries["usage"], "usage", float)
        if "presence_per_instance" in queries:
            metrics[f"{class_name}.{rel_name}.Presence"] = _total_presence(queries["presence_per_instance"])

    return metrics


def generate_dataframe(pdf, prompt, llm, service, generation_results, validation_results, last_id):
    """Builds a DataFrame with one row per validated ontology.

    Each row holds the run metadata (PDF, Prompt, LLM, Service, ID, Response
    Time, Valid) and, for valid ontologies, the Existence / Usage / Presence
    metrics of every class, field and relationship in the validation result.

    Args:
        pdf, prompt, llm, service: Values copied into every row.
        generation_results: List of dicts with a "responseTime" entry, aligned
            by position with validation_results.
        validation_results: List of validation results; one containing an
            "error" key marks the ontology as invalid.
        last_id: Last id already in use; rows are numbered from last_id + 1.

    Returns:
        A pandas DataFrame with the collected rows.
    """
    data = []

    for i, validation in enumerate(validation_results):
        entry = {
            "Total Quality": 0,  # only initialised here, not computed by this function
            "PDF": pdf,
            "Prompt": prompt,
            "LLM": llm,
            "Service": service,
            "ID": i + last_id + 1,
            "Response Time": int(generation_results[i]["responseTime"]),
            "Valid": "error" not in validation,
        }

        if entry["Valid"]:
            for class_name, class_value in validation["results"].items():
                entry.update(_class_metrics(class_name, class_value))

        data.append(entry)

    return pd.DataFrame(data)

### 2.4.5 Generate Charts

In [8]:
import matplotlib.pyplot as plt
import numpy as np

def generate_charts(dataFrame, output_path: str):
    """Build every chart for ``dataFrame``, passing ``output_path`` to each builder.

    The chart builders are invoked one after another in a fixed order; each
    receives the same data frame and the same output directory.
    """
    chart_builders = (
        pie_chart_valid_ontologies,
        bar_chart_response_time,
        bar_chart_existence,
        bar_chart_success_ontology,
        bar_chart_total_usage_ontology,
        bar_chart_total_presence_ontology,
        box_plot_usage,
        box_plot_presence,
    )
    for build_chart in chart_builders:
        build_chart(dataFrame, output_path)

# 1. Pie chart of Valid responses vs. Invalid responses
def pie_chart_valid_ontologies(df, output_path: str):
    """Save a pie chart of valid vs. invalid responses.

    Args:
        df: Data frame with a ``Valid`` column holding ``True``/``False``.
            Any other value raises ``KeyError`` during label lookup.
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to ``<output_path>/pie-chart-valid-ontologies.pdf``.
    The figure is closed afterwards so repeated calls do not accumulate open
    pyplot figures.
    """
    valid_counts = df['Valid'].value_counts()

    custom_labels = {True: 'Valid', False: 'Invalid'}
    custom_colors = {True: 'skyblue', False: 'salmon'}

    # Labels and colours are looked up per index value so they stay aligned
    # with the wedge sizes whatever order value_counts() returns.
    labels = [custom_labels[index] for index in valid_counts.index]
    colors = [custom_colors[index] for index in valid_counts.index]
    values = valid_counts.values

    fig = plt.figure(figsize=(4, 4))
    try:
        plt.pie(values,
            labels=labels,
            autopct='%1.1f%%',
            startangle=90,
            colors=colors,
            shadow=False)

        plt.title('Distribution of Valid Responses')
        plt.ylabel('')
        plt.tight_layout()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_path, exist_ok=True)
        fig.savefig(f"{output_path}/pie-chart-valid-ontologies.pdf")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

# 2. Bar chart of each Response Time with Average Line
def bar_chart_response_time(df, output_path: str):
    """Save a bar chart of each response time with a dashed average line.

    Args:
        df: Data frame with a ``Response Time`` column (plotted as bar
            heights) and an ``ID`` column (used as x tick labels).
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to ``<output_path>/bar-chart-response-time.pdf``.
    The figure is closed afterwards so repeated calls do not accumulate open
    pyplot figures.
    """
    response_times = df['Response Time']
    indices = range(len(response_times))
    average_response_time = response_times.mean()

    fig = plt.figure(figsize=(12, 6))
    try:
        plt.bar(indices, response_times, color='skyblue', label='Individual Response Times')
        plt.axhline(y=average_response_time, color='red', linestyle='--', linewidth=2, label=f'Average Response Time ({average_response_time:.2f} seconds)')

        plt.title('Individual Response Times with Average Line')
        plt.xlabel('Response Index (Data Point)')
        plt.ylabel('Response Time (seconds)')
        plt.xticks(indices, labels=df['ID'], rotation=45, ha='right')
        plt.legend()
        plt.tight_layout()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_path, exist_ok=True)
        fig.savefig(f"{output_path}/bar-chart-response-time.pdf")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

# 4. Grouped Bar Chart for 'Existence' Metrics (Success vs. Fail)
def bar_chart_existence(df, output_path: str):
    """Save a horizontal grouped bar chart counting each status per existence metric.

    Args:
        df: Data frame whose existence metrics live in columns ending with
            ``.Existence``; each cell holds a status value such as "Success".
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to
    ``<output_path>/bar-chart-existence-horizontal.pdf``. Nothing is written
    when there are no existence columns or all of their values are missing.
    """
    # Match on the suffix so unrelated columns that merely contain the word
    # "Existence" (or existence metrics whose name contains "Usage") are
    # classified correctly.
    existence_cols = [col for col in df.columns if col.endswith('.Existence')]

    if not existence_cols:
        print("No 'Existence' columns found in the DataFrame to plot.")
        return

    df_melted_existence = pd.melt(df, value_vars=existence_cols, var_name='Existence Metric', value_name='Status')
    # Missing statuses carry no information; groupby would drop them anyway,
    # and dropping them here lets us bail out before plotting an empty frame.
    df_melted_existence = df_melted_existence.dropna(subset=['Status'])
    if df_melted_existence.empty:
        print("No 'Existence' values found in the DataFrame to plot.")
        return

    existence_status_counts = df_melted_existence.groupby(['Existence Metric', 'Status']).size().unstack(fill_value=0)

    clean_index = existence_status_counts.index.str.removesuffix('.Existence')
    existence_status_counts = existence_status_counts.set_index(clean_index)

    # Colour by status name rather than by column position: with positional
    # colours a frame containing only "Success" was drawn in the failure colour.
    bar_colors = [
        'skyblue' if status == 'Success' else 'salmon'
        for status in existence_status_counts.columns
    ]

    num_metrics = len(existence_status_counts)
    fig_height = max(7, num_metrics * 0.25)
    fig, ax = plt.subplots(figsize=(12, fig_height))
    try:
        existence_status_counts.plot(kind='barh', width=0.5, color=bar_colors, ax=ax)

        ax.set_xlabel('Count')
        ax.set_ylabel('Metrics')
        ax.legend(title='Status', fontsize='small')

        # Keep the first metric at the top of the chart.
        ax.invert_yaxis()

        ax.grid(axis='x', linestyle='--', alpha=0.7)
        ax.tick_params(axis='y', labelsize=13)
        ax.tick_params(axis='x', which='major', labelsize=10)

        fig.tight_layout()

        os.makedirs(output_path, exist_ok=True)
        save_path = os.path.join(output_path, "bar-chart-existence-horizontal.pdf")
        fig.savefig(save_path)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

    print(f"Chart saved to {save_path}")

# 5. Box Plot of 'Usage' Metrics
def box_plot_usage(df, output_path: str):
    """Save a horizontal box plot of the distribution of every usage metric.

    Args:
        df: Data frame whose usage metrics live in columns ending with
            ``.Usage``.
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to ``<output_path>/box-plot-usage-horizontal.pdf``.
    Nothing is written when the frame has no usage columns.
    """
    # Match on the suffix: a substring test would also pick up columns such as
    # "X.UsageNotes.Existence", which hold status strings rather than numbers.
    usage_cols = [col for col in df.columns if col.endswith('.Usage')]

    if not usage_cols:
        print("No 'Usage' columns found in the DataFrame to plot.")
        return

    clean_names_map = {col: col.removesuffix('.Usage') for col in usage_cols}
    data_to_plot = df[usage_cols].rename(columns=clean_names_map)

    num_metrics = len(usage_cols)
    fig_height = max(6, num_metrics * 0.26)
    fig, ax = plt.subplots(figsize=(6, fig_height))
    try:
        data_to_plot.boxplot(
            ax=ax,
            vert=False,
            patch_artist=True,
            grid=False
        )

        # patch_artist=True draws the boxes as patches, which can be filled.
        for box in ax.patches:
            box.set_facecolor('lightblue')

        ax.set_xlabel('Usage Value')
        ax.set_ylabel('')
        ax.tick_params(axis='y', labelsize=13)

        ax.grid(axis='x', linestyle='--', alpha=0.7)
        fig.tight_layout()

        os.makedirs(output_path, exist_ok=True)
        save_path = os.path.join(output_path, "box-plot-usage-horizontal.pdf")
        fig.savefig(save_path)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

    print(f"Chart saved to {save_path}")

# 6. Bar Chart of Total "Success" Existence Count per Entry
def bar_chart_success_ontology(df, output_path: str):
    """Save a bar chart of how many existence checks succeeded for each entry.

    Args:
        df: Data frame with an ``ID`` column and existence metrics in columns
            ending with ``.Existence``; a cell equal to "Success" counts as one.
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to ``<output_path>/bar-chart-success-ontology.pdf``
    with a dashed line at the average count. Nothing is written when the frame
    has no existence columns. The figure is saved and closed, not shown.
    """
    existence_cols = [col for col in df.columns if col.endswith('.Existence')]

    if not existence_cols:
        print("No 'Existence' columns found in the DataFrame to plot.")
        return

    # Missing values compare unequal to "Success", so they count as zero.
    success_counts = (df[existence_cols] == "Success").sum(axis=1).tolist()
    entry_ids = df['ID'].tolist()

    fig = plt.figure(figsize=(5, 5))
    try:
        plt.bar(entry_ids, success_counts, color='lightblue')

        if success_counts:
            average_success = sum(success_counts) / len(success_counts)
            plt.axhline(y=average_success, color='red', linestyle='--')

        plt.title('Total Existence Sum per Entry')
        plt.xlabel('Entry ID')
        # The bars count successful existence checks, not usage values.
        plt.ylabel('Number of Successful Existence Checks')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()

        os.makedirs(output_path, exist_ok=True)
        fig.savefig(f"{output_path}/bar-chart-success-ontology.pdf")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

# 7. Bar Chart of Total Usage Sum per Entry
def bar_chart_total_usage_ontology(df, output_path: str):
    """Save a bar chart of the summed usage metrics of each entry.

    Args:
        df: Data frame with an ``ID`` column and usage metrics in columns
            ending with ``.Usage``; missing values are ignored in the sum.
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to
    ``<output_path>/bar-chart-total-usage-ontology.pdf`` with a dashed line at
    the average total. Nothing is written when the frame has no usage columns.
    """
    # Match on the suffix so columns that merely contain "Usage" in a field
    # name (e.g. "X.UsageNotes.Presence") are not added to the totals.
    usage_cols = [col for col in df.columns if col.endswith('.Usage')]

    if not usage_cols:
        print("No 'Usage' columns found in the DataFrame to plot.")
        return

    usage_totals = df[usage_cols].sum(axis=1, skipna=True).tolist()
    entry_ids = df['ID'].tolist()

    fig = plt.figure(figsize=(5, 5))
    try:
        plt.bar(entry_ids, usage_totals, color='lightcoral')

        if usage_totals:
            average_usage = sum(usage_totals) / len(usage_totals)
            plt.axhline(y=average_usage, color='red', linestyle='--')

        plt.title('Total Usage Sum per Entry')
        plt.xlabel('Entry ID')
        plt.ylabel('Sum of Usage Values')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()

        os.makedirs(output_path, exist_ok=True)
        fig.savefig(f"{output_path}/bar-chart-total-usage-ontology.pdf")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

def bar_chart_total_presence_ontology(df, output_path: str):
    """Save a bar chart of the summed presence metrics of each entry.

    Args:
        df: Data frame with an ``ID`` column and presence metrics in columns
            ending with ``.Presence``; missing values are ignored in the sum.
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to
    ``<output_path>/bar-chart-total-presence-ontology.pdf`` with a dashed line
    at the average total. Nothing is written when the frame has no presence
    columns.
    """
    presence_cols = df.filter(regex=r'\.Presence$').columns.tolist()

    if not presence_cols:
        print("No 'Presence' columns found in the DataFrame to plot.")
        return

    presence_totals = df[presence_cols].sum(axis=1, skipna=True).tolist()
    entry_ids = df['ID'].tolist()

    fig = plt.figure(figsize=(5, 5))
    try:
        plt.bar(entry_ids, presence_totals, color='lightblue')

        if presence_totals:
            average_presence = sum(presence_totals) / len(presence_totals)
            plt.axhline(y=average_presence, color='red', linestyle='--')

        plt.title('Total Presence Sum per Entry')
        plt.xlabel('Entry ID')
        plt.ylabel('Sum of Presence Values')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_path, exist_ok=True)
        fig.savefig(f"{output_path}/bar-chart-total-presence-ontology.pdf")
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

def box_plot_presence(df, output_path: str):
    """
    Creates and saves a horizontal box plot for the distribution of each '.Presence' metric.

    Presence values are plotted as stored in the frame; they are counts and
    are not assumed to be binary.

    Args:
        df: Data frame whose presence metrics live in columns ending with
            ``.Presence``.
        output_path: Directory for the chart; created if it does not exist.

    The chart is written to ``<output_path>/box-plot-presence-horizontal.pdf``.
    Nothing is written when the frame has no presence columns.
    """
    presence_cols = df.filter(regex=r'\.Presence$').columns.tolist()

    if not presence_cols:
        print("No '.Presence' columns found in the DataFrame to plot.")
        return

    clean_names_map = {col: col.removesuffix('.Presence') for col in presence_cols}
    data_to_plot = df[presence_cols].rename(columns=clean_names_map)

    num_metrics = len(presence_cols)
    fig_height = max(7, num_metrics * 0.26)

    fig, ax = plt.subplots(figsize=(6, fig_height))
    try:
        data_to_plot.boxplot(
            ax=ax,
            vert=False,
            patch_artist=True,
            grid=False
        )

        # patch_artist=True draws the boxes as patches, which can be filled.
        for box in ax.patches:
            box.set_facecolor('lightblue')

        ax.set_xlabel('Presence Value')
        ax.set_ylabel('')
        ax.tick_params(axis='y', labelsize=13)

        ax.grid(axis='x', linestyle='--', alpha=0.7)

        fig.tight_layout()

        os.makedirs(output_path, exist_ok=True)
        save_path = os.path.join(output_path, "box-plot-presence-horizontal.pdf")
        fig.savefig(save_path)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)

    print(f"Chart saved to {save_path}")

### 2.4.6 Run Process

In [11]:
async def run_process(entries: list, output_path: str):
    """Run generation, validation, persistence and charting for every entry.

    Args:
        entries: List of dicts, each providing ``pdf_path``, ``puml_path``,
            ``prompt_path``, ``test_quantity``, ``service``, ``llm_model`` and
            ``llm_name``.
        output_path: Root directory; each entry's results go to
            ``<output_path>/guided-gen/results/<puml>/<pdf>/<prompt>/<llm_name>``.

    Entries whose ``test_quantity`` is not positive are only reported, not run.
    For the others, new rows are appended to the entry's ``result-data.csv``
    (IDs continue after the highest stored ``ID``) and the charts are
    regenerated from the combined data.
    """
    n_entries = len(entries)
    for i, entry in enumerate(entries):
        pdf_file_path = entry["pdf_path"]
        puml_file_path = entry["puml_path"]
        prompt_file_path = entry["prompt_path"]
        test_quantity = entry["test_quantity"]
        service = entry["service"]
        llm = entry["llm_model"]
        llm_name = entry["llm_name"]

        # basename() also understands the platform's native separator, so
        # Windows-style paths do not end up whole inside the results path.
        pdf_file_name = os.path.basename(pdf_file_path).split(".")[0]
        puml_file_name = os.path.basename(puml_file_path).split(".")[0]
        prompt_file_name = os.path.basename(prompt_file_path).split(".")[0]
        path_results = f"{output_path}/guided-gen/results/{puml_file_name}/{pdf_file_name}/{prompt_file_name}/{llm_name}"    
        path_charts = f"{path_results}/graphs"
        path_csv = f"{path_results}/result-data.csv"

        print(f"=====================[ Running Test {i+1}/{n_entries} ]=====================")
        print(f"- Pdf file: {pdf_file_name}")
        print(f"- Puml file: {puml_file_name}")
        print(f"- Prompt file: {prompt_file_name}")
        print(f"- Results Output Dir: {path_results}")
        print(f"- Charts Output Dir: {path_charts}")
        print(f"- CSV Output Path: {path_csv}")
        print(f"- Run Count: {test_quantity}")
        print(f"----------------------------------------------------------------")

        if test_quantity <= 0:
            continue

        generation_results = await bulk_generate(pdf_file_path, puml_file_path, prompt_file_path, test_quantity, service, llm)
        validation_results = await bulk_validate(puml_file_path, generation_results)

        # Reset the history for every entry: previously a frame loaded for an
        # earlier entry stayed bound and was concatenated into the CSV of a
        # later entry that had no history of its own.
        old_data_frame = None
        last_id = -1
        try:
            old_data_frame = pd.read_csv(path_csv)
        except FileNotFoundError:
            pass  # First run for this entry: start a new history.
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            # Keep going so the freshly generated results are still saved.
            print(f"- Could not read previous results ({err}); starting a new history.")
        else:
            # Drop index columns written by earlier runs that saved the
            # DataFrame index; re-reading them piles up "Unnamed: N" columns.
            legacy_index_cols = old_data_frame.columns.str.startswith("Unnamed:")
            old_data_frame = old_data_frame.loc[:, ~legacy_index_cols]
            if "ID" in old_data_frame.columns and old_data_frame["ID"].notna().any():
                last_id = int(old_data_frame["ID"].max())

        save_generation_results(generation_results, path_results, last_id)
        save_validation_results(validation_results, path_results, last_id)
        new_data_frame = generate_dataframe(pdf_file_name, prompt_file_name, llm, service, generation_results, validation_results, last_id)

        if old_data_frame is None:
            data_frame = new_data_frame
        else:
            data_frame = pd.concat([old_data_frame, new_data_frame], ignore_index=True)

        os.makedirs(path_results, exist_ok=True)
        # index=False keeps the CSV schema stable across append cycles.
        data_frame.to_csv(path_csv, index=False)
        generate_charts(data_frame, path_charts)

In [None]:
# Run the pipeline for every configured entry, using "./testing" as the output root.
# NOTE(review): `entries` is not defined in this cell — presumably built in an
# earlier cell; top-level `await` requires an IPython/Jupyter session.
await run_process(entries, "./testing")