In [1]:
import os
import numpy as np
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import openai

load_dotenv()

True

In [2]:
# %%
# Configure API settings
os.environ["TENTRIS_BASE_URL_EMBEDDINGS"] = os.getenv("TENTRIS_BASE_URL_EMBEDDINGS")
os.environ["TENTRIS_API_KEY"] = os.getenv("TENTRIS_API_KEY")
os.environ["TENTRIS_BASE_URL_CHAT"] = os.getenv("TENTRIS_BASE_URL_CHAT")


In [3]:
# %% [markdown]
# ## 2. Embedding Functions

# %%
def embed_documents(texts):
    client = openai.OpenAI(
        base_url=os.getenv("TENTRIS_BASE_URL_EMBEDDINGS"),
        api_key=os.getenv("TENTRIS_API_KEY"),
        timeout=60
    )
    responses = client.embeddings.create(input=texts, model="tentris")
    return [data.embedding for data in responses.data]

def embed_query(text):
    client = openai.OpenAI(
        base_url=os.getenv("TENTRIS_BASE_URL_EMBEDDINGS"),
        api_key=os.getenv("TENTRIS_API_KEY"),
        timeout=60
    )
    response = client.embeddings.create(input=[text], model="tentris")
    return response.data[0].embedding

In [27]:
loader = TextLoader("../data/speech.txt")  # Update path as needed
documents = loader.load()

print(f"Loaded {len(documents)} document(s)")
print("First document sample:")
print(documents[0].page_content[:200] + "...")

Loaded 1 document(s)
First document sample:
Simon Bin is a person who was affiliated with DICE Research. 
He has worked on the DAIKIRI project and is listed as an alumnus. 
Simon's email is sbin@informatik.uni-leipzig.de, and his chat handle is...


In [28]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200
)
split_docs = text_splitter.split_documents(documents)[:2]  # <-- Only keep first 2 chunks

print(f"Using first {len(split_docs)} chunks only")
print("\nChunk 1:")
print(split_docs[0].page_content[:])
print("\nChunk 2:")
print(split_docs[1].page_content[:] + "...")

Using first 2 chunks only

Chunk 1:
Simon Bin is a person who was affiliated with DICE Research. 
He has worked on the DAIKIRI project and is listed as an alumnus. 
Simon's email is sbin@informatik.uni-leipzig.de, and his chat handle is @sbin:chat.dice-research.org. 
His phone number is +49-341-22903736, and his office is located in Leipzig.
Simon Bin is a person who was affiliated with DICE Research. He has worked on the DAIKIRI project and is listed as an alumnus. Simon's email is sbin@informatik.uni-leipzig.de, and his chat handle is @sbin:chat.dice-research.org. His phone number is +49-341-22903736, and his office is located in Leipzig.
Jan Reineke is an alumnus of DICE Research and holds the position of Person. His office is located at TP6.3.307. His email address is jan.reineke@uni-paderborn.de, and his chat handle is @jarei:chat.dice-research.org. His phone number is +49-525-16-05190, and his photo is available at reineke.jpg.

Chunk 2:
RenÃƒÂ© Speck is a research staff member a

In [5]:
index_path = "faiss_index"

if os.path.exists(index_path):
    print("Loading existing index...")
    db = FAISS.load_local(
        index_path,
        embed_query,
        allow_dangerous_deserialization=True
    )
else:
    print("Creating new index...")
    # Get text content for embedding
    texts = [doc.page_content for doc in split_docs]
    
    # Generate embeddings
    print("Generating embeddings...")
    embeddings = embed_documents(texts)
    
    # Create FAISS index
    db = FAISS.from_embeddings(
        text_embeddings=list(zip(texts, embeddings)),
        embedding=embed_query
    )
    db.save_local(index_path)

print("Vector store ready!")


Loading existing index...


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


Vector store ready!


In [11]:
# ## 5. Similarity Search Debugging

# %%
# Test question
test_question = "What is the main theme of the document?"

# Perform similarity search
similar_docs = db.similarity_search(test_question, k=3)

print(f"Found {len(similar_docs)} relevant chunks:")
for i, doc in enumerate(similar_docs, 1):
    print(f"\nChunk {i} ({len(doc.page_content)} characters):")
    print(doc.page_content[:200] + "...")

Found 3 relevant chunks:

Chunk 1 (874 characters):
Person Profile: Farshad Afshari
schema1:namePrefix: 
schema1:role: dice:Infrastructure
schema1:phone: tel:
schema1:fax: tel:
schema1:email: ns1:uni-paderborn.de
schema1:chat: @afshari:chat.dice-resear...

Chunk 2 (361 characters):
Person Profile: Ajay Kumar
schema1:project: dice:ClimatebOWL
schema1:role: dice:StudentResearcher
schema1:photo: ajayKumar.jpg
schema1:content:  I am a computer science student interested in integrati...

Chunk 3 (936 characters):
Additional Information:
# Bio

Kunal is a  Masters Student (Computer Science) at University of Bonn and works as a Research Assistant at AKSW in Leipzig. Before joining AKSW, Kunal has completed Bac...


In [12]:
# ## 6. Prompt Construction

# %%
# Build context
context = "\n\n".join([doc.page_content for doc in similar_docs])
prompt = f"""Context: {context}

Question: {test_question}

Answer:"""

print("Constructed prompt:")
print(prompt[:1500] + "..." if len(prompt) > 1500 else prompt)

Constructed prompt:
Context: Person Profile: Farshad Afshari
schema1:namePrefix: 
schema1:role: dice:Infrastructure
schema1:phone: tel:
schema1:fax: tel:
schema1:email: ns1:uni-paderborn.de
schema1:chat: @afshari:chat.dice-research.org
schema1:office: FU.201.4
schema1:photo: afshari.jpg
schema1:project: dice:FROCKG
schema1:project: dice:NEBULA
schema1:project: dice:ENEXA
schema1:content: 
  <p>
  <b>Linkedin </b><a href="https://www.linkedin.com/in/afsharifarshad/"> &ensp;Farshad Afshari </a>
  </p>
  <hr/>
  <p>
  <b>Personal website </b><a href="https://farshadafshari.com/"> &ensp;Farshad Afshari website </a>
  </p>
  

Additional Information:

  <p>
  <b>Linkedin </b><a href="https://www.linkedin.com/in/afsharifarshad/"> &ensp;Farshad Afshari </a>
  </p>
  <hr/>
  <p>
  <b>Personal website </b><a href="https://farshadafshari.com/"> &ensp;Farshad Afshari website </a>
  </p>

Person Profile: Ajay Kumar
schema1:project: dice:ClimatebOWL
schema1:role: dice:StudentResearcher
schema1:phot

In [13]:
# ## 7. Query Execution

# %%
def get_chat_response(prompt):
    client = openai.OpenAI(
        base_url=os.getenv("TENTRIS_BASE_URL_CHAT"),
        api_key=os.getenv("TENTRIS_API_KEY")
    )
    response = client.chat.completions.create(
        model="tentris",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content

# %%
# Get final answer
answer = get_chat_response(prompt)
print("Final answer:")
print(answer)

Final answer:
The main theme of the document is the profiles of Farshad Afshari and Ajay Kumar, detailing their roles, contact information, projects, and areas of interest within the context of the DICE research group at the University of Paderborn. Additionally, there is a brief bio provided for Kunal, who is a Masters Student at the University of Bonn and a Research Assistant at AKSW in Leipzig.


In [None]:
"""
🛠️ How It Works
Loads RDF (.ttl) files from a specified folder.
Parses the RDF graph using rdflib.Graph().
Extracts subject-predicate-object triples.
Formats entities:
URIs → Shortened (last part of the URL)
Literals → Kept as-is
Saves processed text in a .txt file for RAG applications.

"""





%pip install rdflib tqdm

import os
import rdflib
from rdflib.namespace import RDF, RDFS, FOAF, OWL, XSD
from tqdm import tqdm

class RDFParser:
    def __init__(self, directory: str, output_file: str):
        """
        Initializes the RDF parser.
        :param directory: Path to folder containing RDF/Turtle files.
        :param output_file: Path to save extracted text.
        """
        self.directory = directory
        self.output_file = output_file
        self.graph = rdflib.Graph()

    def load_rdf_files(self):
        """Loads all .ttl files from the given directory into an RDF graph."""
        ttl_files = [f for f in os.listdir(self.directory) if f.endswith(".ttl")]
        print(f"Loading {len(ttl_files)} RDF files...")

        for file in tqdm(ttl_files, desc="Parsing RDF Files"):
            file_path = os.path.join(self.directory, file)
            self.graph.parse(file_path, format="turtle")

        print(f"✅ Loaded {len(self.graph)} triples from RDF files.")

    def extract_triples(self):
        """Extracts triples and converts them into readable text format."""
        extracted_text = []
        for s, p, o in tqdm(self.graph, desc="Extracting Triples"):
            s_text = self.format_entity(s)
            p_text = self.format_entity(p)
            o_text = self.format_entity(o)

            extracted_text.append(f"{s_text} {p_text} {o_text}.")

        print(f"✅ Extracted {len(extracted_text)} textual triples.")
        return extracted_text

    def format_entity(self, entity):
        """Formats an RDF entity: shortens URIs or keeps literals."""
        if isinstance(entity, rdflib.URIRef):
            return entity.split("/")[-1]  # Get the last part of the URI
        elif isinstance(entity, rdflib.Literal):
            return str(entity)  # Keep literals as-is
        return entity.n3()

    def save_text(self, text_data):
        """Saves extracted text to a file."""
        with open(self.output_file, "w", encoding="utf-8") as f:
            f.write("\n".join(text_data))
        print(f"✅ Processed RDF data saved to {self.output_file}")

    def run(self):
        """Runs the entire pipeline: Load, Extract, Save."""
        self.load_rdf_files()
        text_data = self.extract_triples()
        self.save_text(text_data)


# Example Usage
if __name__ == "__main__":
    rdf_parser = RDFParser(directory="../dice-website/data/people/", output_file="rdf_extracted.txt")
    rdf_parser.run()



[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
Loading 104 RDF files...


Parsing RDF Files: 100%|██████████| 104/104 [00:02<00:00, 36.58it/s]


✅ Loaded 1165 triples from RDF files.


Extracting Triples: 100%|██████████| 1165/1165 [00:00<00:00, 129577.41it/s]

✅ Extracted 1165 textual triples.
✅ Processed RDF data saved to rdf_extracted.txt



