In [None]:
import os
import rdflib
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [None]:
entities = []

g = rdflib.Graph()

def fetch_sparql_results():
    sparql_result = []
    offset = 0
    sparql_batch_size = 100  # Batch size for SPARQL queries
    more_results = True

    while more_results:
        try:
            sparql_query = f"""
                            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                            select ?label where {{
                                SERVICE <https://nubbekg.aksw.org/sparql> {{
                                    ?s a <http://nubbekg.aksw.org/ontology#IsolationSite> .
                                    ?s rdfs:label ?label .
                                }}
                            }} LIMIT {sparql_batch_size} OFFSET {offset}
                            """
            current_results = g.query(sparql_query)
            current_results = list(current_results)
            if not current_results or len(current_results) < sparql_batch_size:
                more_results = False
            else:
                offset += sparql_batch_size
                sparql_result.extend(current_results)
        except Exception as e:
            print("Exception when querying SPARQL endpoint" + ": %s" % e)
            break

    return sparql_result

sparq_result = fetch_sparql_results()

for entry in sparq_result:
    label = entry.label
    doc = Document(page_content=label, metadata={'label': label})
    entities.append(doc)

# Create FAISS index from documents
faiss_index = FAISS.from_documents(entities, embeddings)

# Save the FAISS index locally
faiss_index.save_local("faiss_index")

# Load the FAISS index, allowing deserialization
loaded_faiss_index = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Perform a similarity search
query = "Minas Gerais"
docs_with_score = loaded_faiss_index.similarity_search_with_score(query, top_k=5)

for doc, score in docs_with_score:
    print(f"Document: {doc.page_content}, Score: {score}")

In [9]:
import os
from dotenv import load_dotenv
import rdflib
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Load environment variables from .env file
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

attributes = {
    "Compound": "http://nubbekg.aksw.org/ontology#Compound",
    "Bioactivity": "http://nubbekg.aksw.org/ontology#Bioactivity",
    "Species": "http://nubbekg.aksw.org/ontology#Species",
    "IsolationSite": "http://nubbekg.aksw.org/ontology#IsolationSite",
    "IsolationType": "http://nubbekg.aksw.org/ontology#IsolationType"
}

g = rdflib.Graph()

def fetch_sparql_results(attribute_uri):
    sparql_result = []
    offset = 0
    sparql_batch_size = 100  # Batch size for SPARQL queries
    more_results = True

    while more_results:
        try:
            sparql_query = f"""
                            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                            select ?label where {{
                                SERVICE <https://nubbekg.aksw.org/sparql> {{
                                    ?s a <{attribute_uri}> .
                                    ?s rdfs:label ?label .
                                }}
                            }} LIMIT {sparql_batch_size} OFFSET {offset}
                            """
            current_results = g.query(sparql_query)
            current_results = list(current_results)
            if not current_results or len(current_results) < sparql_batch_size:
                more_results = False
            else:
                offset += sparql_batch_size
                sparql_result.extend(current_results)
        except Exception as e:
            print(f"Exception when querying SPARQL endpoint for {attribute_uri}: {e}")
            break

    return sparql_result

# Ensure the faiss_index directory exists
os.makedirs("faiss_index", exist_ok=True)

for attribute_name, attribute_uri in attributes.items():
    entities = []
    print(f"Fetching results for {attribute_name}...")
    sparql_results = fetch_sparql_results(attribute_uri)

    for entry in sparql_results:
        label = entry.label
        doc = Document(page_content=label, metadata={'label': label})
        entities.append(doc)

    if entities:
        # Create FAISS index from documents
        faiss_index = FAISS.from_documents(entities, embeddings)

        # Save the FAISS index locally with attribute-specific naming
        index_path = os.path.join("faiss_index", f"faiss_index_{attribute_name}")
        faiss_index.save_local(index_path)

        # Load the FAISS index, allowing deserialization
        loaded_faiss_index = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

        # Perform a similarity search
        query = "Minas Gerais"
        docs_with_score = loaded_faiss_index.similarity_search_with_score(query, top_k=5)

        print(f"Results for {attribute_name}:")
        for doc, score in docs_with_score:
            print(f"Document: {doc.page_content}, Score: {score}")
        print("\n")
    else:
        print(f"No results found for {attribute_name}\n")


Fetching results for Compound...
No results found for Compound

Fetching results for Bioactivity...
No results found for Bioactivity

Fetching results for Species...
Results for Species:
Document: Brosimum paraense, Score: 0.3206876218318939
Document: Hortia brasiliana, Score: 0.36012303829193115
Document: Tovomita brasiliensis, Score: 0.36122336983680725
Document: Strychnos brasiliensis, Score: 0.3688787519931793


Fetching results for IsolationSite...
Results for IsolationSite:
Document: Minas Novas ,MG, Score: 0.12228038907051086
Document: Brasilandia De Minas ,MG, Score: 0.13958212733268738
Document: Brasilia De Minas ,MG, Score: 0.14308050274848938
Document: Maripa De Minas ,MG, Score: 0.15157777070999146


Fetching results for IsolationType...
No results found for IsolationType

