In [33]:
!pip install httpx langchain langchain-mistralai langchain-groq qdrant-client fastembed sparql-llm



In [34]:
import logging

# Setting up an LLM programmatically and asking the model a question directly

"Zero-shot" question answering

In [24]:
from google.colab import userdata
from langchain_core.language_models import BaseChatModel

## 1. Set up LLM provider
def load_chat_model(model: str, temperature: float = 0, max_tokens: int = 1024) -> BaseChatModel:
    """Instantiate a LangChain chat model from a ``<provider>/<model_name>`` string.

    Args:
        model: Identifier of the form ``"<provider>/<model_name>"``. Only the first
            ``/`` separates the provider, so the model name may itself contain
            slashes (e.g. ``"groq/meta-llama/llama-4-scout-17b-16e-instruct"``).
            Supported providers are ``mistralai`` and ``groq``.
        temperature: Sampling temperature passed to the chat model.
        max_tokens: Upper bound on generated tokens passed to the chat model.

    Returns:
        A ``ChatMistralAI`` or ``ChatGroq`` instance, depending on the provider.

    Raises:
        ValueError: If ``model`` is not of the form ``<provider>/<model_name>``
            or the provider is not supported.
    """
    provider, separator, model_name = model.partition("/")
    if not (separator and provider and model_name):
        # Explicit message instead of the opaque tuple-unpacking error
        raise ValueError(f"Expected model as '<provider>/<model_name>', got: {model!r}")

    # Provider packages are imported lazily so only the selected one has to be importable
    if provider == "mistralai":
        # https://python.langchain.com/docs/integrations/chat/mistralai/
        from langchain_mistralai import ChatMistralAI

        return ChatMistralAI(
            model=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
            # API key is read through google.colab's `userdata`
            api_key=userdata.get('MISTRAL_API_KEY'),
        )
    if provider == "groq":
        # https://python.langchain.com/docs/integrations/chat/groq/
        from langchain_groq import ChatGroq

        return ChatGroq(
            model=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
            api_key=userdata.get('GROQ_API_KEY'),
        )
    raise ValueError(f"Unknown provider: {provider}")

# Select the chat model as "<provider>/<model_name>"; uncomment one of the
# alternatives below (and comment out the active line) to switch models.
# llm = load_chat_model("groq/mistral-small-latest")
llm = load_chat_model("mistralai/mistral-small-latest")
# llm = load_chat_model("groq/meta-llama/llama-4-scout-17b-16e-instruct")

# zero-shot: asking an LLM a question "out-of-the-box"
def main():
    """Send a single question to the model with no extra context and print the reply."""
    prompt = "Which resources are available at the SIB?"
    answer = llm.invoke(prompt)
    # Literal backslash-n sequences in the reply are turned into real line breaks
    print(answer.text.replace("\\n", "\n"))

main()

The **Swiss Institute of Bioinformatics (SIB)** is a leading organization in bioinformatics and computational biology, offering a wide range of resources, tools, and services. Here are some key resources available at SIB:

### **1. Bioinformatics Tools & Databases**
SIB hosts and develops numerous bioinformatics tools and databases, including:
- **ExPASy** – A portal for protein sequence analysis (e.g., **Protein Identifier**, **Swiss-Prot**, **UniProt**).
- **Swiss-Prot** – A curated protein sequence database.
- **UniProt** – A comprehensive protein database (part of the UniProt Consortium).
- **SwissModel** – A protein structure modeling server.
- **Bgee** – A database for gene expression across species.
- **Ensembl** – A genome browser for vertebrate genomes (co-hosted with EMBL-EBI).
- **STRING** – A protein-protein interaction network database.
- **PhylomeDB** – A database of evolutionary histories of genes.
- **Gene Ontology (GO)** – A framework for gene function annotation.

###

# Next step: providing context

In [42]:
import httpx
# Prompt template: {context} is replaced with the description of the SIB resources
SYSTEM_PROMPT = """You are an assistant that helps users to navigate the resources and databases from the SIB Swiss Institute of Bioinformatics.
Here is the description of resources available at the SIB:
{context}
Use it to answer the question"""

question = "Which resources are available at the SIB?"
# Download the resources metadata CSV; its text is injected verbatim into the system prompt
response = httpx.get("https://github.com/sib-swiss/sparql-llm/raw/refs/heads/main/expasy_resources_metadata.csv", follow_redirects=True)
# Fail fast on HTTP errors so an error page is never handed to the model as context
response.raise_for_status()

messages = [
    ("system", SYSTEM_PROMPT.format(context=response.text)),
    ("human", question),
]

# Stream the output and print metadata (e.g. number of tokens) if available
for resp in llm.stream(messages):
    print(resp.content, end="", flush=True)
    if resp.usage_metadata:
        print(f"\n\n{resp.usage_metadata}")

The SIB Swiss Institute of Bioinformatics offers a wide range of resources across various domains. Here are some of the key resources available:

1. **UniProtKB**: A comprehensive protein sequence database that includes expert-curated records (Swiss-Prot) and computationally annotated records (TrEMBL).

2. **Selectome**: A database of positive selection based on a rigorous branch-site specific likelihood test.

3. **PROSITE**: A database of protein domains, families, and functional sites, along with associated patterns and profiles to identify them.

4. **SWISS-MODEL Repository**: A database of protein structure homology models generated by the fully automated SWISS-MODEL modeling pipeline.

5. **SwissDock**: A protein-ligand docking server based on the Attracting cavities and Autodock Vina docking engines.

6. **ENZYME**: A repository of information relative to the nomenclature of enzymes.

7. **HAMAP**: A system for the classification and annotation of protein sequences.

8. **ViralZ

# How about a *really long* context?

In [9]:
import httpx
# Same prompt template as before: {context} receives the full text of the HTTP response
SYSTEM_PROMPT = """You are an assistant that helps users to navigate the resources and databases from the SIB Swiss Institute of Bioinformatics.
Here is the description of resources available at the SIB:
{context}
Use it to answer the question"""

question = "Which resources are available?"
# NOTE(review): this URL points at the GitHub `blob` page rather than the raw file,
# so the response is presumably the rendered HTML page instead of the CSV. Confirm
# this is the intended way to produce an oversized context for this demonstration.
response = httpx.get("https://github.com/sib-swiss/sparql-llm/blob/main/expasy_resources_metadata.csv", follow_redirects=True)
messages = [
    ("system", SYSTEM_PROMPT.format(context=response.text)),
    ("human", question),
]
# Stream the answer; usage metadata is printed whenever a chunk carries it
for resp in llm.stream(messages):
    print(resp.content, end="", flush=True)
    if resp.usage_metadata:
         print(f"\n\n{resp.usage_metadata}")

HTTPStatusError: Error response 400 while fetching https://api.mistral.ai/v1/chat/completions: {"object":"error","message":"Prompt contains 146941 tokens and 0 draft tokens, too large for model with 131072 maximum context length","type":"invalid_request_invalid_args","param":null,"code":"3051"}

# When general context is too long, **embeddings** can be used to retrieve only the parts relevant to a user question, using a Vector Database, such as Qdrant.

# Embeddings are created once, at system setup, and are subsequently used to retrieve relevant context whenever needed.

In [4]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient

## 2. Set up vector database for document retrieval
# FastEmbed model used to turn documents and questions into vectors
embedding_model = TextEmbedding("BAAI/bge-small-en-v1.5")
# Vector size used when creating the collection; presumably the output size of
# the embedding model above — TODO confirm if the model is changed
embedding_dimensions = 384
# Name of the Qdrant collection holding the indexed documents
collection_name = "sparql-docs"
# Qdrant client opened on the local path "data/vectordb"
vectordb = QdrantClient(path="data/vectordb")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

In [35]:
from langchain_core.documents import Document
from qdrant_client.http.models import Distance, VectorParams
from sparql_llm import SparqlExamplesLoader, SparqlVoidShapesLoader, SparqlInfoLoader

## 2. Set up vector database for document retrieval
# SPARQL endpoints whose metadata gets indexed. Besides the required
# "endpoint_url", index_endpoints() also reads the optional keys
# "examples_file" and "void_file" from each entry when they are present.
endpoints: list[dict[str, str]] = [
    { "endpoint_url": "https://sparql.uniprot.org/sparql/" },
    { "endpoint_url": "https://www.bgee.org/sparql/" },
    { "endpoint_url": "https://sparql.omabrowser.org/sparql/" },
]

def index_endpoints():
    """Index SPARQL endpoints metadata in the vector database.

    Documents are collected from ``SparqlExamplesLoader`` and
    ``SparqlVoidShapesLoader`` for every entry of ``endpoints``, plus
    ``SparqlInfoLoader`` for the whole list. Their ``page_content`` is embedded
    and their ``metadata`` is stored as payload in ``collection_name``, which is
    dropped and recreated if it already exists.
    """
    docs: list[Document] = []
    # Fetch documents from endpoints
    for endpoint in endpoints:
        endpoint_url = endpoint["endpoint_url"]
        print(f"🔎 Retrieving metadata for {endpoint_url}")
        docs += SparqlExamplesLoader(
            endpoint_url,
            examples_file=endpoint.get("examples_file"),
        ).load()
        docs += SparqlVoidShapesLoader(
            endpoint_url,
            void_file=endpoint.get("void_file"),
            examples_file=endpoint.get("examples_file"),
        ).load()
    docs += SparqlInfoLoader(endpoints, source_iri="https://www.expasy.org/").load()

    # Compute every embedding BEFORE touching the collection: if embedding fails,
    # the previously indexed collection is still intact instead of already deleted.
    vectors = [
        embedding.tolist()
        for embedding in embedding_model.embed([doc.page_content for doc in docs])
    ]

    # Load documents in vectordb (drop and recreate the collection)
    if vectordb.collection_exists(collection_name):
        vectordb.delete_collection(collection_name)
    vectordb.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dimensions, distance=Distance.COSINE),
    )
    vectordb.upload_collection(
        collection_name=collection_name,
        vectors=vectors,
        payload=[doc.metadata for doc in docs],
    )
    print(f"✅ Indexed {len(docs)} documents in collection {collection_name}")



# Only (re)build the index when the collection is missing or reports zero points;
# otherwise reuse what is already stored and report its size.
if not vectordb.collection_exists(collection_name) or vectordb.get_collection(collection_name).points_count == 0:
    index_endpoints()
else:
    print(
        f"ℹ️  Using existing collection '{collection_name}' with {vectordb.get_collection(collection_name).points_count} vectors"
    )

ℹ️  Using existing collection 'sparql-docs' with 619 vectors


In [36]:
from qdrant_client.models import FieldCondition, Filter, MatchValue, ScoredPoint


## 3. Set up document retrieval and system prompt
retrieved_docs_count = 3
def retrieve_docs(question: str) -> list[ScoredPoint]:
    """Fetch the stored documents closest to the question, per document type.

    The question is embedded once, then the collection is searched twice: first
    restricted to query examples, then to classes schema. Each search keeps up to
    ``retrieved_docs_count`` hits, and query examples come first in the result.
    """
    question_vector = next(iter(embedding_model.embed([question])))
    hits: list[ScoredPoint] = []
    for doc_type in ("SPARQL endpoints query examples", "SPARQL endpoints classes schema"):
        # Restrict the search to documents whose payload has this exact doc_type
        type_filter = Filter(
            must=[FieldCondition(key="doc_type", match=MatchValue(value=doc_type))]
        )
        response = vectordb.query_points(
            collection_name=collection_name,
            query=question_vector,
            limit=retrieved_docs_count,
            query_filter=type_filter,
        )
        hits.extend(response.points)
    return hits

In [37]:
## 3. Set up document retrieval and system prompt
def format_doc(doc: ScoredPoint) -> str:
    """Render one retrieved document as a text snippet for the model's context.

    The snippet holds the document's question followed by its endpoint URL in
    parentheses, then the answer inside a fenced code block. When the payload's
    ``doc_type`` contains "query", the fence is tagged ``sparql`` and starts with
    a ``#+ endpoint:`` comment line; otherwise the fence has no language tag.
    """
    payload = doc.payload
    if "query" in payload.get("doc_type", ""):
        endpoint = payload.get("endpoint_url", "not provided")
        fence_header = f"sparql\n#+ endpoint: {endpoint}"
    else:
        fence_header = ""
    title = f"{payload['question']} ({payload.get('endpoint_url', '')})"
    body = payload.get("answer")
    return f"\n{title}:\n\n```{fence_header}\n{body}\n```\n\n"

In [38]:
SYSTEM_PROMPT = """You are an assistant that helps users to write SPARQL queries.
Put the SPARQL query inside a markdown codeblock with the "sparql" language tag, and always add the URL of the endpoint on which the query should be executed in a comment at the start of the query inside the codeblocks starting with "#+ endpoint: " (always only 1 endpoint).
Use the queries examples and classes shapes provided in the prompt to derive your answer, don't try to create a query from nothing and do not provide a generic query.
Try to always answer with one query, if the answer lies in different endpoints, provide a federated query.
And briefly explain the query.
Here is a list of documents (reference questions and query answers, classes schema) relevant to the user question that will help you answer the user question accurately:
{relevant_docs}
"""

In [39]:
def main():
    """Answer a sample question with the retrieved documents injected into the system prompt."""
    question = "What are the rat orthologs of human TP53?"
    context_blocks = [format_doc(point) for point in retrieve_docs(question)]
    system_message = SYSTEM_PROMPT.format(relevant_docs="\n".join(context_blocks))
    conversation = [("system", system_message), ("user", question)]
    # Print chunks as they arrive; usage metadata is shown whenever a chunk carries it
    for chunk in llm.stream(conversation):
        print(chunk.content, end="", flush=True)
        if not chunk.usage_metadata:
            continue
        print("\n")
        print(f"🎰 {chunk.usage_metadata}")
main()

To find the rat orthologs of the human TP53 gene, we can use the OMA browser SPARQL endpoint. The query will look for orthologs clusters that include both human and rat proteins, and then filter for the specific TP53 protein in humans.

```sparql
#+ endpoint: https://sparql.omabrowser.org/sparql/
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>

SELECT DISTINCT ?rat_ortholog ?rat_uniprot_id WHERE {
  ?cluster a orth:OrthologsCluster ;
           orth:hasHomologousMember ?human_node ;
           orth:hasHomologousMember ?rat_node .

  ?human_node orth:hasHomologousMember* ?human_protein .
  ?rat_node orth:hasHomologousMember* ?rat_ortholog .

  ?human_protein a orth:Protein ;
                 orth:organism/obo:RO_0002162/up:scientificName "Homo sapiens" ;
                 rdfs:label "TP53" ;
              

In [40]:
from sparql_llm.validate_sparql import extract_sparql_queries
from sparql_llm.utils import query_sparql

## 4. Execute generated SPARQL query
def execute_query(last_msg: str) -> list[dict[str, str]]:
    """Extract SPARQL query from markdown and execute it.

    Only the first extracted query that has both a query text and an endpoint
    URL is executed. Errors raised while running the query are not caught here.

    Args:
        last_msg: Text (typically the model's answer) that may contain SPARQL queries.

    Returns:
        The ``results.bindings`` list of the query response, or an empty list when
        no executable query was found or the response carries no bindings.
    """
    for extracted_query in extract_sparql_queries(last_msg):
        query = extracted_query.get("query")
        endpoint_url = extracted_query.get("endpoint_url")
        if query and endpoint_url:
            res = query_sparql(query, endpoint_url)
            return res.get("results", {}).get("bindings", [])
    # No executable query found: return an empty list (not an implicit None) so
    # callers can safely call len() on the result, as the annotation promises.
    return []

In [41]:
import json

max_try_count = 3

def main():
    """Generate a SPARQL query for a sample question, execute it, and summarize the results.

    The model is asked up to ``max_try_count`` times for a query. After each
    answer the query is executed; when it returns no results the model is asked
    to fix it. As soon as a query returns results, one extra model call is made
    to summarize them, so a success on the last attempt is still summarized.

    NOTE(review): the ``logging.info`` messages are only visible if logging is
    configured at INFO level or lower; no such configuration is visible in this cell.
    """
    question = "What are the rat orthologs of human TP53?"
    # Retrieve relevant documents and add them to conversation
    retrieved_docs = retrieve_docs(question)
    formatted_docs = "\n".join(format_doc(doc) for doc in retrieved_docs)
    messages = [
        ("system", SYSTEM_PROMPT.format(relevant_docs=formatted_docs)),
        ("user", question),
    ]

    def stream_answer() -> str:
        """Stream the next model reply to stdout, record it in ``messages``, and return its text."""
        complete_answer = ""
        for resp in llm.stream(messages):
            print(resp.content, end="", flush=True)
            complete_answer += resp.content
            if resp.usage_metadata:
                print("\n")
                logging.info(f"🎰 {resp.usage_metadata}")
        messages.append(("assistant", complete_answer))
        return complete_answer

    # Loop until query execution is successful or max tries reached
    for _attempt in range(max_try_count):
        complete_answer = stream_answer()

        # Run execution on the final answer; tolerate a missing result (None)
        query_res = execute_query(complete_answer) or []
        if query_res:
            logging.info(f"✅ Got {len(query_res)} results, summarizing them")
            messages.append(("user", f"""The query you provided returned these results, summarize them:\n\n{json.dumps(query_res, indent=2)}"""))
            # The summary is requested right away, so it is never lost when the
            # query only succeeds on the last allowed attempt
            stream_answer()
            return

        logging.warning("⚠️ No results, trying to fix")
        messages.append(("user", f"""The query you provided returned no results, please fix the query:\n\n{complete_answer}"""))

main()

To find the rat orthologs of the human TP53 gene, we can use the OMA browser endpoint to query for orthologous proteins between humans and rats. The query will look for proteins that are part of the same orthologous cluster as the human TP53 protein and are expressed in rats.

Here is the SPARQL query to achieve this:

```sparql
#+ endpoint: https://sparql.omabrowser.org/sparql/
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX lscr: <http://purl.org/lscr#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>

SELECT DISTINCT ?RAT_PROTEIN ?RAT_UNIPROT_XREF WHERE {
    {
        ?cluster a orth:OrthologsCluster .
        ?cluster orth:hasHomologousMember ?node1 .
        ?cluster orth:hasHomologousMember ?node2 .
        ?node1 orth:hasHomologousMember* ?HUMAN_PROTEIN .
        ?node2 orth:hasHomolog