Skip to content

Commit

Permalink
Handle classification failures for semantic enforcement
Browse files Browse the repository at this point in the history
  • Loading branch information
Raj725 committed Jul 10, 2024
1 parent 1e1fd30 commit 66f6b55
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from langchain_core.pydantic_v1 import BaseModel

from langchain_community.utilities.pebblo import CLASSIFICATION_UNAVAILABLE


class AuthContext(BaseModel):
"""Class for an authorization context."""
Expand Down Expand Up @@ -46,6 +48,14 @@ def __init__(self, **data: Any) -> None:
"'pebblo_semantic_topics'"
)

# Add CLASSIFICATION_UNAVAILABLE to each non-empty deny list, if not present
if self.pebblo_semantic_entities and self.pebblo_semantic_entities.deny:
if CLASSIFICATION_UNAVAILABLE not in self.pebblo_semantic_entities.deny:
self.pebblo_semantic_entities.deny.append(CLASSIFICATION_UNAVAILABLE)
if self.pebblo_semantic_topics and self.pebblo_semantic_topics.deny:
if CLASSIFICATION_UNAVAILABLE not in self.pebblo_semantic_topics.deny:
self.pebblo_semantic_topics.deny.append(CLASSIFICATION_UNAVAILABLE)


class ChainInput(BaseModel):
"""Input for PebbloRetrievalQA chain."""
Expand Down
31 changes: 21 additions & 10 deletions libs/community/langchain_community/document_loaders/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.utilities.pebblo import (
APP_DISCOVER_URL,
CLASSIFICATION_UNAVAILABLE,
CLASSIFIER_URL,
LOADER_DOC_URL,
PEBBLO_CLOUD_URL,
Expand Down Expand Up @@ -478,10 +479,15 @@ def _add_semantic_to_docs(
for doc in docs_with_id
}

for classified_doc in classified_docs:
doc_id = classified_doc.get("id")
if doc_id in indexed_docs:
self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)
if not classified_docs:
# No classification results: mark every doc as CLASSIFICATION_UNAVAILABLE
for _, doc in indexed_docs.items():
self._add_semantic_to_doc(doc, {})
else:
for classified_doc in classified_docs:
doc_id = classified_doc.get("id")
if doc_id in indexed_docs:
self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)

semantic_metadata_docs = [doc for doc in indexed_docs.values()]

Expand All @@ -506,6 +512,7 @@ def _unindex_docs(self, docs_with_id: List[IndexedDocument]) -> List[Document]:
def _add_semantic_to_doc(self, doc: Document, classified_doc: dict) -> Document:
"""
Adds semantic metadata to the given document in-place.
If classified_doc is empty, sets both semantic metadata fields to ["unavailable"].
Args:
doc (Document): A Document object.
Expand All @@ -514,12 +521,16 @@ def _add_semantic_to_doc(self, doc: Document, classified_doc: dict) -> Document:
Returns:
Document: The Document object with added semantic metadata.
"""
doc.metadata["pebblo_semantic_entities"] = list(
classified_doc.get("entities", {}).keys()
)
doc.metadata["pebblo_semantic_topics"] = list(
classified_doc.get("topics", {}).keys()
)
if classified_doc:
doc.metadata["pebblo_semantic_entities"] = list(
classified_doc.get("entities", {}).keys()
)
doc.metadata["pebblo_semantic_topics"] = list(
classified_doc.get("topics", {}).keys()
)
else:
doc.metadata["pebblo_semantic_entities"] = [CLASSIFICATION_UNAVAILABLE]
doc.metadata["pebblo_semantic_topics"] = [CLASSIFICATION_UNAVAILABLE]
return doc

def _add_pebblo_specific_metadata(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions libs/community/langchain_community/utilities/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
}

SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)
CLASSIFICATION_UNAVAILABLE = "unavailable"

logger = logging.getLogger(__name__)

Expand Down

0 comments on commit 66f6b55

Please sign in to comment.