Skip to content

Commit

Permalink
Get last update from last modified header
Browse files Browse the repository at this point in the history
  • Loading branch information
moshe committed Jun 23, 2024
1 parent 6c71bc0 commit bdfa290
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions backend/danswer/connectors/web/connector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io
import ipaddress
import socket
from datetime import datetime
from enum import Enum
from typing import Any
from typing import cast
Expand Down Expand Up @@ -175,6 +176,13 @@ def _read_urls_file(location: str) -> list[str]:
return urls


def _get_datetime_from_last_modified_header(lase_modified: str) -> datetime | None:
try:
return datetime.strptime(lase_modified, "%a, %d %b %Y %H:%M:%S %Z")
except (ValueError, TypeError):
return None


class WebConnector(LoadConnector):
def __init__(
self,
Expand Down Expand Up @@ -258,6 +266,7 @@ def load_from_state(self) -> GenerateDocumentsOutput:
# PDF files are not checked for links
response = requests.get(current_url)
page_text = pdf_to_text(file=io.BytesIO(response.content))
last_modified = response.headers.get("Last-Modified")

doc_batch.append(
Document(
Expand All @@ -266,12 +275,14 @@ def load_from_state(self) -> GenerateDocumentsOutput:
source=DocumentSource.WEB,
semantic_identifier=current_url.split(".")[-1],
metadata={},
doc_updated_at=_get_datetime_from_last_modified_header(last_modified),
)
)
continue

page = context.new_page()
page_response = page.goto(current_url)
last_modified = page_response.header_value("Last-Modified")
final_page = page.url
if final_page != current_url:
logger.info(f"Redirected to {final_page}")
Expand Down Expand Up @@ -307,6 +318,7 @@ def load_from_state(self) -> GenerateDocumentsOutput:
source=DocumentSource.WEB,
semantic_identifier=parsed_html.title or current_url,
metadata={},
doc_updated_at=_get_datetime_from_last_modified_header(last_modified),
)
)

Expand Down

0 comments on commit bdfa290

Please sign in to comment.