fix: allow passing headers to package finder
Signed-off-by: Frost Ming <me@frostming.com>
frostming committed May 27, 2024
1 parent bdfcf31 commit 2e29f86
Showing 2 changed files with 30 additions and 17 deletions.
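
In practice, the change lets a caller attach custom HTTP headers, such as credentials for a private index, to every request the finder makes while collecting links. A minimal usage sketch, assuming the PackageFinder constructor shown in unearth's README; the index URL and the Basic credentials are placeholders, not part of this commit:

from unearth import PackageFinder

finder = PackageFinder(index_urls=["https://private.example.org/simple/"])
# New in this commit: headers set here are passed down to the link
# collector and merged into every index request.
finder.headers = {"Authorization": "Basic <base64-credentials>"}
result = finder.find_best_match("flask>=2")
print(result.best)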
37 changes: 22 additions & 15 deletions src/unearth/collector.py
@@ -2,14 +2,13 @@

 from __future__ import annotations

-import functools
 import ipaddress
 import json
 import logging
 import mimetypes
 from datetime import datetime
 from html.parser import HTMLParser
-from typing import Iterable, NamedTuple
+from typing import Iterable, Mapping, NamedTuple
 from urllib import parse

 from unearth.fetchers import Fetcher, Response
@@ -149,7 +148,10 @@ def parse_json_response(page: IndexPage) -> Iterable[Link]:


 def collect_links_from_location(
-    session: Fetcher, location: Link, expand: bool = False
+    session: Fetcher,
+    location: Link,
+    expand: bool = False,
+    headers: Mapping[str, str] | None = None,
 ) -> Iterable[Link]:
     """Collect package links from a remote URL or local path.
@@ -165,24 +167,27 @@ def collect_links_from_location(
                 for child in path.iterdir():
                     file_url = path_to_url(str(child))
                     if _is_html_file(file_url):
-                        yield from _collect_links_from_index(session, Link(file_url))
+                        yield from _collect_links_from_index(
+                            session, Link(file_url), headers
+                        )
                     else:
                         yield Link(file_url)
             else:
                 index_html = Link(path_to_url(path.joinpath("index.html").as_posix()))
-                yield from _collect_links_from_index(session, index_html)
+                yield from _collect_links_from_index(session, index_html, headers)
         else:
-            yield from _collect_links_from_index(session, location)
+            yield from _collect_links_from_index(session, location, headers)

     else:
         yield from _collect_links_from_index(session, location)


-@functools.lru_cache(maxsize=None)
-def fetch_page(session: Fetcher, location: Link) -> IndexPage:
+def fetch_page(
+    session: Fetcher, location: Link, headers: Mapping[str, str] | None = None
+) -> IndexPage:
     if location.is_vcs:
         raise LinkCollectError("It is a VCS link.")
-    resp = _get_html_response(session, location)
+    resp = _get_html_response(session, location, headers)
     from_cache = getattr(resp, "from_cache", False)
     cache_text = " (from cache)" if from_cache else ""
     logger.debug("Fetching HTML page %s%s", location.redacted, cache_text)
@@ -191,11 +196,13 @@ def fetch_page(session: Fetcher, location: Link) -> IndexPage:
     )


-def _collect_links_from_index(session: Fetcher, location: Link) -> Iterable[Link]:
+def _collect_links_from_index(
+    session: Fetcher, location: Link, headers: Mapping[str, str] | None = None
+) -> Iterable[Link]:
     if not is_secure_origin(session, location):
         return []
     try:
-        page = fetch_page(session, location)
+        page = fetch_page(session, location, headers)
     except LinkCollectError as e:
         logger.warning("Failed to collect links from %s: %s", location.redacted, e)
         return []
@@ -211,7 +218,9 @@ def _is_html_file(file_url: str) -> bool:
     return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"


-def _get_html_response(session: Fetcher, location: Link) -> Response:
+def _get_html_response(
+    session: Fetcher, location: Link, headers: Mapping[str, str] | None = None
+) -> Response:
     if is_archive_file(location.filename):
         # If the URL looks like a file, send a HEAD request to ensure
         # the link is an HTML page to avoid downloading a large file.
@@ -227,9 +236,7 @@ def _get_html_response(session: Fetcher, location: Link) -> Response:
                 "text/html; q=0.01",
             ]
         ),
-        # Don't cache the /simple/{package} page, to ensure it gets updated
-        # immediately when a new release is uploaded.
-        "Cache-Control": "max-age=0",
+        **(headers or {}),
     },
 )
 _check_for_status(resp)
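Two side effects of this diff are worth noting. First, the hard-coded "Cache-Control: max-age=0" entry is gone from _get_html_response; since **(headers or {}) is unpacked after the default "Accept" entry, caller-supplied headers now take precedence, and a caller who still wants the old cache-busting behavior must pass it explicitly. Second, the @functools.lru_cache decorator is dropped from fetch_page, presumably because an unhashable headers mapping could not participate in the cache key. A sketch of restoring the previous caching behavior through the new hook, reusing the hypothetical finder object from the example above:

# Re-send the cache-busting header that this commit removed from the defaults.
finder.headers["Cache-Control"] = "max-age=0"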
10 changes: 8 additions & 2 deletions src/unearth/finder.py
@@ -131,6 +131,7 @@ def __init__(
         self.respect_source_order = respect_source_order
         self.verbosity = verbosity
         self.exclude_newer_than = exclude_newer_than
+        self.headers: dict[str, str] = {}

         self._tag_priorities = {
             tag: i for i, tag in enumerate(self.target_python.supported_tags())
@@ -270,12 +271,17 @@ def find_one_source(source: Source) -> Iterable[Package]:
             if source["type"] == "index":
                 link = self._build_index_page_link(source["url"], package_name)
                 result = self._evaluate_links(
-                    collect_links_from_location(self.session, link), evaluator
+                    collect_links_from_location(
+                        self.session, link, headers=self.headers
+                    ),
+                    evaluator,
                 )
             else:
                 link = self._build_find_link(source["url"])
                 result = self._evaluate_links(
-                    collect_links_from_location(self.session, link, expand=True),
+                    collect_links_from_location(
+                        self.session, link, expand=True, headers=self.headers
+                    ),
                     evaluator,
                 )
             if self.respect_source_order:
