Skip to content

Commit 9be2632

Browse files
authored
Merge 18a4977 into 9c6df79
2 parents 9c6df79 + 18a4977 commit 9be2632

File tree

3 files changed

+38
-12
lines changed

3 files changed

+38
-12
lines changed

haystack/components/fetchers/link_content.py

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222

2323
logger = logging.getLogger(__name__)
2424

25-
2625
DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}"
2726

2827
REQUEST_HEADERS = {
@@ -33,6 +32,25 @@
3332
}
3433

3534

35+
def _merge_headers(*args: dict[str, str]) -> dict[str, str]:
36+
"""
37+
Merge a list of dict using case-insensitively
38+
39+
:param args: a list of dict to merge
40+
:returns: The merged dict
41+
"""
42+
merged = {}
43+
keymap = {}
44+
45+
for d in args:
46+
for k, v in d.items():
47+
kl = k.lower()
48+
keymap[kl] = k
49+
merged[kl] = v
50+
51+
return {keymap[kl]: v for kl, v in merged.items()}
52+
53+
3654
def _text_content_handler(response: httpx.Response) -> ByteStream:
3755
"""
3856
Handles text content.
@@ -169,17 +187,24 @@ def __init__( # pylint: disable=too-many-positional-arguments
169187
after=self._switch_user_agent,
170188
)
171189
def get_response(url):
172-
# Build headers with precedence:
173-
# client defaults -> component defaults -> user-provided -> rotating UA
174-
base = dict(self._client.headers)
175-
headers = {**base, **REQUEST_HEADERS, **self.request_headers}
176-
headers["User-Agent"] = self.user_agents[self.current_user_agent_idx] # rotation wins
177-
response = self._client.get(url, headers=headers)
190+
response = self._client.get(url, headers=self._get_headers())
178191
response.raise_for_status()
179192
return response
180193

181194
self._get_response: Callable = get_response
182195

196+
def _get_headers(self):
    """
    Assemble the headers for an outgoing request.

    The sources are combined with `_merge_headers`, later sources taking precedence:

    client defaults -> component defaults -> user-provided -> rotating UA
    """
    # NOTE(review): the defaults are always read from `self._client`; confirm this is also the
    # intended source when the headers are built for a request sent through a different client.
    client_defaults = dict(self._client.headers)
    return _merge_headers(
        client_defaults,
        REQUEST_HEADERS,
        self.request_headers,
        {"User-Agent": self.user_agents[self.current_user_agent_idx]},
    )
207+
183208
def __del__(self):
184209
"""
185210
Clean up resources when the component is deleted.
@@ -378,10 +403,7 @@ async def _get_response_async(self, url: str, client: httpx.AsyncClient) -> http
378403

379404
while attempt <= self.retry_attempts:
380405
try:
381-
base = dict(client.headers)
382-
headers = {**base, **REQUEST_HEADERS, **self.request_headers}
383-
headers["User-Agent"] = self.user_agents[self.current_user_agent_idx]
384-
response = await client.get(url, headers=headers)
406+
response = await client.get(url, headers=self._get_headers())
385407
response.raise_for_status()
386408
return response
387409
except (httpx.HTTPStatusError, httpx.RequestError) as e:

haystack/tracing/datadog.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def set_tag(self, key: str, value: Any) -> None:
3131
:param value: the value of the tag.
3232
"""
3333
coerced_value = tracing_utils.coerce_tag_value(value)
34-
self._span.set_tag(key, coerced_value)
34+
self._span.set_tag(key, str(coerced_value))
3535

3636
def raw_span(self) -> Any:
3737
"""
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
Ensure that request header keys are unique (compared case-insensitively) in link_content

0 commit comments

Comments
 (0)