|
22 | 22 |
|
23 | 23 | logger = logging.getLogger(__name__) |
24 | 24 |
|
25 | | - |
26 | 25 | DEFAULT_USER_AGENT = f"haystack/LinkContentFetcher/{__version__}" |
27 | 26 |
|
28 | 27 | REQUEST_HEADERS = { |
|
33 | 32 | } |
34 | 33 |
|
35 | 34 |
|
| 35 | +def _merge_headers(*args: dict[str, str]) -> dict[str, str]: |
| 36 | + """ |
| 37 | + Merge a list of dict using case-insensitively |
| 38 | +
|
| 39 | + :param args: a list of dict to merge |
| 40 | + :returns: The merged dict |
| 41 | + """ |
| 42 | + merged = {} |
| 43 | + keymap = {} |
| 44 | + |
| 45 | + for d in args: |
| 46 | + for k, v in d.items(): |
| 47 | + kl = k.lower() |
| 48 | + keymap[kl] = k |
| 49 | + merged[kl] = v |
| 50 | + |
| 51 | + return {keymap[kl]: v for kl, v in merged.items()} |
| 52 | + |
| 53 | + |
36 | 54 | def _text_content_handler(response: httpx.Response) -> ByteStream: |
37 | 55 | """ |
38 | 56 | Handles text content. |
@@ -169,17 +187,24 @@ def __init__( # pylint: disable=too-many-positional-arguments |
169 | 187 | after=self._switch_user_agent, |
170 | 188 | ) |
171 | 189 | def get_response(url): |
172 | | - # Build headers with precedence: |
173 | | - # client defaults -> component defaults -> user-provided -> rotating UA |
174 | | - base = dict(self._client.headers) |
175 | | - headers = {**base, **REQUEST_HEADERS, **self.request_headers} |
176 | | - headers["User-Agent"] = self.user_agents[self.current_user_agent_idx] # rotation wins |
177 | | - response = self._client.get(url, headers=headers) |
| 190 | + response = self._client.get(url, headers=self._get_headers()) |
178 | 191 | response.raise_for_status() |
179 | 192 | return response |
180 | 193 |
|
181 | 194 | self._get_response: Callable = get_response |
182 | 195 |
|
def _get_headers(self, client: "httpx.Client | httpx.AsyncClient | None" = None) -> dict[str, str]:
    """
    Build the headers for a single request.

    The layers are merged case-insensitively, each one overriding the previous:

    client defaults -> component defaults -> user-provided -> rotating UA

    :param client: the HTTP client whose default headers form the lowest-precedence layer. When omitted,
        the component's own ``self._client`` is used. A caller that sends the request through a
        different client (for example an async one) should pass that client here so its defaults are
        the ones that get merged.
    :returns: the merged headers to send with the request
    """
    source = self._client if client is None else client
    # Passed last so the currently selected user agent overrides any other User-Agent value.
    rotating_user_agent = {"User-Agent": self.user_agents[self.current_user_agent_idx]}
    return _merge_headers(dict(source.headers), REQUEST_HEADERS, self.request_headers, rotating_user_agent)
| 207 | + |
183 | 208 | def __del__(self): |
184 | 209 | """ |
185 | 210 | Clean up resources when the component is deleted. |
@@ -378,10 +403,7 @@ async def _get_response_async(self, url: str, client: httpx.AsyncClient) -> http |
378 | 403 |
|
379 | 404 | while attempt <= self.retry_attempts: |
380 | 405 | try: |
381 | | - base = dict(client.headers) |
382 | | - headers = {**base, **REQUEST_HEADERS, **self.request_headers} |
383 | | - headers["User-Agent"] = self.user_agents[self.current_user_agent_idx] |
384 | | - response = await client.get(url, headers=headers) |
| 406 | + response = await client.get(url, headers=self._get_headers()) |
385 | 407 | response.raise_for_status() |
386 | 408 | return response |
387 | 409 | except (httpx.HTTPStatusError, httpx.RequestError) as e: |
|
0 commit comments