Skip to content

Commit

Permalink
feat: Set ByteStream's mime_type attribute for web based resources (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
vblagoje committed May 13, 2024
1 parent 1d20ac3 commit 811b93d
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 6 deletions.
1 change: 1 addition & 0 deletions haystack/components/fetchers/link_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def run(self, urls: List[str]):
for stream_metadata, stream in results: # type: ignore
if stream_metadata is not None and stream is not None:
stream.meta.update(stream_metadata)
stream.mime_type = stream.meta.get("content_type", None)
streams.append(stream)

return {"streams": streams}
Expand Down
2 changes: 1 addition & 1 deletion haystack/components/routers/file_type_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni
if isinstance(source, Path):
mime_type = self._get_mime_type(source)
elif isinstance(source, ByteStream):
mime_type = source.meta.get("content_type", None)
mime_type = source.mime_type
else:
raise ValueError(f"Unsupported data source type: {type(source).__name__}")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
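Improved MIME type management by setting the `mime_type` attribute directly on ByteStreams. The link content fetcher now copies the `content_type` metadata of each fetched resource into the ByteStream's `mime_type`, and the file type router reads `ByteStream.mime_type` instead of `meta["content_type"]` when routing. This makes MIME type data more consistently accessible and simplifies handling and routing of different file types and document formats.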
13 changes: 8 additions & 5 deletions test/components/routers/test_file_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def test_run_with_bytestreams(self, test_files_path):
byte_streams = []
for path, mime_type in zip(file_paths, mime_types):
stream = ByteStream(path.read_bytes())
stream.meta["content_type"] = mime_type
stream.mime_type = mime_type
byte_streams.append(stream)

# add unclassified ByteStream
Expand Down Expand Up @@ -81,7 +81,7 @@ def test_run_with_bytestreams_and_file_paths(self, test_files_path):
byte_stream_sources = []
for path, mime_type in zip(file_paths, mime_types):
stream = ByteStream(path.read_bytes())
stream.meta["content_type"] = mime_type
stream.mime_type = mime_type
byte_stream_sources.append(stream)

mixed_sources = file_paths[:2] + byte_stream_sources[2:]
Expand Down Expand Up @@ -165,9 +165,12 @@ def test_exact_mime_type_matching(self, mock_file):
"""
Test if the component correctly matches mime types exactly, without regex patterns.
"""
txt_stream = ByteStream(io.BytesIO(b"Text file content"), meta={"content_type": "text/plain"})
jpg_stream = ByteStream(io.BytesIO(b"JPEG file content"), meta={"content_type": "image/jpeg"})
mp3_stream = ByteStream(io.BytesIO(b"MP3 file content"), meta={"content_type": "audio/mpeg"})
txt_stream = ByteStream(io.BytesIO(b"Text file content").read())
txt_stream.mime_type = "text/plain"
jpg_stream = ByteStream(io.BytesIO(b"JPEG file content").read())
jpg_stream.mime_type = "image/jpeg"
mp3_stream = ByteStream(io.BytesIO(b"MP3 file content").read())
mp3_stream.mime_type = "audio/mpeg"

byte_streams = [txt_stream, jpg_stream, mp3_stream]

Expand Down

0 comments on commit 811b93d

Please sign in to comment.