Skip to content

Commit

Permalink
[pipeline] Allow for batch indexing when using Pipelines fix #1168 (#…
Browse files Browse the repository at this point in the history
…1231)

* [pipeline] Allow for batch indexing when using Pipelines fix #1168

* [pipeline] Test case fixed fix #1168

* [file_converter] Path.suffix updated #1168

* [file_converter] meta can be one of these three cases (#1168):
                 - a single dict that is applied to all files,
                 - a list with one dict for each file being converted, or
                 - None.

* [file_converter] mypy error fixed.

* [file_converter] mypy error fixed.

* [rest_api] batch file upload introduced in indexing API.

* [test_case] Test_api file upload parameter name updated.

* [ui] Streamlit file upload parameter updated.
  • Loading branch information
akkefa committed Jun 30, 2021
1 parent 5e23e72 commit 29e1401
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 31 deletions.
55 changes: 42 additions & 13 deletions haystack/file_converter/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import abstractmethod
from pathlib import Path
from typing import List, Optional, Dict, Any
from typing import List, Optional, Dict, Any, Union

import langdetect

Expand Down Expand Up @@ -82,16 +82,27 @@ def validate_language(self, text: str) -> bool:
else:
return False

def run(self, file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, # type: ignore
valid_languages: Optional[List[str]] = None, **kwargs): # type: ignore
document = self.convert(
file_path=file_path,
meta=meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
)
def run(self, file_paths: Union[Path, List[Path]],  # type: ignore
        meta: Optional[Union[Dict[str, str], List[Dict[str, str]]]] = None,  # type: ignore
        remove_numeric_tables: Optional[bool] = None,  # type: ignore
        valid_languages: Optional[List[str]] = None, **kwargs):  # type: ignore
    """
    Convert one or several files and pass the results on as pipeline output.

    :param file_paths: A single path or a list of paths of the files to convert.
    :param meta: Either None, a single dict that is applied to every file, or a list
                 holding one dict per file (same order and length as `file_paths`).
    :param remove_numeric_tables: Forwarded unchanged to `convert()`.
    :param valid_languages: Forwarded unchanged to `convert()`.
    :param kwargs: Additional arguments; copied untouched into the output dict.
    :return: Tuple of the output dict (key "documents" plus `kwargs`) and the
             outgoing edge name "output_1".
    :raises ValueError: If `meta` is a list whose length differs from the number of files.
    """
    if isinstance(file_paths, Path):
        file_paths = [file_paths]

    if meta is None or isinstance(meta, dict):
        # Broadcast the single meta value (or None) to every file.
        meta = [meta] * len(file_paths)  # type: ignore
    elif len(meta) != len(file_paths):
        # zip() below would otherwise silently drop the surplus files or metas.
        raise ValueError(
            f"Number of meta dicts ({len(meta)}) must match the number of "
            f"file paths ({len(file_paths)})."
        )

    documents: list = [
        self.convert(
            file_path=file_path,
            meta=file_meta,
            remove_numeric_tables=remove_numeric_tables,
            valid_languages=valid_languages,
        )
        for file_path, file_meta in zip(file_paths, meta)
    ]

    result = {"documents": documents, **kwargs}
    return result, "output_1"


Expand All @@ -101,9 +112,27 @@ class FileTypeClassifier(BaseComponent):
"""
outgoing_edges = 5

def run(self, file_path: Path, **kwargs): # type: ignore
output = {"file_path": file_path, **kwargs}
ext = file_path.name.split(".")[-1].lower()
def _get_files_extension(self, file_paths: list) -> set:
"""
Return the file extensions
:param file_paths:
:return: set
"""
return {file_path.suffix.lstrip('.') for file_path in file_paths}

def run(self, file_paths: Union[Path, List[Path]], **kwargs): # type: ignore
"""
Return the output based on file extension
"""
if isinstance(file_paths, Path):
file_paths = [file_paths]

extension: set = self._get_files_extension(file_paths)
if len(extension) > 1:
raise ValueError(f"Multiple files types are not allowed at once.")

output = {"file_paths": file_paths, **kwargs}
ext: str = extension.pop()
try:
index = ["txt", "pdf", "md", "docx", "html"].index(ext) + 1
return output, f"output_{index}"
Expand Down
4 changes: 2 additions & 2 deletions haystack/preprocessor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def split(

def run( # type: ignore
self,
document: dict,
documents: Union[dict, List[dict]],
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
Expand All @@ -50,7 +50,7 @@ def run( # type: ignore
**kwargs,
):
documents = self.process(
documents=document,
documents=documents,
clean_whitespace=clean_whitespace,
clean_header_footer=clean_header_footer,
clean_empty_lines=clean_empty_lines,
Expand Down
33 changes: 20 additions & 13 deletions rest_api/controller/file_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

@router.post("/file-upload")
def file_upload(
file: UploadFile = File(...),
files: List[UploadFile] = File(...),
meta: Optional[str] = Form("null"), # JSON serialized string
remove_numeric_tables: Optional[bool] = Form(None),
remove_whitespace: Optional[bool] = Form(None),
Expand All @@ -40,16 +40,25 @@ def file_upload(
):
if not INDEXING_PIPELINE:
raise HTTPException(status_code=501, detail="Indexing Pipeline is not configured.")
try:
file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
with file_path.open("wb") as buffer:
shutil.copyfileobj(file.file, buffer)

meta = json.loads(meta) or {}
meta["name"] = file.filename
file_paths: list = []
file_metas: list = []
meta = json.loads(meta) or {}

INDEXING_PIPELINE.run(
file_path=file_path,
for file in files:
    try:
        # Store each upload under a unique name so identically named files do not collide.
        # NOTE(review): file.filename is client-supplied; consider reducing it to a
        # basename before using it in a path - confirm against deployment requirements.
        file_path = Path(FILE_UPLOAD_PATH) / f"{uuid.uuid4().hex}_{file.filename}"
        with file_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        file_paths.append(file_path)
        # Build a fresh dict per file. Appending the shared `meta` object would make
        # every entry in file_metas alias one dict carrying the last file's name.
        file_metas.append({**meta, "name": file.filename})
    finally:
        # Always release the upload's underlying file handle, even if the copy failed.
        file.file.close()

INDEXING_PIPELINE.run(
file_paths=file_paths,
remove_numeric_tables=remove_numeric_tables,
remove_whitespace=remove_whitespace,
remove_empty_lines=remove_empty_lines,
Expand All @@ -59,7 +68,5 @@ def file_upload(
split_length=split_length,
split_overlap=split_overlap,
split_respect_sentence_boundary=split_respect_sentence_boundary,
meta=meta,
)
finally:
file.file.close()
meta=file_metas,
)
2 changes: 1 addition & 1 deletion test/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_load_and_save_yaml(document_store_with_docs, tmp_path):
Path("samples/pipeline/test_pipeline.yaml"), pipeline_name="indexing_pipeline"
)
pipeline.run(
file_path=Path("samples/pdf/sample_pdf_1.pdf"),
file_paths=Path("samples/pdf/sample_pdf_1.pdf"),
top_k_retriever=10,
top_k_reader=3,
)
Expand Down
2 changes: 1 addition & 1 deletion test/test_rest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_api(reader, document_store):
client = get_test_client_and_override_dependencies()

# test file upload API
file_to_upload = {'file': Path("samples/pdf/sample_pdf_1.pdf").open('rb')}
file_to_upload = {'files': Path("samples/pdf/sample_pdf_1.pdf").open('rb')}
response = client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
assert 200 == response.status_code

Expand Down
2 changes: 1 addition & 1 deletion ui/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,6 @@ def feedback_doc(question, is_correct_answer, document_id, model_id, is_correct_

def upload_doc(file):
    """
    Upload a file to the REST API's document upload endpoint.

    :param file: The file payload handed to `requests.post` as a multipart part.
    :return: The JSON-decoded body of the API response.
    """
    url = f"{API_ENDPOINT}/{DOC_UPLOAD}"
    # The multipart field is named "files"; the earlier "file" assignment was
    # overwritten immediately and has been dropped as dead code.
    files = [("files", file)]
    response_raw = requests.post(url, files=files).json()
    return response_raw

0 comments on commit 29e1401

Please sign in to comment.