From 0363ef12d54cb16a18c928d837337a6b41108d7f Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 2 Mar 2023 13:46:10 +0100 Subject: [PATCH 01/32] Starting adding support for TableCell --- haystack/nodes/reader/table.py | 27 +++++++-------------- haystack/schema.py | 43 +++++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index 1d445189e7..962d54f2c6 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -23,7 +23,7 @@ from transformers.models.tapas.modeling_tapas import TapasPreTrainedModel from haystack.errors import HaystackError -from haystack.schema import Document, Answer, Span +from haystack.schema import Document, Answer, TableCell from haystack.nodes.reader.base import BaseReader from haystack.modeling.utils import initialize_device_settings @@ -289,7 +289,7 @@ def _predict_tapas(self, inputs: BatchEncoding, document: Document) -> Answer: else: answer_str = self._aggregate_answers(current_aggregation_operator, current_answer_cells) - answer_offsets = _calculate_answer_offsets(current_answer_coordinates, string_table) + answer_offsets = _calculate_answer_offsets(current_answer_coordinates) answer = Answer( answer=answer_str, @@ -501,7 +501,7 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tu for answer_span_idx in top_k_answer_spans.indices: current_answer_span = possible_answer_spans[answer_span_idx] answer_str = string_table.iat[current_answer_span[:2]] - answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], string_table) + answer_offsets = _calculate_answer_offsets([current_answer_span[:2]]) # As the general table score is more important for the final score, it is double weighted. current_score = ((2 * table_relevancy_prob) + span_logits_softmax[0, answer_span_idx].item()) / 3 @@ -544,8 +544,9 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict: type="extractive", score=no_answer_score, context=None, - offsets_in_context=[Span(start=0, end=0)], - offsets_in_document=[Span(start=0, end=0)], + # TODO Confirm that 0, 0 is a valid way to deduce no answer or if -1 will need to be used + offsets_in_context=[TableCell(row=0, col=0)], + offsets_in_document=[TableCell(row=0, col=0)], document_ids=None, meta=None, ) @@ -753,7 +754,7 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = cell_scores_table[-1].append(current_cell_score) answer_str = string_table.iloc[row_idx, col_idx] - answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, string_table) + answer_offsets = TableCell(row=row_idx, col=col_idx) current_answers.append( Answer( answer=answer_str, @@ -796,13 +797,6 @@ def _create_row_column_representations(table: pd.DataFrame) -> Tuple[List[str], return row_reps, column_reps - @staticmethod - def _calculate_answer_offsets(row_idx, column_index, table) -> Span: - _, n_columns = table.shape - answer_cell_offset = (row_idx * n_columns) + column_index - - return Span(start=answer_cell_offset, end=answer_cell_offset + 1) - def predict_batch( self, queries: List[str], @@ -834,18 +828,15 @@ def predict_batch( return results -def _calculate_answer_offsets(answer_coordinates: List[Tuple[int, int]], table: pd.DataFrame) -> List[Span]: +def _calculate_answer_offsets(answer_coordinates: List[Tuple[int, int]]) -> List[TableCell]: """ Calculates the answer cell offsets of the linearized table based on the answer cell coordinates. 
:param answer_coordinates: List of answer coordinates. - :param table: Table containing the answers in answer coordinates. """ answer_offsets = [] - _, n_columns = table.shape for coord in answer_coordinates: - answer_cell_offset = (coord[0] * n_columns) + coord[1] - answer_offsets.append(Span(start=answer_cell_offset, end=answer_cell_offset + 1)) + answer_offsets.append(TableCell(row=coord[0], col=coord[1])) return answer_offsets diff --git a/haystack/schema.py b/haystack/schema.py index 0bda792214..f8e1493e0f 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -329,10 +329,9 @@ class Span: """ Defining a sequence of characters (Text span) or cells (Table span) via start and end index. For extractive QA: Character where answer starts/ends - For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table) :param start: Position where the span starts - :param end: Position where the spand ends + :param end: Position where the span ends """ def __contains__(self, value): @@ -377,14 +376,26 @@ def __contains__(self, value): ) from e +@dataclass +class TableCell: + row: int + col: int + """ + Defining a table cell via the row and column index. + + :param row: Row index of the cell + :param col: Column index of the cell + """ + + @dataclass class Answer: answer: str type: Literal["generative", "extractive", "other"] = "extractive" score: Optional[float] = None context: Optional[Union[str, pd.DataFrame]] = None - offsets_in_document: Optional[List[Span]] = None - offsets_in_context: Optional[List[Span]] = None + offsets_in_document: Optional[List[Span], List[TableCell]] = None + offsets_in_context: Optional[List[Span], List[TableCell]] = None document_ids: Optional[List[str]] = None meta: Optional[Dict[str, Any]] = None @@ -420,9 +431,28 @@ def __post_init__(self): # In case offsets are passed as dicts rather than Span objects we convert them here # For example, this is used when instantiating an object via from_json() if self.offsets_in_document is not None: - self.offsets_in_document = [Span(**e) if isinstance(e, dict) else e for e in self.offsets_in_document] + offsets_in_document = [] + for e in self.offsets_in_document: + if isinstance(e, dict): + if "row" in e: # is a TableCell + offsets_in_document.append(TableCell(**e)) + else: + offsets_in_document.append(Span(**e)) + else: + offsets_in_document.append(e) + self.offsets_in_document = offsets_in_document + if self.offsets_in_context is not None: - self.offsets_in_context = [Span(**e) if isinstance(e, dict) else e for e in self.offsets_in_context] + offsets_in_context = [] + for e in self.offsets_in_context: + if isinstance(e, dict): + if "row" in e: # is a TableCell + offsets_in_context.append(TableCell(**e)) + else: + offsets_in_context.append(Span(**e)) + else: + offsets_in_context.append(e) + self.offsets_in_context = offsets_in_context if self.meta is None: self.meta = {} @@ -721,6 +751,7 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ self._offsets_in_documents = [] self._offsets_in_contexts = [] for answer in answered: + # TODO This assumes Span objects when aggregating offsets if answer.offsets_in_document is not None: for span in answer.offsets_in_document: self._offsets_in_documents.append({"start": span.start, "end": span.end}) From 97fb284c31716a953aaaa129c4d9b5f76338b3af Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 2 Mar 2023 13:54:28 +0100 Subject: [PATCH 02/32] Fixed typing error by adding correct import and started to update tests for 
table reader --- haystack/__init__.py | 2 +- haystack/schema.py | 4 ++-- test/nodes/test_table_reader.py | 42 ++++++++++++++++----------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/haystack/__init__.py b/haystack/__init__.py index c74c995fc3..b535027530 100644 --- a/haystack/__init__.py +++ b/haystack/__init__.py @@ -17,7 +17,7 @@ import pandas as pd -from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult +from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult, TableCell from haystack.nodes.base import BaseComponent from haystack.pipelines.base import Pipeline from haystack.environment import set_pytorch_secure_model_loading diff --git a/haystack/schema.py b/haystack/schema.py index f8e1493e0f..28ab5db799 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -394,8 +394,8 @@ class Answer: type: Literal["generative", "extractive", "other"] = "extractive" score: Optional[float] = None context: Optional[Union[str, pd.DataFrame]] = None - offsets_in_document: Optional[List[Span], List[TableCell]] = None - offsets_in_context: Optional[List[Span], List[TableCell]] = None + offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None + offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None document_ids: Optional[List[str]] = None meta: Optional[Dict[str, Any]] = None diff --git a/test/nodes/test_table_reader.py b/test/nodes/test_table_reader.py index b078508cbb..ba899e3b8c 100644 --- a/test/nodes/test_table_reader.py +++ b/test/nodes/test_table_reader.py @@ -59,19 +59,19 @@ def test_table_reader(table_reader_and_param, table1, table2): reference1 = {"tapas_small": {"score": 1.0}, "rci": {"score": -6.5301}, "tapas_scored": {"score": 0.50568}} assert prediction["answers"][0].score == pytest.approx(reference1[param]["score"], rel=1e-3) assert prediction["answers"][0].answer == "11 november 1974" - assert prediction["answers"][0].offsets_in_context[0].start == 7 - assert prediction["answers"][0].offsets_in_context[0].end == 8 + assert prediction["answers"][0].offsets_in_context[0].row == 1 + assert prediction["answers"][0].offsets_in_context[0].col == 3 # Check the second answer in the list reference2 = { - "tapas_small": {"answer": "5 april 1980", "start": 7, "end": 8, "score": 0.86314}, - "rci": {"answer": "47", "start": 5, "end": 6, "score": -6.836}, - "tapas_scored": {"answer": "brad pitt", "start": 0, "end": 1, "score": 0.49078}, + "tapas_small": {"answer": "5 april 1980", "row": 7, "col": 8, "score": 0.86314}, + "rci": {"answer": "47", "row": 5, "col": 6, "score": -6.836}, + "tapas_scored": {"answer": "brad pitt", "row": 0, "col": 1, "score": 0.49078}, } assert prediction["answers"][1].score == pytest.approx(reference2[param]["score"], rel=1e-3) assert prediction["answers"][1].answer == reference2[param]["answer"] - assert prediction["answers"][1].offsets_in_context[0].start == reference2[param]["start"] - assert prediction["answers"][1].offsets_in_context[0].end == reference2[param]["end"] + assert prediction["answers"][1].offsets_in_context[0].row == reference2[param]["row"] + assert prediction["answers"][1].offsets_in_context[0].col == reference2[param]["col"] @pytest.mark.parametrize("table_reader_and_param", ["tapas_small", "rci", "tapas_scored"], indirect=True) @@ -97,14 +97,14 @@ def test_table_reader_train_mode(table_reader_and_param, table1, table2): # Check the second answer in the list reference2 = { - "tapas_small": {"answer": "5 april 1980", "start": 7, 
"end": 8, "score": 0.86314}, - "rci": {"answer": "47", "start": 5, "end": 6, "score": -6.836}, - "tapas_scored": {"answer": "brad pitt", "start": 0, "end": 1, "score": 0.49078}, + "tapas_small": {"answer": "5 april 1980", "row": 7, "col": 8, "score": 0.86314}, + "rci": {"answer": "47", "row": 5, "col": 6, "score": -6.836}, + "tapas_scored": {"answer": "brad pitt", "row": 0, "col": 1, "score": 0.49078}, } assert prediction["answers"][1].score == pytest.approx(reference2[param]["score"], rel=1e-3) assert prediction["answers"][1].answer == reference2[param]["answer"] - assert prediction["answers"][1].offsets_in_context[0].start == reference2[param]["start"] - assert prediction["answers"][1].offsets_in_context[0].end == reference2[param]["end"] + assert prediction["answers"][1].offsets_in_context[0].row == reference2[param]["row"] + assert prediction["answers"][1].offsets_in_context[0].col == reference2[param]["col"] # Set back to old_seed torch.manual_seed(old_seed) @@ -138,19 +138,19 @@ def test_table_reader_batch_single_query_single_doc_list(table_reader_and_param, score_reference = {"tapas_small": {"score": 1.0}, "rci": {"score": -6.5301}, "tapas_scored": {"score": 0.50568}} assert prediction["answers"][0][0].score == pytest.approx(score_reference[param]["score"], rel=1e-3) assert prediction["answers"][0][0].answer == "11 november 1974" - assert prediction["answers"][0][0].offsets_in_context[0].start == 7 - assert prediction["answers"][0][0].offsets_in_context[0].end == 8 + assert prediction["answers"][0][0].offsets_in_context[0].row == 1 + assert prediction["answers"][0][0].offsets_in_context[0].col == 3 # Check first answer from the 2ND Document ans_reference = { - "tapas_small": {"answer": "5 april 1980", "start": 7, "end": 8, "score": 0.86314}, - "rci": {"answer": "15 september 1960", "start": 11, "end": 12, "score": -7.9429}, - "tapas_scored": {"answer": "5", "start": 10, "end": 11, "score": 0.11485}, + "tapas_small": {"answer": "5 april 1980", "row": 7, "col": 8, "score": 0.86314}, + "rci": {"answer": "15 september 1960", "row": 11, "col": 12, "score": -7.9429}, + "tapas_scored": {"answer": "5", "row": 10, "col": 11, "score": 0.11485}, } assert prediction["answers"][1][0].score == pytest.approx(ans_reference[param]["score"], rel=1e-3) assert prediction["answers"][1][0].answer == ans_reference[param]["answer"] - assert prediction["answers"][1][0].offsets_in_context[0].start == ans_reference[param]["start"] - assert prediction["answers"][1][0].offsets_in_context[0].end == ans_reference[param]["end"] + assert prediction["answers"][1][0].offsets_in_context[0].row == ans_reference[param]["row"] + assert prediction["answers"][1][0].offsets_in_context[0].col == ans_reference[param]["col"] @pytest.mark.parametrize("table_reader_and_param", ["tapas_small", "rci", "tapas_scored"], indirect=True) @@ -252,8 +252,8 @@ def test_table_reader_in_pipeline(table_reader_and_param, table1): prediction = pipeline.run(query=query, documents=[Document(content=table1, content_type="table")]) assert prediction["answers"][0].answer == "11 november 1974" - assert prediction["answers"][0].offsets_in_context[0].start == 7 - assert prediction["answers"][0].offsets_in_context[0].end == 8 + assert prediction["answers"][0].offsets_in_context[0].row == 1 + assert prediction["answers"][0].offsets_in_context[0].col == 3 @pytest.mark.parametrize("table_reader_and_param", ["tapas_base"], indirect=True) From 905edaac3a2cf2a41b913821fd1858861226cc54 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 2 Mar 2023 
16:45:47 +0100 Subject: [PATCH 03/32] Update tests to use row and col --- test/nodes/test_table_reader.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/nodes/test_table_reader.py b/test/nodes/test_table_reader.py index ba899e3b8c..7a58ed3f06 100644 --- a/test/nodes/test_table_reader.py +++ b/test/nodes/test_table_reader.py @@ -64,9 +64,9 @@ def test_table_reader(table_reader_and_param, table1, table2): # Check the second answer in the list reference2 = { - "tapas_small": {"answer": "5 april 1980", "row": 7, "col": 8, "score": 0.86314}, - "rci": {"answer": "47", "row": 5, "col": 6, "score": -6.836}, - "tapas_scored": {"answer": "brad pitt", "row": 0, "col": 1, "score": 0.49078}, + "tapas_small": {"answer": "5 april 1980", "row": 1, "col": 3, "score": 0.86314}, + "rci": {"answer": "47", "row": 1, "col": 1, "score": -6.836}, + "tapas_scored": {"answer": "brad pitt", "row": 0, "col": 0, "score": 0.49078}, } assert prediction["answers"][1].score == pytest.approx(reference2[param]["score"], rel=1e-3) assert prediction["answers"][1].answer == reference2[param]["answer"] @@ -97,9 +97,9 @@ def test_table_reader_train_mode(table_reader_and_param, table1, table2): # Check the second answer in the list reference2 = { - "tapas_small": {"answer": "5 april 1980", "row": 7, "col": 8, "score": 0.86314}, - "rci": {"answer": "47", "row": 5, "col": 6, "score": -6.836}, - "tapas_scored": {"answer": "brad pitt", "row": 0, "col": 1, "score": 0.49078}, + "tapas_small": {"answer": "5 april 1980", "row": 1, "col": 3, "score": 0.86314}, + "rci": {"answer": "47", "row": 1, "col": 1, "score": -6.836}, + "tapas_scored": {"answer": "brad pitt", "row": 0, "col": 0, "score": 0.49078}, } assert prediction["answers"][1].score == pytest.approx(reference2[param]["score"], rel=1e-3) assert prediction["answers"][1].answer == reference2[param]["answer"] @@ -143,9 +143,9 @@ def test_table_reader_batch_single_query_single_doc_list(table_reader_and_param, # Check first answer from the 2ND Document ans_reference = { - "tapas_small": {"answer": "5 april 1980", "row": 7, "col": 8, "score": 0.86314}, - "rci": {"answer": "15 september 1960", "row": 11, "col": 12, "score": -7.9429}, - "tapas_scored": {"answer": "5", "row": 10, "col": 11, "score": 0.11485}, + "tapas_small": {"answer": "5 april 1980", "row": 1, "col": 3, "score": 0.86314}, + "rci": {"answer": "15 september 1960", "row": 2, "col": 3, "score": -7.9429}, + "tapas_scored": {"answer": "5", "row": 2, "col": 2, "score": 0.11485}, } assert prediction["answers"][1][0].score == pytest.approx(ans_reference[param]["score"], rel=1e-3) assert prediction["answers"][1][0].answer == ans_reference[param]["answer"] From d382f1dc07290d0b2a5fe61d907e6175dac16ef6 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 2 Mar 2023 16:54:03 +0100 Subject: [PATCH 04/32] Added TODO for adding a test for tableQA eval --- test/pipelines/test_eval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py index 069f2bb18a..ac4636210b 100644 --- a/test/pipelines/test_eval.py +++ b/test/pipelines/test_eval.py @@ -246,6 +246,9 @@ def test_eval_data_split_word(document_store): assert len(set(labels[0].document_ids)) == 2 +# TODO Add test for TableQA eval + + @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True) def test_eval_data_split_passage(document_store): # splitting by passage From eed82c4d9db6b83b39d6605adccc5e993d88b0b9 Mon Sep 17 00:00:00 2001 From: 
Sebastian Lee Date: Tue, 7 Mar 2023 10:52:32 +0100 Subject: [PATCH 05/32] Added schema test to check to_dict and from_dict works for Table documents. Also updated Doc.__eq__ to work for tables. --- haystack/schema.py | 7 ++++++- test/others/test_schema.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/haystack/schema.py b/haystack/schema.py index a1f6cbf869..a1bd76a94f 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -250,9 +250,14 @@ def from_json(cls, data: str, field_map: Optional[Dict[str, Any]] = None) -> Doc return cls.from_dict(dictionary, field_map=field_map) def __eq__(self, other): + content = getattr(other, "content", None) + if isinstance(content, pd.DataFrame): + is_content_equal = content.equals(self.content) + else: + is_content_equal = content == self.content return ( isinstance(other, self.__class__) - and getattr(other, "content", None) == self.content + and is_content_equal and getattr(other, "content_type", None) == self.content_type and getattr(other, "id", None) == self.id and getattr(other, "id_hash_keys", None) == self.id_hash_keys diff --git a/test/others/test_schema.py b/test/others/test_schema.py index eec9f98b64..ef1f9b3fda 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -52,6 +52,19 @@ def test_document_from_dict(): assert doc == Document.from_dict(doc.to_dict()) +def test_table_document_from_dict(): + data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": [58, 47, 60], + "number of movies": [87, 53, 69], + "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], + } + doc = Document( + content=pd.DataFrame(data), meta={"some": "meta"}, id_hash_keys=["content", "meta"], content_type="table" + ) + assert doc == Document.from_dict(doc.to_dict()) + + def test_no_answer_label(): labels = [ Label( @@ -428,6 +441,7 @@ def test_multilabel_id(): assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605" +# TODO Expand test to check TableCell def test_multilabel_with_doc_containing_dataframes(): label = Label( query="A question", From ec96908a5b300837393180b2af7f9db10c4b579c Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Tue, 7 Mar 2023 11:14:48 +0100 Subject: [PATCH 06/32] Updated Multilabel to use builtin document.to_dict() method to build serializable doc.id and doc.content. Updated schema test for Table label. --- haystack/schema.py | 6 +++--- test/others/test_schema.py | 13 +++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index a1bd76a94f..fe89d2d3f2 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -770,7 +770,7 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ self._offsets_in_documents = [] self._offsets_in_contexts = [] for answer in answered: - # TODO This assumes Span objects when aggregating offsets + # TODO This assumes Span objects when aggregating offsets. Update to use TableCell when appropriate if answer.offsets_in_document is not None: for span in answer.offsets_in_document: self._offsets_in_documents.append({"start": span.start, "end": span.end}) @@ -786,8 +786,8 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ # as separate no_answer labels, and thus with document.id but without answer.document_id. # If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer. # Hence, we exclude them here as well. 
- self._document_ids = [l.document.id for l in self._labels if not l.no_answer] - self._contexts = [str(l.document.content) for l in self._labels if not l.no_answer] + self._document_ids = [l.document.to_dict()["id"] for l in self._labels if not l.no_answer] + self._contexts = [l.document.to_dict()["content"] for l in self._labels if not l.no_answer] @property def labels(self): diff --git a/test/others/test_schema.py b/test/others/test_schema.py index ef1f9b3fda..2c97ffdf08 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -443,16 +443,21 @@ def test_multilabel_id(): # TODO Expand test to check TableCell def test_multilabel_with_doc_containing_dataframes(): + table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + table_doc = Document(content=table, content_type="table", id="table1") label = Label( query="A question", - document=Document(content=pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})), + document=table_doc, is_correct_answer=True, is_correct_document=True, origin="gold-label", - answer=Answer(answer="answer 1"), + answer=Answer( + answer="1", context=table, offsets_in_document=None, offsets_in_context=None, document_ids=[table_doc.id] + ), ) - assert len(MultiLabel(labels=[label]).contexts) == 1 - assert type(MultiLabel(labels=[label]).contexts[0]) is str + multilabel = MultiLabel(labels=[label]) + assert len(multilabel.contexts) == 1 + assert isinstance(multilabel.contexts[0], list) def test_multilabel_serialization(): From ba44009f924488d0aaadb31ce9a7d1eab1403147 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Tue, 7 Mar 2023 12:01:47 +0100 Subject: [PATCH 07/32] Added test for multilabel serialization for tables. Had to add List[List] as valid type for context in Answer. --- haystack/schema.py | 13 ++++++--- test/others/test_schema.py | 56 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index fe89d2d3f2..6bdca3e996 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -412,7 +412,7 @@ class Answer: answer: str type: Literal["generative", "extractive", "other"] = "extractive" score: Optional[float] = None - context: Optional[Union[str, pd.DataFrame]] = None + context: Optional[Union[str, pd.DataFrame, List[List]]] = None offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None document_ids: Optional[List[str]] = None @@ -770,13 +770,18 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ self._offsets_in_documents = [] self._offsets_in_contexts = [] for answer in answered: - # TODO This assumes Span objects when aggregating offsets. 
Update to use TableCell when appropriate if answer.offsets_in_document is not None: for span in answer.offsets_in_document: - self._offsets_in_documents.append({"start": span.start, "end": span.end}) + if isinstance(span, TableCell): + self._offsets_in_documents.append({"row": span.row, "col": span.col}) + else: + self._offsets_in_documents.append({"start": span.start, "end": span.end}) if answer.offsets_in_context is not None: for span in answer.offsets_in_context: - self._offsets_in_contexts.append({"start": span.start, "end": span.end}) + if isinstance(span, TableCell): + self._offsets_in_contexts.append({"row": span.row, "col": span.col}) + else: + self._offsets_in_contexts.append({"start": span.start, "end": span.end}) # There are two options here to represent document_ids: # taking the id from the document of each label or taking the document_id of each label's answer. diff --git a/test/others/test_schema.py b/test/others/test_schema.py index 2c97ffdf08..be606eaa24 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -1,4 +1,4 @@ -from haystack.schema import Document, Label, Answer, Span, MultiLabel, SpeechDocument, SpeechAnswer +from haystack.schema import Document, Label, Answer, Span, MultiLabel, SpeechDocument, SpeechAnswer, TableCell import pytest import numpy as np import pandas as pd @@ -441,7 +441,6 @@ def test_multilabel_id(): assert MultiLabel(labels=[label3]).id == "531445fa3bdf98b8598a3bea032bd605" -# TODO Expand test to check TableCell def test_multilabel_with_doc_containing_dataframes(): table = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) table_doc = Document(content=table, content_type="table", id="table1") @@ -452,12 +451,18 @@ def test_multilabel_with_doc_containing_dataframes(): is_correct_document=True, origin="gold-label", answer=Answer( - answer="1", context=table, offsets_in_document=None, offsets_in_context=None, document_ids=[table_doc.id] + answer="1", + context=table, + offsets_in_document=[TableCell(0, 0)], + offsets_in_context=[TableCell(0, 0)], + document_ids=[table_doc.id], ), ) multilabel = MultiLabel(labels=[label]) assert len(multilabel.contexts) == 1 assert isinstance(multilabel.contexts[0], list) + assert multilabel.offsets_in_documents[0] == {"row": 0, "col": 0} + assert multilabel.offsets_in_contexts[0] == {"row": 0, "col": 0} def test_multilabel_serialization(): @@ -505,6 +510,51 @@ def test_multilabel_serialization(): assert json_deserialized_multilabel.labels[0] == label +def test_table_multilabel_serialization(): + tabel_label_dict = { + "id": "011079cf-c93f-49e6-83bb-42cd850dce12", + "query": "What is the first number?", + "document": { + "content": [["col1", "col2"], [1, 3], [2, 4]], + "content_type": "table", + "id": "table1", + "meta": {}, + "score": None, + "embedding": None, + }, + "is_correct_answer": True, + "is_correct_document": True, + "origin": "user-feedback", + "answer": { + "answer": "1", + "type": "extractive", + "score": None, + "context": [["col1", "col2"], [1, 3], [2, 4]], + "offsets_in_document": [{"row": 0, "col": 0}], + "offsets_in_context": [{"row": 0, "col": 0}], + "document_ids": ["table1"], + "meta": {}, + }, + "no_answer": False, + "pipeline_id": None, + "created_at": "2022-07-22T13:29:33.699781+00:00", + "updated_at": "2022-07-22T13:29:33.784895+00:00", + "meta": {"answer_id": "374394", "document_id": "604995", "question_id": "345530"}, + "filters": None, + } + + label = Label.from_dict(tabel_label_dict) + original_multilabel = MultiLabel([label]) + + deserialized_multilabel = 
MultiLabel.from_dict(original_multilabel.to_dict()) + assert deserialized_multilabel == original_multilabel + assert deserialized_multilabel.labels[0] == label + + json_deserialized_multilabel = MultiLabel.from_json(original_multilabel.to_json()) + assert json_deserialized_multilabel == original_multilabel + assert json_deserialized_multilabel.labels[0] == label + + def test_serialize_speech_document(): speech_doc = SpeechDocument( id=12345, From bf2f1f6310f90aee2da3b4bff6618bc0f2594842 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Tue, 7 Mar 2023 12:17:42 +0100 Subject: [PATCH 08/32] Resolved error from merge --- test/nodes/test_table_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nodes/test_table_reader.py b/test/nodes/test_table_reader.py index 256f75e955..a975444b5e 100644 --- a/test/nodes/test_table_reader.py +++ b/test/nodes/test_table_reader.py @@ -76,7 +76,7 @@ def test_table_reader(table_reader_and_param, table_doc1, table_doc2): @pytest.mark.integration @pytest.mark.parametrize("table_reader_and_param", ["tapas_small", "rci", "tapas_scored"], indirect=True) -def test_table_reader_batch_single_query_single_doc_list(table_reader_and_param, table1, table2): +def test_table_reader_batch_single_query_single_doc_list(table_reader_and_param, table_doc1, table_doc2): table_reader, param = table_reader_and_param query = "When was Di Caprio born?" prediction = table_reader.predict_batch(queries=[query], documents=[table_doc1, table_doc2]) From 329e35ecb29bfc93aae4022bbbb0b50950ed5b8b Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Tue, 7 Mar 2023 12:22:20 +0100 Subject: [PATCH 09/32] Don't return offsets_in_* if it is a no answer --- haystack/nodes/reader/table.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index a13b5afa35..ab71b49184 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -574,17 +574,7 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict: if self.return_no_answer: answers.append( - Answer( - answer="", - type="extractive", - score=no_answer_score, - context=None, - # TODO Confirm that 0, 0 is a valid way to deduce no answer or if -1 will need to be used - offsets_in_context=[TableCell(row=0, col=0)], - offsets_in_document=[TableCell(row=0, col=0)], - document_ids=None, - meta=None, - ) + Answer(answer="", type="extractive", score=no_answer_score, context=None, document_ids=None, meta=None) ) answers = sorted(answers, reverse=True) From 610e111ada78274e9fda88c3d57395a763a50982 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 16 Mar 2023 13:42:56 +0100 Subject: [PATCH 10/32] hanlde list of lists in context matching --- haystack/pipelines/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index ae6fd497be..76481e892c 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -1639,8 +1639,10 @@ def _build_eval_dataframe( df_answers["gold_contexts_similarity"] = df_answers.map_rows( lambda row: [ calculate_context_similarity( - str(gold_context), # could be dataframe - str(row["context"]) if row["context"] is not None else "", # could be dataframe + str(gold_context), # could be dataframe or list of lists + str(row["context"]) + if row["context"] is not None + else "", # could be dataframe or list of lists min_length=context_matching_min_length, 
boost_split_overlaps=context_matching_boost_split_overlaps, ) @@ -1773,9 +1775,13 @@ def _build_eval_dataframe( df_docs["gold_contexts_similarity"] = df_docs.map_rows( lambda row: [ calculate_context_similarity( - str(gold_context) if isinstance(gold_context, pd.DataFrame) else gold_context, + str(gold_context) + if isinstance(gold_context, (pd.DataFrame, list)) + else gold_context, # could be dataframe or list of lists str(row["context"]) - if isinstance(row["context"], pd.DataFrame) + if isinstance( + row["context"], (pd.DataFrame, list) + ) # could be dataframe or list of lists else row["context"] or "", min_length=context_matching_min_length, boost_split_overlaps=context_matching_boost_split_overlaps, From d643d05ffc697e9948bfa16f92054b1cfb9e3c2c Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 16 Mar 2023 17:38:48 +0100 Subject: [PATCH 11/32] Update eval test to use TableCell --- test/pipelines/test_eval.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py index 8670ec0aaa..8bcd53b139 100644 --- a/test/pipelines/test_eval.py +++ b/test/pipelines/test_eval.py @@ -21,7 +21,7 @@ TranslationWrapperPipeline, ) from haystack.nodes.translator.transformers import TransformersTranslator -from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span +from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span, TableCell from ..conftest import SAMPLES_PATH @@ -416,7 +416,7 @@ def test_eval_data_split_passage(document_store): labels=[ Label( query="How old is Brad Pitt?", - answer=Answer(answer="56", offsets_in_context=[Span(1, 2)]), + answer=Answer(answer="56", offsets_in_context=[TableCell(1, 2)]), document=Document( id="a044cf3fb8aade03a12399c7a2fe9a6b", content_type="table", @@ -435,7 +435,7 @@ def test_eval_data_split_passage(document_store): ), Label( # Label with different doc but same answer and query query="How old is Brad Pitt?", - answer=Answer(answer="56", offsets_in_context=[Span(4, 5)]), + answer=Answer(answer="56", offsets_in_context=[TableCell(4, 5)]), document=Document( id="a044cf3fb8aade03a12399c7a2fe9a6b", content_type="table", @@ -454,7 +454,7 @@ def test_eval_data_split_passage(document_store): labels=[ Label( query="To which state does Spikeroog belong?", - answer=Answer(answer="Lower Saxony", offsets_in_context=[Span(7, 8)]), + answer=Answer(answer="Lower Saxony", offsets_in_context=[TableCell(7, 8)]), document=Document( id="b044cf3fb8aade03a12399c7a2fe9a6c", content_type="table", From 5ed4e24c2edc8f281440941760c4a6bf73a1014b Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 13:25:15 +0100 Subject: [PATCH 12/32] Added more schema tests for table docs, labels and answers. 
--- haystack/schema.py | 2 +- test/others/test_schema.py | 193 +++++++++++++++++++++---------------- 2 files changed, 111 insertions(+), 84 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 92e768243d..749dec2b22 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -390,7 +390,7 @@ class Answer: """ def __post_init__(self): - # In case offsets are passed as dicts rather than Span objects we convert them here + # In case offsets are passed as dicts rather than Span or TableCell objects we convert them here # For example, this is used when instantiating an object via from_json() if self.offsets_in_document is not None: offsets_in_document = [] diff --git a/test/others/test_schema.py b/test/others/test_schema.py index d7e6ce7988..341c68eca5 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -5,64 +5,84 @@ from ..conftest import SAMPLES_PATH, fail_at_version -LABELS = [ - Label( - query="some", - answer=Answer( - answer="an answer", - type="extractive", - score=0.1, - document_ids=["123"], - offsets_in_document=[Span(start=1, end=3)], + +@pytest.fixture +def text_labels(): + return [ + Label( + query="some", + answer=Answer( + answer="an answer", + type="extractive", + score=0.1, + document_ids=["123"], + offsets_in_document=[Span(start=1, end=3)], + ), + document=Document(content="some text", content_type="text"), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", ), - document=Document(content="some text", content_type="text"), - is_correct_answer=True, - is_correct_document=True, - origin="user-feedback", - ), - Label( - query="some", - answer=Answer(answer="annother answer", type="extractive", score=0.1, document_ids=["123"]), - document=Document(content="some text", content_type="text"), - is_correct_answer=True, - is_correct_document=True, - origin="user-feedback", - ), - Label( - query="some", - answer=Answer( - answer="an answer", - type="extractive", - score=0.1, - document_ids=["123"], - offsets_in_document=[Span(start=1, end=3)], + Label( + query="some", + answer=Answer(answer="annother answer", type="extractive", score=0.1, document_ids=["123"]), + document=Document(content="some text", content_type="text"), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", ), - document=Document(content="some text", content_type="text"), - is_correct_answer=True, - is_correct_document=True, - origin="user-feedback", - ), -] + Label( + query="some", + answer=Answer( + answer="an answer", + type="extractive", + score=0.1, + document_ids=["123"], + offsets_in_document=[Span(start=1, end=3)], + ), + document=Document(content="some text", content_type="text"), + is_correct_answer=True, + is_correct_document=True, + origin="user-feedback", + ), + ] -def test_document_from_dict(): - doc = Document( - content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"] +@pytest.fixture +def text_answer(): + return Answer( + answer="an answer", + type="extractive", + score=0.1, + context="abc", + offsets_in_document=[Span(start=1, end=10)], + offsets_in_context=[Span(start=3, end=5)], + document_ids=["123"], ) - assert doc == Document.from_dict(doc.to_dict()) -def test_table_document_from_dict(): +@pytest.fixture +def table_doc(): data = { "actors": ["brad pitt", "leonardo di caprio", "george clooney"], "age": [58, 47, 60], "number of movies": [87, 53, 69], "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], } - doc = Document( - 
content=pd.DataFrame(data), meta={"some": "meta"}, id_hash_keys=["content", "meta"], content_type="table" + return Document(content=pd.DataFrame(data), content_type="table", id="doc1") + + +@pytest.fixture +def table_doc_with_embedding(): + data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": [58, 47, 60], + "number of movies": [87, 53, 69], + "date of birth": ["18 december 1963", "11 november 1974", "6 may 1961"], + } + return Document( + content=pd.DataFrame(data), content_type="table", id="doc2", embedding=np.random.rand(768).astype(np.float32) ) - assert doc == Document.from_dict(doc.to_dict()) def test_no_answer_label(): @@ -107,21 +127,27 @@ def test_no_answer_label(): assert labels[3].no_answer == False -def test_equal_label(): - assert LABELS[2] == LABELS[0] - assert LABELS[1] != LABELS[0] +def test_equal_label(text_labels): + assert text_labels[2] == text_labels[0] + assert text_labels[1] != text_labels[0] -def test_answer_to_json(): - a = Answer( - answer="an answer", - type="extractive", - score=0.1, - context="abc", - offsets_in_document=[Span(start=1, end=10)], - offsets_in_context=[Span(start=3, end=5)], - document_ids=["123"], - ) +def test_label_to_json(text_labels): + j0 = text_labels[0].to_json() + l_new = Label.from_json(j0) + assert l_new == text_labels[0] + assert l_new.answer.offsets_in_document[0].start == 1 + + +def test_label_to_dict(text_labels): + j0 = text_labels[0].to_dict() + l_new = Label.from_dict(j0) + assert l_new == text_labels[0] + assert l_new.answer.offsets_in_document[0].start == 1 + + +def test_answer_to_json(text_answer): + a = text_answer j = a.to_json() assert type(j) == str assert len(j) > 30 @@ -130,16 +156,8 @@ def test_answer_to_json(): assert a_new == a -def test_answer_to_dict(): - a = Answer( - answer="an answer", - type="extractive", - score=0.1, - context="abc", - offsets_in_document=[Span(start=1, end=10)], - offsets_in_context=[Span(start=3, end=5)], - document_ids=["123"], - ) +def test_answer_to_dict(text_answer): + a = text_answer j = a.to_dict() assert type(j) == dict a_new = Answer.from_dict(j) @@ -147,24 +165,15 @@ def test_answer_to_dict(): assert a_new == a -def test_label_to_json(): - j0 = LABELS[0].to_json() - l_new = Label.from_json(j0) - assert l_new == LABELS[0] - - -def test_label_to_json(): - j0 = LABELS[0].to_json() - l_new = Label.from_json(j0) - assert l_new == LABELS[0] - assert l_new.answer.offsets_in_document[0].start == 1 +def test_document_from_dict(): + doc = Document( + content="this is the content of the document", meta={"some": "meta"}, id_hash_keys=["content", "meta"] + ) + assert doc == Document.from_dict(doc.to_dict()) -def test_label_to_dict(): - j0 = LABELS[0].to_dict() - l_new = Label.from_dict(j0) - assert l_new == LABELS[0] - assert l_new.answer.offsets_in_document[0].start == 1 +def test_table_document_from_dict(table_doc): + assert table_doc == Document.from_dict(table_doc.to_dict()) def test_doc_to_json(): @@ -195,12 +204,30 @@ def test_doc_to_json(): assert d == d_new +def test_table_doc_to_json(table_doc, table_doc_with_embedding): + # With embedding + j0 = table_doc_with_embedding.to_json() + d_new = Document.from_json(j0) + assert table_doc_with_embedding == d_new + + # No embedding + j0 = table_doc.to_json() + d_new = Document.from_json(j0) + assert table_doc == d_new + + def test_answer_postinit(): a = Answer(answer="test", offsets_in_document=[{"start": 10, "end": 20}]) assert a.meta == {} assert isinstance(a.offsets_in_document[0], Span) +def 
test_table_answer_postinit(): + a = Answer(answer="test", offsets_in_document=[{"row": 1, "col": 2}]) + assert a.meta == {} + assert isinstance(a.offsets_in_document[0], TableCell) + + def test_generate_doc_id_using_text(): text1 = "text1" text2 = "text2" From 2427ae11541085b58ee5f954b657790c6db30f3d Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 13:40:49 +0100 Subject: [PATCH 13/32] Remove unneccessary to_dict call --- haystack/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/schema.py b/haystack/schema.py index 749dec2b22..a75a2fd26c 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -677,7 +677,7 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ # as separate no_answer labels, and thus with document.id but without answer.document_id. # If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer. # Hence, we exclude them here as well. - self._document_ids = [l.document.to_dict()["id"] for l in self._labels if not l.no_answer] + self._document_ids = [l.document.id for l in self._labels if not l.no_answer] self._contexts = [l.document.to_dict()["content"] for l in self._labels if not l.no_answer] @property From e3aceec1b5875241a9984a51195bfff787b260cf Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 13:50:14 +0100 Subject: [PATCH 14/32] Expanding content type to List of Lists for Documents --- haystack/schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index a75a2fd26c..5944bf3adc 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -43,7 +43,7 @@ @dataclass class Document: id: str - content: Union[str, pd.DataFrame] + content: Union[str, pd.DataFrame, List[List]] content_type: ContentTypes = Field(default="text") meta: Dict[str, Any] = Field(default={}) id_hash_keys: List[str] = Field(default=["content"]) @@ -56,7 +56,7 @@ class Document: # don't need to passed by the user in init and are rather initialized automatically in the init def __init__( self, - content: Union[str, pd.DataFrame], + content: Union[str, pd.DataFrame, List[List]], content_type: ContentTypes = "text", id: Optional[str] = None, score: Optional[float] = None, From 6af25ff3b9938cf4d230a94f6b009cd2a1393899 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 13:58:56 +0100 Subject: [PATCH 15/32] Fixing mypy errors --- haystack/nodes/reader/farm.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index 55ffb0eed6..ad58f2e7db 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -1058,6 +1058,12 @@ def eval( if not label.document.id: logger.error("Label does not contain a document id") continue + if label.document.content_type == "table": + logger.warning( + "Label with a table document is not compatible with the FARMReader. 
" "Skipping label with id %s.", + label.id, + ) + continue aggregated_per_doc[label.document.id].append(label) # Create squad style dicts @@ -1101,7 +1107,7 @@ def eval( ) continue aggregated_per_question[aggregation_key]["answers"].append( - {"text": label.answer.answer, "answer_start": label.answer.offsets_in_document[0].start} + {"text": label.answer.answer, "answer_start": label.answer.offsets_in_document[0].start} # type: ignore [union-attr] ) aggregated_per_question[aggregation_key]["is_impossible"] = False # create new one @@ -1121,7 +1127,7 @@ def eval( "answers": [ { "text": label.answer.answer, - "answer_start": label.answer.offsets_in_document[0].start, + "answer_start": label.answer.offsets_in_document[0].start, # type: ignore [union-attr] } ], "is_impossible": False, From 7507bf79b5e66d9bcd183a0c33b5ee02d75c35dd Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 14:19:03 +0100 Subject: [PATCH 16/32] Fix pylint --- haystack/nodes/reader/farm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index ad58f2e7db..9e9d065a2d 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -1060,7 +1060,7 @@ def eval( continue if label.document.content_type == "table": logger.warning( - "Label with a table document is not compatible with the FARMReader. " "Skipping label with id %s.", + "Label with a table document is not compatible with the FARMReader. Skipping label with id %s.", label.id, ) continue From acfcb36d99bfdd4c9fbd47f56e493173e1141ed6 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 14:30:00 +0100 Subject: [PATCH 17/32] Updating types --- haystack/schema.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 5944bf3adc..8812d9dd87 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -43,7 +43,7 @@ @dataclass class Document: id: str - content: Union[str, pd.DataFrame, List[List]] + content: Union[str, pd.DataFrame, List[List[str]]] content_type: ContentTypes = Field(default="text") meta: Dict[str, Any] = Field(default={}) id_hash_keys: List[str] = Field(default=["content"]) @@ -56,7 +56,7 @@ class Document: # don't need to passed by the user in init and are rather initialized automatically in the init def __init__( self, - content: Union[str, pd.DataFrame, List[List]], + content: Union[str, pd.DataFrame, List[List[str]]], content_type: ContentTypes = "text", id: Optional[str] = None, score: Optional[float] = None, @@ -355,7 +355,7 @@ class Answer: answer: str type: Literal["generative", "extractive", "other"] = "extractive" score: Optional[float] = None - context: Optional[Union[str, pd.DataFrame, List[List]]] = None + context: Optional[Union[str, pd.DataFrame, List[List[str]]]] = None offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None document_ids: Optional[List[str]] = None From 1fd10feab4b6cb33dde1cd049925379d5e8028f2 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 15:24:25 +0100 Subject: [PATCH 18/32] Remove table multilabel serialization for now. Make changes in separate PR. 
--- haystack/schema.py | 6 ++--- test/others/test_schema.py | 45 -------------------------------------- 2 files changed, 3 insertions(+), 48 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 8812d9dd87..facc4be10f 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -43,7 +43,7 @@ @dataclass class Document: id: str - content: Union[str, pd.DataFrame, List[List[str]]] + content: Union[str, pd.DataFrame] content_type: ContentTypes = Field(default="text") meta: Dict[str, Any] = Field(default={}) id_hash_keys: List[str] = Field(default=["content"]) @@ -56,7 +56,7 @@ class Document: # don't need to passed by the user in init and are rather initialized automatically in the init def __init__( self, - content: Union[str, pd.DataFrame, List[List[str]]], + content: Union[str, pd.DataFrame], content_type: ContentTypes = "text", id: Optional[str] = None, score: Optional[float] = None, @@ -355,7 +355,7 @@ class Answer: answer: str type: Literal["generative", "extractive", "other"] = "extractive" score: Optional[float] = None - context: Optional[Union[str, pd.DataFrame, List[List[str]]]] = None + context: Optional[Union[str, pd.DataFrame]] = None offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None document_ids: Optional[List[str]] = None diff --git a/test/others/test_schema.py b/test/others/test_schema.py index 341c68eca5..1b5fd15ea8 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -537,51 +537,6 @@ def test_multilabel_serialization(): assert json_deserialized_multilabel.labels[0] == label -def test_table_multilabel_serialization(): - tabel_label_dict = { - "id": "011079cf-c93f-49e6-83bb-42cd850dce12", - "query": "What is the first number?", - "document": { - "content": [["col1", "col2"], [1, 3], [2, 4]], - "content_type": "table", - "id": "table1", - "meta": {}, - "score": None, - "embedding": None, - }, - "is_correct_answer": True, - "is_correct_document": True, - "origin": "user-feedback", - "answer": { - "answer": "1", - "type": "extractive", - "score": None, - "context": [["col1", "col2"], [1, 3], [2, 4]], - "offsets_in_document": [{"row": 0, "col": 0}], - "offsets_in_context": [{"row": 0, "col": 0}], - "document_ids": ["table1"], - "meta": {}, - }, - "no_answer": False, - "pipeline_id": None, - "created_at": "2022-07-22T13:29:33.699781+00:00", - "updated_at": "2022-07-22T13:29:33.784895+00:00", - "meta": {"answer_id": "374394", "document_id": "604995", "question_id": "345530"}, - "filters": None, - } - - label = Label.from_dict(tabel_label_dict) - original_multilabel = MultiLabel([label]) - - deserialized_multilabel = MultiLabel.from_dict(original_multilabel.to_dict()) - assert deserialized_multilabel == original_multilabel - assert deserialized_multilabel.labels[0] == label - - json_deserialized_multilabel = MultiLabel.from_json(original_multilabel.to_json()) - assert json_deserialized_multilabel == original_multilabel - assert json_deserialized_multilabel.labels[0] == label - - def test_span_in(): assert 10 in Span(5, 15) assert not 20 in Span(1, 15) From 5e8a519e4bf52292ecb53478192e43ad382a2334 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 15:26:50 +0100 Subject: [PATCH 19/32] Undo change to MultiLabel --- haystack/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/schema.py b/haystack/schema.py index facc4be10f..e6ec7ec537 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -678,7 
+678,7 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ # If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer. # Hence, we exclude them here as well. self._document_ids = [l.document.id for l in self._labels if not l.no_answer] - self._contexts = [l.document.to_dict()["content"] for l in self._labels if not l.no_answer] + self._contexts = [str(l.document.content) for l in self._labels if not l.no_answer] @property def labels(self): From 92594233d39efd42b9469d3ff4f359f636492aab Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 6 Apr 2023 15:51:23 +0100 Subject: [PATCH 20/32] Fix test --- test/others/test_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/others/test_schema.py b/test/others/test_schema.py index 1b5fd15ea8..a131d98ef6 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -487,7 +487,7 @@ def test_multilabel_with_doc_containing_dataframes(): ) multilabel = MultiLabel(labels=[label]) assert len(multilabel.contexts) == 1 - assert isinstance(multilabel.contexts[0], list) + assert isinstance(multilabel.contexts[0], str) assert multilabel.offsets_in_documents[0] == {"row": 0, "col": 0} assert multilabel.offsets_in_contexts[0] == {"row": 0, "col": 0} From 79981f79dca7031f00852f3d9ed814c7b5d8ffde Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Tue, 11 Apr 2023 09:09:01 +0200 Subject: [PATCH 21/32] Removed done TODO --- test/pipelines/test_eval.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py index c0f7ec5193..ff209a3b9c 100644 --- a/test/pipelines/test_eval.py +++ b/test/pipelines/test_eval.py @@ -253,9 +253,6 @@ def test_eval_data_split_word(document_store): assert len(set(labels[0].document_ids)) == 2 -# TODO Add test for TableQA eval - - @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True) def test_eval_data_split_passage(document_store): # splitting by passage From 0cacc798683377b284d0b17e20e0f4fb9ccbc7bd Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Thu, 13 Apr 2023 16:54:57 +0200 Subject: [PATCH 22/32] Add boolean to toggle between Span and TableCell --- haystack/nodes/reader/table.py | 75 +++++++++++++++++++++++++++++++--- test/conftest.py | 7 ++-- 2 files changed, 74 insertions(+), 8 deletions(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index 78060fe67c..22040987dd 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -25,7 +25,7 @@ from transformers.models.tapas.modeling_tapas import TapasPreTrainedModel from haystack.errors import HaystackError -from haystack.schema import Document, Answer, TableCell +from haystack.schema import Document, Answer, TableCell, Span from haystack.nodes.reader.base import BaseReader from haystack.modeling.utils import initialize_device_settings @@ -71,6 +71,7 @@ def __init__( max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None, + return_table_cell: bool = False, ): """ Load a TableQA model from Transformers. @@ -112,6 +113,11 @@ def __init__( A list containing torch device objects and/or strings is supported (For example [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices parameter is not used and a single cpu device is used for inference. 
+ :param return_table_cell: Whether to return the offsets (`offsets_in_document` and `offsets_in_context`) that indicate + the cells that answer the question using the TableCell schema. The TableCell schema returns the row + and column indices of the table cells selected in the Answer. Otherwise, the offsets + are returned as Span objects which are start and end indices when counting through the + table in a linear fashion i.e. first cell is top left and last cell is bottom right. """ super().__init__() @@ -133,6 +139,7 @@ def __init__( tokenizer=tokenizer, max_seq_len=max_seq_len, use_auth_token=use_auth_token, + return_table_cell=return_table_cell, ) elif config.architectures[0] == "TapasForScoredQA": self.table_encoder = _TapasScoredEncoder( @@ -144,6 +151,7 @@ def __init__( return_no_answer=return_no_answer, max_seq_len=max_seq_len, use_auth_token=use_auth_token, + return_table_cell=return_table_cell, ) else: logger.error( @@ -238,6 +246,7 @@ def __init__( tokenizer: Optional[str] = None, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, + return_table_cell: bool = False, ): self.model = TapasForQuestionAnswering.from_pretrained( model_name_or_path, revision=model_version, use_auth_token=use_auth_token @@ -259,6 +268,7 @@ def __init__( framework="pt", batch_size=1, # batch_size of 1 only works currently b/c of issue with HuggingFace pipeline logic and the return type of TableQuestionAnsweringPipeline._forward device=self.device, + return_table_cell=return_table_cell, ) def predict(self, query: str, documents: List[Document], top_k: int) -> Dict: @@ -413,8 +423,11 @@ def postprocess(self, model_outputs): answer_str = ", ".join(cells) else: answer_str = self._aggregate_answers(aggregator, cells) - answer_offsets = _calculate_answer_offsets(ans_coordinates_per_table) current_score = answer_scores[index] + if self.return_table_cell: + answer_offsets = _calculate_answer_offsets(ans_coordinates_per_table) + else: + answer_offsets = _calculate_answer_offsets(ans_coordinates_per_table) answer = Answer( answer=answer_str, type="extractive", @@ -445,6 +458,7 @@ def __init__( return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, + return_table_cell: bool = False, ): self.model = self._TapasForScoredQA.from_pretrained( model_name_or_path, revision=model_version, use_auth_token=use_auth_token @@ -457,6 +471,7 @@ def __init__( self.device = device self.top_k_per_candidate = top_k_per_candidate self.return_no_answer = return_no_answer + self.return_table_cell = return_table_cell def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]: orig_table: pd.DataFrame = document.content @@ -537,7 +552,10 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tu for answer_span_idx in top_k_answer_spans.indices: current_answer_span = possible_answer_spans[answer_span_idx] answer_str = string_table.iat[current_answer_span[:2]] - answer_offsets = _calculate_answer_offsets([current_answer_span[:2]]) + if self.return_table_cell: + answer_offsets = _calculate_answer_offsets([current_answer_span[:2]]) + else: + answer_offsets = _calculate_answer_offsets_span([current_answer_span[:2]], string_table) # As the general table score is more important for the final score, it is double weighted. 
current_score = ((2 * table_relevancy_prob) + span_logits_softmax[0, answer_span_idx].item()) / 3 @@ -574,8 +592,23 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict: no_answer_score = current_no_answer_score if self.return_no_answer: + if self.return_table_cell: + offsets_in_context = None + offsets_in_document = None + else: + offsets_in_context = [Span(start=0, end=0)] + offsets_in_document = [Span(start=0, end=0)] answers.append( - Answer(answer="", type="extractive", score=no_answer_score, context=None, document_ids=None, meta=None) + Answer( + answer="", + type="extractive", + score=no_answer_score, + context=None, + offsets_in_context=offsets_in_context, + offsets_in_document=offsets_in_document, + document_ids=None, + meta=None, + ) ) answers = sorted(answers, reverse=True) @@ -640,6 +673,7 @@ def __init__( top_k: int = 10, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, + return_table_cell: bool = False, ): """ Load an RCI model from Transformers. @@ -668,6 +702,11 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param return_table_cell: Whether to return the offsets (`offsets_in_document` and `offsets_in_context`) that indicate + the cells that answer the question using the TableCell schema. The TableCell schema returns the row + and column indices of the table cells selected in the Answer. Otherwise, the offsets + are returned as Span objects which are start and end indices when counting through the + table in a linear fashion i.e. first cell is top left and last cell is bottom right. """ super().__init__() @@ -713,6 +752,7 @@ def __init__( self.top_k = top_k self.max_seq_len = max_seq_len self.return_no_answers = False + self.return_table_cell = return_table_cell def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict: """ @@ -778,7 +818,10 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = cell_scores_table[-1].append(current_cell_score) answer_str = string_table.iloc[row_idx, col_idx] - answer_offsets = TableCell(row=row_idx, col=col_idx) + if self.return_table_cell: + answer_offsets = TableCell(row=row_idx, col=col_idx) + else: + answer_offsets = self._calculate_answer_offsets_span(row_idx, col_idx, string_table) current_answers.append( Answer( answer=answer_str, @@ -821,6 +864,13 @@ def _create_row_column_representations(table: pd.DataFrame) -> Tuple[List[str], return row_reps, column_reps + @staticmethod + def _calculate_answer_offsets_span(row_idx, column_index, table) -> Span: + _, n_columns = table.shape + answer_cell_offset = (row_idx * n_columns) + column_index + + return Span(start=answer_cell_offset, end=answer_cell_offset + 1) + def predict_batch( self, queries: List[str], @@ -864,6 +914,21 @@ def _calculate_answer_offsets(answer_coordinates: List[Tuple[int, int]]) -> List return answer_offsets +def _calculate_answer_offsets_span(answer_coordinates: List[Tuple[int, int]], table: pd.DataFrame) -> List[Span]: + """ + Calculates the answer cell offsets of the linearized table based on the answer cell coordinates. + + :param answer_coordinates: List of answer coordinates. + :param table: Table containing the answers in answer coordinates. 
+ """ + answer_offsets = [] + _, n_columns = table.shape + for coord in answer_coordinates: + answer_cell_offset = (coord[0] * n_columns) + coord[1] + answer_offsets.append(Span(start=answer_cell_offset, end=answer_cell_offset + 1)) + return answer_offsets + + def _check_documents(documents: List[Document]) -> List[Document]: """ Check that the content type of all `documents` is of type 'table' otherwise remove that document from the list. diff --git a/test/conftest.py b/test/conftest.py index 487d7188a1..b5140f133d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -510,16 +510,17 @@ def reader(request): @pytest.fixture(params=["tapas_small", "tapas_base", "tapas_scored", "rci"]) def table_reader_and_param(request): if request.param == "tapas_small": - return TableReader(model_name_or_path="google/tapas-small-finetuned-wtq"), request.param + return TableReader(model_name_or_path="google/tapas-small-finetuned-wtq", return_table_cell=True), request.param elif request.param == "tapas_base": - return TableReader(model_name_or_path="google/tapas-base-finetuned-wtq"), request.param + return TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", return_table_cell=True), request.param elif request.param == "tapas_scored": - return TableReader(model_name_or_path="deepset/tapas-large-nq-hn-reader"), request.param + return TableReader(model_name_or_path="deepset/tapas-large-nq-hn-reader", return_table_cell=True), request.param elif request.param == "rci": return ( RCIReader( row_model_name_or_path="michaelrglass/albert-base-rci-wikisql-row", column_model_name_or_path="michaelrglass/albert-base-rci-wikisql-col", + return_table_cell=True, ), request.param, ) From f017c59e9bf22cb80cb17a3da47cd04cdbca4e54 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Fri, 14 Apr 2023 08:52:19 +0200 Subject: [PATCH 23/32] Boolean toggle for turning on table cell now works. 
--- haystack/nodes/reader/table.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index 22040987dd..bc3bb3e2e5 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -310,6 +310,10 @@ def predict_batch(self, queries: List[str], documents: List[List[Document]], top class _TableQuestionAnsweringPipeline(TableQuestionAnsweringPipeline): """Modified from transformers TableQuestionAnsweringPipeline.postprocess to return Haystack Answer objects.""" + def __init__(self, return_table_cell: bool = False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.return_table_cell = return_table_cell + def _calculate_answer_score( self, logits: torch.Tensor, inputs: Dict, answer_coordinates: List[List[Tuple[int, int]]] ) -> List[Optional[np.ndarray]]: @@ -427,7 +431,7 @@ def postprocess(self, model_outputs): if self.return_table_cell: answer_offsets = _calculate_answer_offsets(ans_coordinates_per_table) else: - answer_offsets = _calculate_answer_offsets(ans_coordinates_per_table) + answer_offsets = _calculate_answer_offsets_span(ans_coordinates_per_table, string_table) answer = Answer( answer=answer_str, type="extractive", From 839576f6cd2ba52982d153e0b7d38c652f35afa9 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Fri, 14 Apr 2023 08:54:09 +0200 Subject: [PATCH 24/32] Fixing pylint --- haystack/nodes/reader/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index bc3bb3e2e5..d3282ec7e0 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -310,7 +310,7 @@ def predict_batch(self, queries: List[str], documents: List[List[Document]], top class _TableQuestionAnsweringPipeline(TableQuestionAnsweringPipeline): """Modified from transformers TableQuestionAnsweringPipeline.postprocess to return Haystack Answer objects.""" - def __init__(self, return_table_cell: bool = False, *args, **kwargs): + def __init__(self, *args, return_table_cell: bool = False, **kwargs): super().__init__(*args, **kwargs) self.return_table_cell = return_table_cell From 7c08165f0926522089416daeb6a4ae7756addc3f Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Fri, 14 Apr 2023 09:00:28 +0200 Subject: [PATCH 25/32] Mypy fixes --- haystack/nodes/reader/table.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index d3282ec7e0..4049edbf39 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -556,6 +556,7 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tu for answer_span_idx in top_k_answer_spans.indices: current_answer_span = possible_answer_spans[answer_span_idx] answer_str = string_table.iat[current_answer_span[:2]] + answer_offsets: Union[List[Span], List[TableCell]] if self.return_table_cell: answer_offsets = _calculate_answer_offsets([current_answer_span[:2]]) else: @@ -822,18 +823,19 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = cell_scores_table[-1].append(current_cell_score) answer_str = string_table.iloc[row_idx, col_idx] + answer_offsets: Union[List[Span], List[TableCell]] if self.return_table_cell: - answer_offsets = TableCell(row=row_idx, col=col_idx) + answer_offsets = [TableCell(row=row_idx, col=col_idx)] else: - answer_offsets = self._calculate_answer_offsets_span(row_idx, col_idx, string_table) + 
answer_offsets = [self._calculate_answer_offsets_span(row_idx, col_idx, string_table)] current_answers.append( Answer( answer=answer_str, type="extractive", score=current_cell_score, context=string_table, - offsets_in_document=[answer_offsets], - offsets_in_context=[answer_offsets], + offsets_in_document=answer_offsets, + offsets_in_context=answer_offsets, document_ids=[document.id], ) ) From c9b9aeba232600b5367ccd370df368f11269cf84 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Fri, 14 Apr 2023 09:13:34 +0200 Subject: [PATCH 26/32] Add deprecation message --- haystack/nodes/reader/table.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index 4049edbf39..eba1d9a04a 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -114,13 +114,22 @@ def __init__( [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices parameter is not used and a single cpu device is used for inference. :param return_table_cell: Whether to return the offsets (`offsets_in_document` and `offsets_in_context`) that indicate - the cells that answer the question using the TableCell schema. The TableCell schema returns the row + the table cells that answer the question using the TableCell schema. The TableCell schema returns the row and column indices of the table cells selected in the Answer. Otherwise, the offsets are returned as Span objects which are start and end indices when counting through the table in a linear fashion i.e. first cell is top left and last cell is bottom right. """ super().__init__() + if not return_table_cell: + logger.warning( + "The support for returning offsets in answer predictions in a linear fashion is being deprecated." + " Set return_table_cell=True to use the new offsets format which returns the row and column indices" + " of the table cells selected in the answer." + " In the future return_table_cell=True will become default and return_table_cell=False will no " + " longer be supported." + ) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) if len(self.devices) > 1: logger.warning( @@ -715,6 +724,15 @@ def __init__( """ super().__init__() + if not return_table_cell: + logger.warning( + "The support for returning offsets in answer predictions in a linear fashion is being deprecated." + " Set return_table_cell=True to use the new offsets format which returns the row and column indices" + " of the table cells selected in the answer." + " In the future return_table_cell=True will become default and return_table_cell=False will no " + " longer be supported." + ) + self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) if len(self.devices) > 1: logger.warning( From 4811400afc8dd3b949cba6ccf367ae2416430c2b Mon Sep 17 00:00:00 2001 From: agnieszka-m Date: Mon, 17 Apr 2023 08:50:46 +0200 Subject: [PATCH 27/32] Minor lg changes --- haystack/nodes/reader/table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index eba1d9a04a..85e6674bb2 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -117,7 +117,7 @@ def __init__( the table cells that answer the question using the TableCell schema. The TableCell schema returns the row and column indices of the table cells selected in the Answer. 
Otherwise, the offsets are returned as Span objects which are start and end indices when counting through the - table in a linear fashion i.e. first cell is top left and last cell is bottom right. + table in a linear fashion, which means the first cell is top left and the last cell is bottom right. """ super().__init__() @@ -126,7 +126,7 @@ def __init__( "The support for returning offsets in answer predictions in a linear fashion is being deprecated." " Set return_table_cell=True to use the new offsets format which returns the row and column indices" " of the table cells selected in the answer." - " In the future return_table_cell=True will become default and return_table_cell=False will no " + " In the future, return_table_cell=True will become default and return_table_cell=False will no " " longer be supported." ) @@ -720,7 +720,7 @@ def __init__( the cells that answer the question using the TableCell schema. The TableCell schema returns the row and column indices of the table cells selected in the Answer. Otherwise, the offsets are returned as Span objects which are start and end indices when counting through the - table in a linear fashion i.e. first cell is top left and last cell is bottom right. + table in a linear fashion, which means the first cell is top left and the last cell is bottom right. """ super().__init__() @@ -729,7 +729,7 @@ def __init__( "The support for returning offsets in answer predictions in a linear fashion is being deprecated." " Set return_table_cell=True to use the new offsets format which returns the row and column indices" " of the table cells selected in the answer." - " In the future return_table_cell=True will become default and return_table_cell=False will no " + " In the future, return_table_cell=True will become default and return_table_cell=False will no " " longer be supported." 
) From ff2a2994801b33dea379f14160aa4383f5ecb94e Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Tue, 18 Apr 2023 16:38:20 +0200 Subject: [PATCH 28/32] Deduplicate code --- haystack/schema.py | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index e6ec7ec537..1fc2605d3d 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -393,28 +393,10 @@ def __post_init__(self): # In case offsets are passed as dicts rather than Span or TableCell objects we convert them here # For example, this is used when instantiating an object via from_json() if self.offsets_in_document is not None: - offsets_in_document = [] - for e in self.offsets_in_document: - if isinstance(e, dict): - if "row" in e: # is a TableCell - offsets_in_document.append(TableCell(**e)) - else: - offsets_in_document.append(Span(**e)) - else: - offsets_in_document.append(e) - self.offsets_in_document = offsets_in_document + self.offsets_in_document = self._convert_offsets(self.offsets_in_document) if self.offsets_in_context is not None: - offsets_in_context = [] - for e in self.offsets_in_context: - if isinstance(e, dict): - if "row" in e: # is a TableCell - offsets_in_context.append(TableCell(**e)) - else: - offsets_in_context.append(Span(**e)) - else: - offsets_in_context.append(e) - self.offsets_in_context = offsets_in_context + self.offsets_in_context = self._convert_offsets(self.offsets_in_context) if self.meta is None: self.meta = {} @@ -454,6 +436,19 @@ def from_json(cls, data): data = json.loads(data) return cls.from_dict(data) + @staticmethod + def _convert_offsets(offsets): + converted_offsets = [] + for e in offsets: + if isinstance(e, dict): + if "row" in e: # is a TableCell + converted_offsets.append(TableCell(**e)) + else: + converted_offsets.append(Span(**e)) + else: + converted_offsets.append(e) + return converted_offsets + @dataclass class Label: From 2d38e708add04565120977d6e6702c77cad4e9f2 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Wed, 19 Apr 2023 10:20:27 +0200 Subject: [PATCH 29/32] Remove duplicate code and expand schema tests for multilabel --- haystack/schema.py | 28 +++++++++++++++------------- test/others/test_schema.py | 20 +++++++++++++++----- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 1fc2605d3d..154b1cff7a 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -393,10 +393,10 @@ def __post_init__(self): # In case offsets are passed as dicts rather than Span or TableCell objects we convert them here # For example, this is used when instantiating an object via from_json() if self.offsets_in_document is not None: - self.offsets_in_document = self._convert_offsets(self.offsets_in_document) + self.offsets_in_document = self._from_dict_offsets(self.offsets_in_document) if self.offsets_in_context is not None: - self.offsets_in_context = self._convert_offsets(self.offsets_in_context) + self.offsets_in_context = self._from_dict_offsets(self.offsets_in_context) if self.meta is None: self.meta = {} @@ -437,7 +437,7 @@ def from_json(cls, data): return cls.from_dict(data) @staticmethod - def _convert_offsets(offsets): + def _from_dict_offsets(offsets): converted_offsets = [] for e in offsets: if isinstance(e, dict): @@ -652,17 +652,9 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ self._offsets_in_contexts = [] for answer in answered: if answer.offsets_in_document is not None: - for span in 
answer.offsets_in_document: - if isinstance(span, TableCell): - self._offsets_in_documents.append({"row": span.row, "col": span.col}) - else: - self._offsets_in_documents.append({"start": span.start, "end": span.end}) + self._offsets_in_documents.extend(self._to_dict_offsets(answer)) if answer.offsets_in_context is not None: - for span in answer.offsets_in_context: - if isinstance(span, TableCell): - self._offsets_in_contexts.append({"row": span.row, "col": span.col}) - else: - self._offsets_in_contexts.append({"start": span.start, "end": span.end}) + self._offsets_in_contexts.extend(self._to_dict_offsets(answer)) # There are two options here to represent document_ids: # taking the id from the document of each label or taking the document_id of each label's answer. @@ -675,6 +667,16 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ self._document_ids = [l.document.id for l in self._labels if not l.no_answer] self._contexts = [str(l.document.content) for l in self._labels if not l.no_answer] + @staticmethod + def _to_dict_offsets(answer: Answer) -> List[Dict]: + converted_offsets = [] + for span in answer.offsets_in_document: + if isinstance(span, TableCell): + converted_offsets.append({"row": span.row, "col": span.col}) + else: + converted_offsets.append({"start": span.start, "end": span.end}) + return converted_offsets + @property def labels(self): return self._labels diff --git a/test/others/test_schema.py b/test/others/test_schema.py index 823b61c692..49e6a4ed27 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -420,7 +420,15 @@ def test_multilabel_preserve_order_w_duplicates(): multilabel = MultiLabel(labels=labels) - assert len(multilabel.document_ids) == 3 + assert multilabel.query == "question" + assert multilabel.answers == ["answer1", "answer2", "answer3"] + assert multilabel.document_ids == ["123", "123", "333"] + assert multilabel.contexts == ["some", "some", "some other"] + assert multilabel.offsets_in_documents == [ + {"start": 12, "end": 18}, + {"start": 12, "end": 18}, + {"start": 12, "end": 18}, + ] for i in range(0, 3): assert multilabel.labels[i].id == str(i) @@ -484,10 +492,12 @@ def test_multilabel_with_doc_containing_dataframes(): ), ) multilabel = MultiLabel(labels=[label]) - assert len(multilabel.contexts) == 1 - assert isinstance(multilabel.contexts[0], str) - assert multilabel.offsets_in_documents[0] == {"row": 0, "col": 0} - assert multilabel.offsets_in_contexts[0] == {"row": 0, "col": 0} + assert multilabel.query == "A question" + assert multilabel.contexts == [" col1 col2\n0 1 3\n1 2 4"] + assert multilabel.answers == ["1"] + assert multilabel.document_ids == ["table1"] + assert multilabel.offsets_in_documents == [{"row": 0, "col": 0}] + assert multilabel.offsets_in_contexts == [{"row": 0, "col": 0}] def test_multilabel_serialization(): From 786340a35ae8bad41e864e929ccb0ada88c2f625 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Wed, 19 Apr 2023 10:49:29 +0200 Subject: [PATCH 30/32] Fixed introduced bug and expanded tests to catch that in the future. 
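The bug introduced while deduplicating: the helper always read
`answer.offsets_in_document`, so `offsets_in_contexts` silently mirrored the
document offsets. `_to_dict_offsets` now converts a single offset, and the
constructor appends document and context offsets from their own lists. The
multilabel tests now use different spans for document and context offsets so a
regression of this kind fails loudly.

A rough sketch of the per-offset conversion the fixed helper performs (not part of
the patch; it mirrors `MultiLabel._to_dict_offsets` and assumes Span and TableCell
are importable from haystack.schema on this branch):

    from typing import Dict, Union

    from haystack.schema import Span, TableCell


    def to_dict_offset(offset: Union[Span, TableCell]) -> Dict:
        # TableCell offsets keep row/col, Span offsets keep start/end.
        if isinstance(offset, TableCell):
            return {"row": offset.row, "col": offset.col}
        return {"start": offset.start, "end": offset.end}


    print(to_dict_offset(Span(start=12, end=18)))   # {'start': 12, 'end': 18}
    print(to_dict_offset(TableCell(row=0, col=0)))  # {'row': 0, 'col': 0}
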
--- haystack/schema.py | 19 +++++++++---------- test/others/test_schema.py | 35 ++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 154b1cff7a..46b198021c 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -652,9 +652,11 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ self._offsets_in_contexts = [] for answer in answered: if answer.offsets_in_document is not None: - self._offsets_in_documents.extend(self._to_dict_offsets(answer)) + for span in answer.offsets_in_document: + self._offsets_in_documents.append(self._to_dict_offsets(span)) if answer.offsets_in_context is not None: - self._offsets_in_contexts.extend(self._to_dict_offsets(answer)) + for span in answer.offsets_in_context: + self._offsets_in_contexts.append(self._to_dict_offsets(span)) # There are two options here to represent document_ids: # taking the id from the document of each label or taking the document_id of each label's answer. @@ -668,14 +670,11 @@ def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answ self._contexts = [str(l.document.content) for l in self._labels if not l.no_answer] @staticmethod - def _to_dict_offsets(answer: Answer) -> List[Dict]: - converted_offsets = [] - for span in answer.offsets_in_document: - if isinstance(span, TableCell): - converted_offsets.append({"row": span.row, "col": span.col}) - else: - converted_offsets.append({"start": span.start, "end": span.end}) - return converted_offsets + def _to_dict_offsets(offset: Union[Span, TableCell]) -> Dict: + if isinstance(offset, TableCell): + return {"row": offset.row, "col": offset.col} + else: + return {"start": offset.start, "end": offset.end} @property def labels(self): diff --git a/test/others/test_schema.py b/test/others/test_schema.py index 49e6a4ed27..dd1553a2f6 100644 --- a/test/others/test_schema.py +++ b/test/others/test_schema.py @@ -374,7 +374,11 @@ def test_multilabel_preserve_order_w_duplicates(): Label( id="0", query="question", - answer=Answer(answer="answer1", offsets_in_document=[Span(start=12, end=18)]), + answer=Answer( + answer="answer1", + offsets_in_document=[Span(start=12, end=18)], + offsets_in_context=[Span(start=1, end=7)], + ), document=Document(content="some", id="123"), is_correct_answer=True, is_correct_document=True, @@ -383,7 +387,11 @@ def test_multilabel_preserve_order_w_duplicates(): Label( id="1", query="question", - answer=Answer(answer="answer2", offsets_in_document=[Span(start=12, end=18)]), + answer=Answer( + answer="answer2", + offsets_in_document=[Span(start=10, end=16)], + offsets_in_context=[Span(start=0, end=6)], + ), document=Document(content="some", id="123"), is_correct_answer=True, is_correct_document=True, @@ -392,7 +400,11 @@ def test_multilabel_preserve_order_w_duplicates(): Label( id="2", query="question", - answer=Answer(answer="answer3", offsets_in_document=[Span(start=12, end=18)]), + answer=Answer( + answer="answer3", + offsets_in_document=[Span(start=14, end=20)], + offsets_in_context=[Span(start=2, end=8)], + ), document=Document(content="some other", id="333"), is_correct_answer=True, is_correct_document=True, @@ -401,7 +413,11 @@ def test_multilabel_preserve_order_w_duplicates(): Label( id="0", query="question", - answer=Answer(answer="answer1", offsets_in_document=[Span(start=12, end=18)]), + answer=Answer( + answer="answer1", + offsets_in_document=[Span(start=12, end=18)], + offsets_in_context=[Span(start=1, end=7)], + ), 
document=Document(content="some", id="123"), is_correct_answer=True, is_correct_document=True, @@ -410,7 +426,11 @@ def test_multilabel_preserve_order_w_duplicates(): Label( id="2", query="question", - answer=Answer(answer="answer3", offsets_in_document=[Span(start=12, end=18)]), + answer=Answer( + answer="answer3", + offsets_in_document=[Span(start=14, end=20)], + offsets_in_context=[Span(start=2, end=8)], + ), document=Document(content="some other", id="333"), is_correct_answer=True, is_correct_document=True, @@ -426,9 +446,10 @@ def test_multilabel_preserve_order_w_duplicates(): assert multilabel.contexts == ["some", "some", "some other"] assert multilabel.offsets_in_documents == [ {"start": 12, "end": 18}, - {"start": 12, "end": 18}, - {"start": 12, "end": 18}, + {"start": 10, "end": 16}, + {"start": 14, "end": 20}, ] + assert multilabel.offsets_in_contexts == [{"start": 1, "end": 7}, {"start": 0, "end": 6}, {"start": 2, "end": 8}] for i in range(0, 3): assert multilabel.labels[i].id == str(i) From 20eea485c2cc0e93c86ae9878e50f847d5e06aa8 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Wed, 19 Apr 2023 11:08:35 +0200 Subject: [PATCH 31/32] Add some unit tests --- test/nodes/test_table_reader.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/test/nodes/test_table_reader.py b/test/nodes/test_table_reader.py index a975444b5e..609207ebf6 100644 --- a/test/nodes/test_table_reader.py +++ b/test/nodes/test_table_reader.py @@ -1,12 +1,13 @@ import logging import pandas as pd -import torch import pytest -from haystack.schema import Document, Answer +from haystack.schema import Document, Answer, Span, TableCell from haystack.pipelines.base import Pipeline +from haystack.nodes.reader.table import _calculate_answer_offsets_span, _calculate_answer_offsets + @pytest.fixture def table_doc1(): @@ -39,6 +40,18 @@ def table_doc3(): return Document(content=pd.DataFrame(data), content_type="table", id="doc3") +@pytest.mark.unit +def test_calculate_answer_offsets_span(table_doc1): + offsets_span = _calculate_answer_offsets_span(answer_coordinates=[(0, 1), (1, 3)], table=table_doc1.content) + assert offsets_span == [Span(start=1, end=2), Span(start=7, end=8)] + + +@pytest.mark.unit +def test_calculate_answer_offsets_table_cell(table_doc1): + offsets_span = _calculate_answer_offsets(answer_coordinates=[(0, 1), (1, 3)]) + assert offsets_span == [TableCell(row=0, col=1), TableCell(row=1, col=3)] + + @pytest.mark.integration @pytest.mark.parametrize("table_reader_and_param", ["tapas_small", "rci", "tapas_scored"], indirect=True) def test_table_reader(table_reader_and_param, table_doc1, table_doc2): From 65204e7f34ac5356a40c84d8b085d23ebd7f6833 Mon Sep 17 00:00:00 2001 From: Sebastian Lee Date: Wed, 19 Apr 2023 12:21:12 +0200 Subject: [PATCH 32/32] Test that table answers work as responses in the rest API --- rest_api/test/test_rest_api.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/rest_api/test/test_rest_api.py b/rest_api/test/test_rest_api.py index 1b4c0cd786..e0e7a3b833 100644 --- a/rest_api/test/test_rest_api.py +++ b/rest_api/test/test_rest_api.py @@ -13,7 +13,7 @@ import pytest from fastapi.testclient import TestClient import posthog -from haystack import Document, Answer, Pipeline +from haystack import Document, Answer, Pipeline, TableCell import haystack from haystack.nodes import BaseReader, BaseRetriever from haystack.document_stores import BaseDocumentStore @@ -435,12 +435,27 @@ def test_query_with_dataframe(client): 
meta={"test_key": "test_value"}, ) ], + "answers": [ + Answer( + answer="text_2", + type="extractive", + score=0.95, + context=pd.DataFrame.from_records([{"col1": "text_1", "col2": 1}, {"col1": "text_2", "col2": 2}]), + offsets_in_document=[TableCell(1, 0)], + offsets_in_context=[TableCell(1, 0)], + meta={"aggregation_operator": "NONE", "answer_cells": ["text_2"]}, + ) + ], } response = client.post(url="/query", json={"query": TEST_QUERY}) assert 200 == response.status_code assert len(response.json()["documents"]) == 1 assert response.json()["documents"][0]["content"] == [["col1", "col2"], ["text_1", 1], ["text_2", 2]] assert response.json()["documents"][0]["content_type"] == "table" + assert len(response.json()["answers"]) == 1 + assert response.json()["answers"][0]["context"] == [["col1", "col2"], ["text_1", 1], ["text_2", 2]] + assert response.json()["answers"][0]["offsets_in_document"] == [{"row": 1, "col": 0}] + assert response.json()["answers"][0]["offsets_in_context"] == [{"row": 1, "col": 0}] # Ensure `run` was called with the expected parameters mocked_pipeline.run.assert_called_with(query=TEST_QUERY, params={}, debug=False)