feat: Implementation of Table Cell Proposal #4616

Merged (42 commits) on Apr 19, 2023
Changes from all commits
Commits
42 commits
0363ef1
Starting adding support for TableCell
sjrl Mar 2, 2023
97fb284
Fixed typing error by adding correct import and started to update tes…
sjrl Mar 2, 2023
905edaa
Update tests to use row and col
sjrl Mar 2, 2023
d382f1d
Added TODO for adding a test for tableQA eval
sjrl Mar 2, 2023
13dbaa7
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Mar 7, 2023
eed82c4
Added schema test to check to_dict and from_dict works for Table docu…
sjrl Mar 7, 2023
ec96908
Updated Multilabel to use builtin document.to_dict() method to build …
sjrl Mar 7, 2023
ba44009
Added test for multilabel serialization for tables. Had to add List[L…
sjrl Mar 7, 2023
bfc751e
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Mar 7, 2023
bf2f1f6
Resolved error from merge
sjrl Mar 7, 2023
329e35e
Don't return offsets_in_* if it is a no answer
sjrl Mar 7, 2023
04778ff
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Mar 16, 2023
610e111
Handle list of lists in context matching
sjrl Mar 16, 2023
2fd637e
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Mar 16, 2023
d643d05
Update eval test to use TableCell
sjrl Mar 16, 2023
019f340
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Mar 27, 2023
66b0cf2
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Mar 28, 2023
2852a31
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Apr 6, 2023
5ed4e24
Added more schema tests for table docs, labels and answers.
sjrl Apr 6, 2023
2427ae1
Remove unnecessary to_dict call
sjrl Apr 6, 2023
e3aceec
Expanding content type to List of Lists for Documents
sjrl Apr 6, 2023
6af25ff
Fixing mypy errors
sjrl Apr 6, 2023
0d321ff
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Apr 6, 2023
7507bf7
Fix pylint
sjrl Apr 6, 2023
acfcb36
Updating types
sjrl Apr 6, 2023
1fd10fe
Remove table multilabel serialization for now. Make changes in separa…
sjrl Apr 6, 2023
5e8a519
Undo change to MultiLabel
sjrl Apr 6, 2023
9259423
Fix test
sjrl Apr 6, 2023
79981f7
Removed done TODO
sjrl Apr 11, 2023
a5032d3
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Apr 11, 2023
210d8a4
Merge branch 'main' of github.com:deepset-ai/haystack into table_cell
sjrl Apr 13, 2023
0cacc79
Add boolean to toggle between Span and TableCell
sjrl Apr 13, 2023
f017c59
Boolean toggle for turning on table cell now works.
sjrl Apr 14, 2023
839576f
Fixing pylint
sjrl Apr 14, 2023
7c08165
Mypy fixes
sjrl Apr 14, 2023
c9b9aeb
Add deprecation message
sjrl Apr 14, 2023
4811400
Minor lg changes
agnieszka-m Apr 17, 2023
ff2a299
Deduplicate code
sjrl Apr 18, 2023
2d38e70
Remove duplicate code and expand schema tests for multilabel
sjrl Apr 19, 2023
786340a
Fixed introduced bug and expanded tests to catch that in the future.
sjrl Apr 19, 2023
20eea48
Add some unit tests
sjrl Apr 19, 2023
65204e7
Test that table answers work as responses in the rest API
sjrl Apr 19, 2023
2 changes: 1 addition & 1 deletion haystack/__init__.py
@@ -17,7 +17,7 @@

import pandas as pd

from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult
from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult, TableCell
from haystack.nodes.base import BaseComponent
from haystack.pipelines.base import Pipeline
from haystack.environment import set_pytorch_secure_model_loading
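The only change above is that `TableCell` is now re-exported at the package root next to `Span`. A minimal sketch of what that enables, assuming `TableCell` is the simple row/column container that the rest of this diff constructs as `TableCell(row=..., col=...)`:

```python
# Sketch only, not code from the PR. TableCell is assumed to behave like a
# small dataclass with `row` and `col` fields, mirroring how it is used in
# haystack/nodes/reader/table.py below.
from haystack import Span, TableCell

cell = TableCell(row=2, col=0)  # third body row, first column of the answer table
span = Span(start=6, end=7)     # the legacy linearized representation of a cell
print(cell.row, cell.col)       # 2 0
print(span.start, span.end)     # 6 7
```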
10 changes: 8 additions & 2 deletions haystack/nodes/reader/farm.py
@@ -1058,6 +1058,12 @@ def eval(
if not label.document.id:
logger.error("Label does not contain a document id")
continue
if label.document.content_type == "table":
logger.warning(
"Label with a table document is not compatible with the FARMReader. Skipping label with id %s.",
label.id,
)
continue
aggregated_per_doc[label.document.id].append(label)

# Create squad style dicts
@@ -1101,7 +1107,7 @@
)
continue
aggregated_per_question[aggregation_key]["answers"].append(
{"text": label.answer.answer, "answer_start": label.answer.offsets_in_document[0].start}
{"text": label.answer.answer, "answer_start": label.answer.offsets_in_document[0].start} # type: ignore [union-attr]
)
aggregated_per_question[aggregation_key]["is_impossible"] = False
# create new one
@@ -1121,7 +1127,7 @@
"answers": [
{
"text": label.answer.answer,
"answer_start": label.answer.offsets_in_document[0].start,
"answer_start": label.answer.offsets_in_document[0].start, # type: ignore [union-attr]
}
],
"is_impossible": False,
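To make the new guard concrete: `FARMReader.eval()` now skips gold labels whose document is a table instead of trying to turn them into SQuAD-style dicts. A hedged sketch of such a label follows; the constructor arguments reflect the usual Haystack v1 schema (`origin="gold-label"`, `is_correct_answer`, and so on) and are assumptions rather than lines taken from this PR:

```python
import pandas as pd

from haystack import Answer, Document, Label

# A gold label attached to a table document. With the change above,
# FARMReader.eval() logs "Label with a table document is not compatible with
# the FARMReader" and skips it instead of aggregating it per document id.
table_doc = Document(
    content=pd.DataFrame({"Actor": ["Brad Pitt"], "Age": ["59"]}),
    content_type="table",
)
table_label = Label(
    query="How old is Brad Pitt?",
    document=table_doc,
    answer=Answer(answer="59", type="extractive"),
    is_correct_answer=True,
    is_correct_document=True,
    origin="gold-label",
)
```

The two `# type: ignore [union-attr]` comments are presumably needed because answer offsets can now hold `TableCell` objects as well as `Span` objects, and `TableCell` has no `start` field.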
90 changes: 80 additions & 10 deletions haystack/nodes/reader/table.py
@@ -25,7 +25,7 @@
from transformers.models.tapas.modeling_tapas import TapasPreTrainedModel

from haystack.errors import HaystackError
from haystack.schema import Document, Answer, Span
from haystack.schema import Document, Answer, TableCell, Span
from haystack.nodes.reader.base import BaseReader
from haystack.modeling.utils import initialize_device_settings

@@ -71,6 +71,7 @@ def __init__(
max_seq_len: int = 256,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None,
return_table_cell: bool = False,
):
"""
Load a TableQA model from Transformers.
@@ -112,9 +113,23 @@
A list containing torch device objects and/or strings is supported (For example
[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
parameter is not used and a single cpu device is used for inference.
:param return_table_cell: Whether to return the offsets (`offsets_in_document` and `offsets_in_context`) that indicate
the table cells that answer the question using the TableCell schema. The TableCell schema returns the row
and column indices of the table cells selected in the Answer. Otherwise, the offsets
are returned as Span objects which are start and end indices when counting through the
table in a linear fashion, which means the first cell is top left and the last cell is bottom right.
"""
super().__init__()

if not return_table_cell:
logger.warning(
"The support for returning offsets in answer predictions in a linear fashion is being deprecated."
" Set return_table_cell=True to use the new offsets format which returns the row and column indices"
" of the table cells selected in the answer."
" In the future, return_table_cell=True will become default and return_table_cell=False will no "
" longer be supported."
)

self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
@@ -133,6 +148,7 @@ def __init__(
tokenizer=tokenizer,
max_seq_len=max_seq_len,
use_auth_token=use_auth_token,
return_table_cell=return_table_cell,
)
elif config.architectures[0] == "TapasForScoredQA":
self.table_encoder = _TapasScoredEncoder(
@@ -144,6 +160,7 @@
return_no_answer=return_no_answer,
max_seq_len=max_seq_len,
use_auth_token=use_auth_token,
return_table_cell=return_table_cell,
)
else:
logger.error(
@@ -238,6 +255,7 @@ def __init__(
tokenizer: Optional[str] = None,
max_seq_len: int = 256,
use_auth_token: Optional[Union[str, bool]] = None,
return_table_cell: bool = False,
):
self.model = TapasForQuestionAnswering.from_pretrained(
model_name_or_path, revision=model_version, use_auth_token=use_auth_token
@@ -259,6 +277,7 @@
framework="pt",
batch_size=1, # batch_size of 1 only works currently b/c of issue with HuggingFace pipeline logic and the return type of TableQuestionAnsweringPipeline._forward
device=self.device,
return_table_cell=return_table_cell,
)

def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
@@ -300,6 +319,10 @@ def predict_batch(self, queries: List[str], documents: List[List[Document]], top
class _TableQuestionAnsweringPipeline(TableQuestionAnsweringPipeline):
"""Modified from transformers TableQuestionAnsweringPipeline.postprocess to return Haystack Answer objects."""

def __init__(self, *args, return_table_cell: bool = False, **kwargs):
super().__init__(*args, **kwargs)
self.return_table_cell = return_table_cell

def _calculate_answer_score(
self, logits: torch.Tensor, inputs: Dict, answer_coordinates: List[List[Tuple[int, int]]]
) -> List[Optional[np.ndarray]]:
@@ -413,8 +436,11 @@ def postprocess(self, model_outputs):
answer_str = ", ".join(cells)
else:
answer_str = self._aggregate_answers(aggregator, cells)
answer_offsets = _calculate_answer_offsets(ans_coordinates_per_table, string_table)
current_score = answer_scores[index]
if self.return_table_cell:
answer_offsets = _calculate_answer_offsets(ans_coordinates_per_table)
else:
answer_offsets = _calculate_answer_offsets_span(ans_coordinates_per_table, string_table)
answer = Answer(
answer=answer_str,
type="extractive",
@@ -445,6 +471,7 @@ def __init__(
return_no_answer: bool = False,
max_seq_len: int = 256,
use_auth_token: Optional[Union[str, bool]] = None,
return_table_cell: bool = False,
):
self.model = self._TapasForScoredQA.from_pretrained(
model_name_or_path, revision=model_version, use_auth_token=use_auth_token
@@ -457,6 +484,7 @@ def __init__(
self.device = device
self.top_k_per_candidate = top_k_per_candidate
self.return_no_answer = return_no_answer
self.return_table_cell = return_table_cell

def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tuple[List[Answer], float]:
orig_table: pd.DataFrame = document.content
@@ -537,7 +565,11 @@ def _predict_tapas_scored(self, inputs: BatchEncoding, document: Document) -> Tu
for answer_span_idx in top_k_answer_spans.indices:
current_answer_span = possible_answer_spans[answer_span_idx]
answer_str = string_table.iat[current_answer_span[:2]]
answer_offsets = _calculate_answer_offsets([current_answer_span[:2]], string_table)
answer_offsets: Union[List[Span], List[TableCell]]
if self.return_table_cell:
answer_offsets = _calculate_answer_offsets([current_answer_span[:2]])
else:
answer_offsets = _calculate_answer_offsets_span([current_answer_span[:2]], string_table)
# As the general table score is more important for the final score, it is double weighted.
current_score = ((2 * table_relevancy_prob) + span_logits_softmax[0, answer_span_idx].item()) / 3

@@ -574,14 +606,20 @@ def predict(self, query: str, documents: List[Document], top_k: int) -> Dict:
no_answer_score = current_no_answer_score

if self.return_no_answer:
if self.return_table_cell:
offsets_in_context = None
offsets_in_document = None
else:
offsets_in_context = [Span(start=0, end=0)]
offsets_in_document = [Span(start=0, end=0)]
answers.append(
Answer(
answer="",
type="extractive",
score=no_answer_score,
context=None,
offsets_in_context=[Span(start=0, end=0)],
offsets_in_document=[Span(start=0, end=0)],
offsets_in_context=offsets_in_context,
offsets_in_document=offsets_in_document,
document_ids=None,
meta=None,
)
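To illustrate the branch above: with `return_no_answer=True`, the no-answer prediction keeps the zero-length `Span` placeholders in legacy mode, while the new TableCell mode simply reports `None` offsets. A hedged sketch of the two resulting `Answer` objects (the score value is illustrative):

```python
from haystack.schema import Answer, Span

# Legacy mode (return_table_cell=False): zero-length Span placeholders.
no_answer_legacy = Answer(
    answer="",
    type="extractive",
    score=0.42,
    context=None,
    offsets_in_context=[Span(start=0, end=0)],
    offsets_in_document=[Span(start=0, end=0)],
    document_ids=None,
    meta=None,
)

# New mode (return_table_cell=True): a no-answer carries no offsets at all.
no_answer_table_cell = Answer(
    answer="",
    type="extractive",
    score=0.42,
    context=None,
    offsets_in_context=None,
    offsets_in_document=None,
    document_ids=None,
    meta=None,
)
```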
@@ -649,6 +687,7 @@ def __init__(
top_k: int = 10,
max_seq_len: int = 256,
use_auth_token: Optional[Union[str, bool]] = None,
return_table_cell: bool = False,
):
"""
Load an RCI model from Transformers.
@@ -677,9 +716,23 @@
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
:param return_table_cell: Whether to return the offsets (`offsets_in_document` and `offsets_in_context`) that indicate
the cells that answer the question using the TableCell schema. The TableCell schema returns the row
and column indices of the table cells selected in the Answer. Otherwise, the offsets
are returned as Span objects which are start and end indices when counting through the
table in a linear fashion, which means the first cell is top left and the last cell is bottom right.
"""
super().__init__()

if not return_table_cell:
logger.warning(
"The support for returning offsets in answer predictions in a linear fashion is being deprecated."
" Set return_table_cell=True to use the new offsets format which returns the row and column indices"
" of the table cells selected in the answer."
" In the future, return_table_cell=True will become default and return_table_cell=False will no "
" longer be supported."
)

self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
if len(self.devices) > 1:
logger.warning(
@@ -722,6 +775,7 @@ def __init__(
self.top_k = top_k
self.max_seq_len = max_seq_len
self.return_no_answers = False
self.return_table_cell = return_table_cell

def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict:
"""
@@ -787,15 +841,19 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
cell_scores_table[-1].append(current_cell_score)

answer_str = string_table.iloc[row_idx, col_idx]
answer_offsets = self._calculate_answer_offsets(row_idx, col_idx, string_table)
answer_offsets: Union[List[Span], List[TableCell]]
if self.return_table_cell:
answer_offsets = [TableCell(row=row_idx, col=col_idx)]
else:
answer_offsets = [self._calculate_answer_offsets_span(row_idx, col_idx, string_table)]
current_answers.append(
Answer(
answer=answer_str,
type="extractive",
score=current_cell_score,
context=string_table,
offsets_in_document=[answer_offsets],
offsets_in_context=[answer_offsets],
offsets_in_document=answer_offsets,
offsets_in_context=answer_offsets,
document_ids=[document.id],
)
)
@@ -831,7 +889,7 @@ def _create_row_column_representations(table: pd.DataFrame) -> Tuple[List[str],
return row_reps, column_reps

@staticmethod
def _calculate_answer_offsets(row_idx, column_index, table) -> Span:
def _calculate_answer_offsets_span(row_idx, column_index, table) -> Span:
_, n_columns = table.shape
answer_cell_offset = (row_idx * n_columns) + column_index

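The renamed helper above spells out how the two offset formats relate: the legacy `Span` counts cells left to right, top to bottom, so a cell's linear position is `row_idx * n_columns + column_index`. A small standalone example of that arithmetic (not code from the PR):

```python
import pandas as pd

table = pd.DataFrame(
    [["Brad Pitt", "59"], ["Leonardo DiCaprio", "48"], ["George Clooney", "62"]],
    columns=["Actor", "Age"],
)
_, n_columns = table.shape  # 2 columns

row_idx, column_index = 2, 1                        # the "62" cell, bottom right
linear_offset = row_idx * n_columns + column_index  # 2 * 2 + 1 = 5
print(linear_offset)                     # 5 -> sixth cell when walking the table row by row
print(table.iat[row_idx, column_index])  # "62"
```

A `TableCell(row=2, col=1)` therefore corresponds to a legacy `Span` starting at linear offset 5.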
@@ -868,7 +926,19 @@ def predict_batch(
return results


def _calculate_answer_offsets(answer_coordinates: List[Tuple[int, int]], table: pd.DataFrame) -> List[Span]:
def _calculate_answer_offsets(answer_coordinates: List[Tuple[int, int]]) -> List[TableCell]:
"""
Calculates the answer cell offsets of the linearized table based on the answer cell coordinates.

:param answer_coordinates: List of answer coordinates.
"""
answer_offsets = []
for coord in answer_coordinates:
answer_offsets.append(TableCell(row=coord[0], col=coord[1]))
return answer_offsets


def _calculate_answer_offsets_span(answer_coordinates: List[Tuple[int, int]], table: pd.DataFrame) -> List[Span]:
"""
Calculates the answer cell offsets of the linearized table based on the answer cell coordinates.

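Putting the reader changes together, here is a hedged usage sketch of the new toggle. The model name, the shape of the `predict()` return value, and the exact prediction are assumptions beyond what this diff shows:

```python
import pandas as pd

from haystack import Document
from haystack.nodes import TableReader

table = pd.DataFrame({"Actor": ["Brad Pitt", "George Clooney"], "Age": ["59", "62"]})
docs = [Document(content=table, content_type="table")]
query = "How old is George Clooney?"

# Legacy behaviour (still the default): offsets come back as linearized Spans
# and the deprecation warning above is logged.
span_reader = TableReader("google/tapas-base-finetuned-wtq", return_table_cell=False)
span_answer = span_reader.predict(query=query, documents=docs, top_k=1)["answers"][0]
print(span_answer.offsets_in_document[0].start)  # linear cell index

# New behaviour: offsets are TableCell objects with row and column indices.
cell_reader = TableReader("google/tapas-base-finetuned-wtq", return_table_cell=True)
cell_answer = cell_reader.predict(query=query, documents=docs, top_k=1)["answers"][0]
print(cell_answer.offsets_in_document[0].row, cell_answer.offsets_in_document[0].col)
```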
14 changes: 10 additions & 4 deletions haystack/pipelines/base.py
@@ -1621,8 +1621,10 @@ def _build_eval_dataframe(
df_answers["gold_contexts_similarity"] = df_answers.map_rows(
lambda row: [
calculate_context_similarity(
str(gold_context), # could be dataframe
str(row["context"]) if row["context"] is not None else "", # could be dataframe
str(gold_context), # could be dataframe or list of lists
str(row["context"])
if row["context"] is not None
else "", # could be dataframe or list of lists
min_length=context_matching_min_length,
boost_split_overlaps=context_matching_boost_split_overlaps,
)
@@ -1759,9 +1761,13 @@ def _build_eval_dataframe(
df_docs["gold_contexts_similarity"] = df_docs.map_rows(
lambda row: [
calculate_context_similarity(
str(gold_context) if isinstance(gold_context, pd.DataFrame) else gold_context,
str(gold_context)
if isinstance(gold_context, (pd.DataFrame, list))
else gold_context, # could be dataframe or list of lists
str(row["context"])
if isinstance(row["context"], pd.DataFrame)
if isinstance(
row["context"], (pd.DataFrame, list)
) # could be dataframe or list of lists
else row["context"] or "",
min_length=context_matching_min_length,
boost_split_overlaps=context_matching_boost_split_overlaps,
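The intent of this change is that a gold context or a predicted context may now be a `pandas.DataFrame` or a list of lists (the table content types), and both sides are cast to `str` before fuzzy matching. A hedged sketch of that comparison; the import path for `calculate_context_similarity` is an assumption about the Haystack v1 layout, not something shown in this diff:

```python
import pandas as pd

from haystack.utils.context_matching import calculate_context_similarity  # assumed module path

gold_context = [["Actor", "Age"], ["Brad Pitt", "59"]]                     # list-of-lists table
predicted_context = pd.DataFrame({"Actor": ["Brad Pitt"], "Age": ["59"]})  # DataFrame table

# Mirrors _build_eval_dataframe: table-like contexts are stringified first.
score = calculate_context_similarity(
    str(gold_context),
    str(predicted_context) if predicted_context is not None else "",
    min_length=100,
    boost_split_overlaps=True,
)
print(score)
```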