From e84f8ed33305072b52cec481690301a4095d2c4a Mon Sep 17 00:00:00 2001 From: lalitpagaria Date: Fri, 30 Oct 2020 16:23:34 +0100 Subject: [PATCH] Removing warnings from the Haystack codebase. 1. A few warnings need a fix in FARM. 2. Can't remove the warning from the docx library. --- haystack/document_store/elasticsearch.py | 2 +- haystack/preprocessor/utils.py | 8 +++++--- haystack/reader/farm.py | 1 + haystack/retriever/dense.py | 6 ++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/haystack/document_store/elasticsearch.py b/haystack/document_store/elasticsearch.py index a1ce4e7793..3ccacadf6d 100644 --- a/haystack/document_store/elasticsearch.py +++ b/haystack/document_store/elasticsearch.py @@ -304,7 +304,7 @@ def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[s def update_document_meta(self, id: str, meta: Dict[str, str]): body = {"doc": meta} - self.client.update(index=self.index, doc_type="_doc", id=id, body=body, refresh=self.refresh_type) + self.client.update(index=self.index, id=id, body=body, refresh=self.refresh_type) def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int: index = index or self.index diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py index 685e4396b3..e57743e470 100644 --- a/haystack/preprocessor/utils.py +++ b/haystack/preprocessor/utils.py @@ -196,12 +196,14 @@ def tika_convert_files_to_dicts( last_para = '' for para in paras: para = para.strip() - if not para: continue + if not para: + continue # merge paragraphs to improve qa # merge this paragraph if less than 10 characters or 2 words # or this paragraph starts with a lower case and last paragraph does not end with a punctuation - if merge_short and len(para) < 10 or len(re.findall('\s+', para)) < 2 \ - or merge_lowercase and para and para[0].islower() and last_para and last_para[-1] not in '.?!"\'\]\)': + if merge_short and len(para) < 10 or len(re.findall(r'\s+', 
para)) < 2 \ + or merge_lowercase and para and para[0].islower() and last_para \ + and last_para[-1] not in r'.?!"\'\]\)': last_para += ' ' + para else: if last_para: diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index cb789c9651..00fdee0142 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -338,6 +338,7 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int] inputs.append(cur) # get answers from QA model + # TODO: Need fix in FARM's `to_dict` function of `QAInput` class predictions = self.inferencer.inference_from_objects( objects=inputs, return_json=False, multiprocessing_chunksize=1 ) diff --git a/haystack/retriever/dense.py b/haystack/retriever/dense.py index 216277d953..43468a798d 100644 --- a/haystack/retriever/dense.py +++ b/haystack/retriever/dense.py @@ -5,13 +5,9 @@ from pathlib import Path from tqdm import tqdm -from farm.infer import Inferencer - from haystack.document_store.base import BaseDocumentStore from haystack import Document -from haystack.document_store.elasticsearch import ElasticsearchDocumentStore from haystack.retriever.base import BaseRetriever -from haystack.retriever.sparse import logger from farm.infer import Inferencer from farm.modeling.tokenization import Tokenizer @@ -374,6 +370,8 @@ def embed(self, texts: Union[List[str], str]) -> List[np.array]: assert type(texts) == list, "Expecting a list of texts, i.e. create_embeddings(texts=['text1',...])" if self.model_format == "farm" or self.model_format == "transformers": + # TODO: FARM's `sample_to_features_text` need to fix following warning - + # tokenization_utils.py:460: FutureWarning: `is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead. emb = self.embedding_model.inference_from_dicts(dicts=[{"text": t} for t in texts]) # type: ignore emb = [(r["vec"]) for r in emb] elif self.model_format == "sentence_transformers":