
Fix/get rid of related files (#152)
* check if file with gen text is present on server instead of writing links to bot_attrs

* old response = None if there is no file on server
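In short: instead of writing a link to every generated text into the bot attributes ("related_files") and threading that dict through every call, the skill now rebuilds the deterministic filename and asks the file server whether that file exists. A minimal sketch of the new lookup, not the exact skill code (the /file?file=<name>.txt route is taken from this diff; the server address and timeout are assumptions):

    import requests

    FILE_SERVER_URL = "http://files:3000"  # assumed address of the shared file server
    FILE_SERVER_TIMEOUT = 30  # seconds; assumed value

    def fetch_prev_generation(item_type_and_id: str):
        # A single GET answers both questions at once:
        # does <item_type_and_id>.txt exist on the server, and if so, what is its text?
        url = f"{FILE_SERVER_URL}/file?file={item_type_and_id}.txt"
        response = requests.get(url, timeout=FILE_SERVER_TIMEOUT)
        return response.text if response.ok else None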
smilni committed Dec 7, 2023
1 parent 4436e8c commit 1cd4b66
Showing 2 changed files with 38 additions and 73 deletions.
skills/dff_meeting_analysis_skill/scenario/response.py (19 changes: 3 additions & 16 deletions)
@@ -84,7 +84,6 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
 
     dialog = int_ctx.get_dialog(ctx, actor)
     context = dialog.get("utterances", [])[-N_UTTERANCES_CONTEXT:]
-    related_files = {}
     prompt_type_local = prompt_type
 
     if context:
@@ -99,13 +98,6 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
         documents_in_use = user_attributes.get("documents_in_use", [])
         docs_combination_ids = user_attributes.get("documents_combination_ids", {})
         all_docs_info = user_attributes.get("processed_documents", {})
-        # related_files will be removed in next pr
-        related_files = (
-            int_ctx.get_last_bot_utterance(ctx, actor)
-            .get("user", {})
-            .get("attributes", {})
-            .get("related_files", {})
-        )
         hyps_and_names_all_docs = []
         if documents_in_use:
             # try to find hypothesis generated for this request & these files earlier
@@ -114,34 +106,31 @@
                 prompt_type_local=prompt_type_local,
                 documents_in_use=documents_in_use,
                 docs_combination_ids=docs_combination_ids,
-                related_files=related_files,
             )
             # if no final hypothesis for this request was found
             if not hypotheses:
                 # for each doc_in_use get a separate hypothesis
                 for document_in_use_id in documents_in_use:
-                    prompt_type_local, hyp_and_name_one_doc, related_files = get_hyp_and_filename_for_one_doc(
+                    prompt_type_local, hyp_and_name_one_doc = get_hyp_and_filename_for_one_doc(
                         request=request,
                         prompt_type_local=prompt_type_local,
                         all_docs_info=all_docs_info,
                         document_in_use_id=document_in_use_id,
                         dialog_context=dialog_context,
                         sending_variables=sending_variables,
                         username=username,
-                        related_files=related_files,
                     )
                     hyps_and_names_all_docs += hyp_and_name_one_doc
                 # having got responses for all docs, let's ask the model to generate one response from it
                 # or return the final response if we have one document and one response
-                hypotheses, related_files = postprocess_hyps_from_all_docs(
+                hypotheses = postprocess_hyps_from_all_docs(
                     hyps_and_names_all_docs=hyps_and_names_all_docs,
                     prompt_type_local=prompt_type_local,
                     docs_combination_ids=docs_combination_ids,
                     documents_in_use=documents_in_use,
                     dialog_context=dialog_context,
                     sending_variables=sending_variables,
                     username=username,
-                    related_files=related_files,
                 )
         # if there are docs in human utt attributes, but no processed docs in use were found
         elif docs_in_attributes:
@@ -152,8 +141,6 @@
     else:
         hypotheses = []
     logger.info(f"generated hypotheses: {hypotheses}")
-    # related_files will be removed in next pr
-    bot_attrs = {"related_files": related_files}
 
     for hyp in hypotheses:
         if prompt_type == "set_personal_tasks_into_tracker":
@@ -173,7 +160,7 @@
                 confidence = LOW_CONFIDENCE
             else:
                 confidence = DEFAULT_CONFIDENCE
-        gathering_responses(hyp, confidence, {}, bot_attrs, _curr_attrs)
+        gathering_responses(hyp, confidence, {}, {}, _curr_attrs)
 
     if len(curr_responses) == 0:
         return ""
skills/dff_meeting_analysis_skill/scenario/utils.py (92 changes: 35 additions & 57 deletions)
@@ -73,14 +73,20 @@
     management_prompts_dict[key]["prompt_concatenate"] = prompt_dict["prompt_concatenate"]
 
 
-def get_older_gen_response(item_type_and_id, bot_attrs_files):
-    hypothesis_link = bot_attrs_files[item_type_and_id]
-    old_response = requests.get(hypothesis_link, timeout=FILE_SERVER_TIMEOUT).text
-    logger.info(f"Found and downloaded {item_type_and_id} generated earlier.")
+def search_for_prev_gen_response(item_type_and_id: str) -> str:
+    link_to_check = f"{FILE_SERVER_URL}/file?file={item_type_and_id}.txt"
+    old_response = requests.get(link_to_check, timeout=FILE_SERVER_TIMEOUT)
+    if old_response.ok:
+        old_response = old_response.text
+        logger.info(f"Found and downloaded {item_type_and_id} generated earlier.")
+    else:
+        old_response = None
+        logger.info(f"No earlier {item_type_and_id} found. Ignore the warning above, no error occurred.")
+
     return old_response
 
 
-def set_correct_type_and_id(request, prompt_type_local, document_in_use_id=None):
+def set_correct_type_and_id(request: str, prompt_type_local: str, document_in_use_id=None) -> Tuple[str, str]:
     prompt_type_and_id = ""
 
     # for this response function, set_personal_tasks_into_tracker is equivalent to personal_future_tasks
@@ -182,13 +188,13 @@ def get_response_for_prompt_type(
     return all_gpt_responses[-1]
 
 
-def upload_generated_item_return_link(hypothesis: str, prompt_type_and_id: str):
+def upload_generated_item_return_link(hypothesis: str, prompt_type_and_id: str) -> str:
     # we do not upload question_answering as questions may vary
     # we do not upload combine_responses because combine_responses type is only used for internal processing
     # the response generated in combine_responses will be uploaded later under its original name
     uploaded_doc_link = ""
     if "combine_responses" not in prompt_type_and_id and "question_answering" not in prompt_type_and_id:
-        logger.info(f"Saving {prompt_type_and_id} to related_files.")
+        logger.info(f"Uploading {prompt_type_and_id} to server.")
         filename = f"{prompt_type_and_id}.txt"
         uploaded_doc_link = upload_document(hypothesis, filename, FILE_SERVER_URL, FILE_SERVER_TIMEOUT, type_ref="text")
     return uploaded_doc_link
@@ -199,10 +205,9 @@ def compose_and_upload_final_response(
     prompt_type_and_id: str,
     dialog_context: List[str],
     sending_variables: dict,
-    bot_attrs_files: dict,
     use_filenames: bool = True,
     username: str = None,
-) -> Tuple[List[str], dict]:
+) -> List[str]:
     # note that we are joining responses for all docs by a special character SEP_FOR_DOC_RESPONSES
     # when we are sending them to LLM, if we need to split the info into chunks, we
     # will do that by SEP_FOR_DOC_RESPONSES, not by newline
@@ -216,28 +221,24 @@
         prompt_type_and_id_for_processing = f"combine_responses__{prompt_type_and_id.split('__')[1]}"
     else:
         prompt_type_and_id_for_processing = prompt_type_and_id
-    hyp_combined, bot_attrs_files = get_and_upload_response_for_one_doc(
+    hyp_combined = get_and_upload_response_for_one_doc(
         hyps_from_all_docs,
         prompt_type_and_id_for_processing,
         dialog_context,
         sending_variables,
-        bot_attrs_files,
         username,
     )
-    uploaded_doc_link = upload_generated_item_return_link(hyp_combined, prompt_type_and_id)
-    if uploaded_doc_link:
-        bot_attrs_files[prompt_type_and_id] = uploaded_doc_link
-    return [hyp_combined], bot_attrs_files
+    upload_generated_item_return_link(hyp_combined, prompt_type_and_id)
+    return [hyp_combined]
 
 
 def get_and_upload_response_for_one_doc(
     orig_text: str,
     prompt_type_and_id: str,
     dialog_context: List[str],
     sending_variables: dict,
-    bot_attrs_files: dict,
     username: str = None,
-) -> Tuple[str, dict]:
+) -> str:
     prompt_type = prompt_type_and_id.split("__")[0]
     document_in_use_id = prompt_type_and_id.split("__")[1]
     # hard-coded limit: we preserve 1000 tokens for LLM answer
@@ -256,11 +257,10 @@
 
         for item in INCLUDE_INTO_REPORT:
             item_type_and_id = f"{item}__{document_in_use_id}"
-            if item_type_and_id in bot_attrs_files.keys():
-                part_of_report = get_older_gen_response(item_type_and_id, bot_attrs_files)
+            part_of_report = search_for_prev_gen_response(item_type_and_id)
+            if part_of_report:
                 hypothesis += f"{part_of_report}\n\n"
             else:
-                logger.info(f"No earlier {item_type_and_id} for full_report found.")
                 part_of_report = get_response_for_prompt_type(
                     transcript_chunks=transcript_chunks,
                     prompt_type=item,
@@ -269,9 +269,7 @@
                     username=username,
                     format_the_response=True,
                 )
-                uploaded_doc_link = upload_generated_item_return_link(part_of_report, item_type_and_id)
-                if uploaded_doc_link:
-                    bot_attrs_files[item_type_and_id] = uploaded_doc_link
+                upload_generated_item_return_link(part_of_report, item_type_and_id)
                 hypothesis += f"{part_of_report}\n\n"
         hypothesis = hypothesis.strip()
     else:
@@ -285,10 +283,8 @@
         )
 
     # we save each hyp to server under the name of the request and doc_in_use id
-    uploaded_doc_link = upload_generated_item_return_link(hypothesis, prompt_type_and_id)
-    if uploaded_doc_link:
-        bot_attrs_files[prompt_type_and_id] = uploaded_doc_link
-    return hypothesis, bot_attrs_files
+    upload_generated_item_return_link(hypothesis, prompt_type_and_id)
+    return hypothesis
 
 
 def get_name_and_text_from_file(transcript_link: str) -> Tuple[str, str]:
@@ -316,30 +312,17 @@ def get_username(last_human_uttr: dict) -> str:
     return username
 
 
-def search_for_gen_response_prompt_type(prompt_type_and_id: str, related_files: dict) -> List[str]:
-    older_gen_response = None
-    if prompt_type_and_id in related_files.keys():
-        older_gen_response = [get_older_gen_response(prompt_type_and_id, related_files)]
-
-    return older_gen_response
-
-
 def older_gen_response_for_request(
-    request: str, prompt_type_local: str, documents_in_use: List[str], docs_combination_ids: dict, related_files: dict
+    request: str, prompt_type_local: str, documents_in_use: List[str], docs_combination_ids: dict
 ) -> List[str]:
     older_gen_response = None
     # check if have final hypothesis for this request in case of multiple docs in use
     if len(documents_in_use) > 1:
         prompt_type_local, _ = set_correct_type_and_id(request, prompt_type_local)
         curr_combination_id = get_key_by_value(docs_combination_ids, documents_in_use)
         prompt_type_and_id = f"{prompt_type_local}__{curr_combination_id}"
-        older_gen_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
-    # check if have final hypothesis for this request in case of one doc in use
-    else:
-        prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
-            request, prompt_type_local, document_in_use_id=documents_in_use[0]
-        )
-        older_gen_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
+        older_gen_response = [search_for_prev_gen_response(prompt_type_and_id)]
+    # in case we have one doc_in_use, it will be checked later anyway
     return older_gen_response
 
 
@@ -351,7 +334,6 @@ def postprocess_hyps_from_all_docs(
     dialog_context: List[str],
     sending_variables: dict,
     username: str,
-    related_files: dict,
 ) -> Tuple[List[str], dict]:
     if len(hyps_and_names_all_docs) == 1 and prompt_type_local != "weekly_report":
         hypotheses_init = [hyps_and_names_all_docs[0][1]]
@@ -364,19 +346,18 @@ def postprocess_hyps_from_all_docs(
             # now by default we are passing filenames to LLM together with hypothesis for each file
             # you can choose to pass only hypotheses (no filenames) by setting use_filenames=False
             # when calling compose_and_upload_final_response()
-            hypotheses_init, related_files = compose_and_upload_final_response(
+            hypotheses_init = compose_and_upload_final_response(
                 hyps_and_names_all_docs=hyps_and_names_all_docs,
                 prompt_type_and_id=prompt_type_and_id,
                 dialog_context=dialog_context,
                 sending_variables=sending_variables,
-                related_files=related_files,
                 username=username,
             )
         except Exception as e:
            sentry_sdk.capture_exception(e)
            logger.exception(e)
            hypotheses_init = []
-    return hypotheses_init, related_files
+    return hypotheses_init
 
 
 def get_hyp_and_filename_for_one_doc(
@@ -387,8 +368,7 @@ def get_hyp_and_filename_for_one_doc(
     dialog_context: List[str],
     sending_variables: dict,
     username: str,
-    related_files: dict,
-) -> Tuple[str, Tuple[str, str], dict]:
+) -> Tuple[str, Tuple[str, str]]:
     # if we need a weekly report, at this step we gather separate daily reports for each doc
     # also here we change the type of summary prompt based on summary length request
     prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
@@ -398,23 +378,21 @@
     transcript_link = all_docs_info[document_in_use_id].get("processed_text_link", "")
     if transcript_link:
         # here we check if we already generated sth for the same request and the same doc
-        older_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
-        if older_response:
-            # in the future, it is better to store filenames in related_files
+        prev_response = search_for_prev_gen_response(prompt_type_and_id)
+        if prev_response:
+            # TODO: in the future, it would be better to store filenames somewhere
             # to avoid extra requests to file server
             filename, _ = get_name_and_text_from_file(transcript_link)
-            hyp_and_name_one_doc = [(filename, older_response)]
+            hyp_and_name_one_doc = [(filename, prev_response)]
         # if no, let's generate it
         else:
-            logger.info(f"No earlier {prompt_type_and_id} found. Sending request to generative model.")
             try:
                 filename, orig_text = get_name_and_text_from_file(transcript_link)
-                hyp_one_doc, related_files = get_and_upload_response_for_one_doc(
+                hyp_one_doc = get_and_upload_response_for_one_doc(
                     orig_text=orig_text,
                     prompt_type_and_id=prompt_type_and_id,
                     dialog_context=dialog_context,
                     sending_variables=sending_variables,
-                    bot_attrs_files=related_files,
                     username=username,
                 )
                 hyp_and_name_one_doc = [(filename, hyp_one_doc)]
@@ -424,4 +402,4 @@
                 hyp_and_name_one_doc = []
     else:
         hyp_and_name_one_doc = []
-    return prompt_type_local, hyp_and_name_one_doc, related_files
+    return prompt_type_local, hyp_and_name_one_doc
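Taken together, the new flow is a stateless round trip: each hypothesis is uploaded under a name derived from the prompt type and the document (or combination) id, and a later turn rebuilds that same name and probes the server for it, so nothing has to persist in bot attributes between turns. A rough usage sketch under the same assumptions as above; upload_text here is a hypothetical stand-in for the skill's upload_document helper, whose exact API may differ:

    import requests

    FILE_SERVER_URL = "http://files:3000"  # assumed server address
    FILE_SERVER_TIMEOUT = 30  # seconds; assumed value

    def upload_text(text: str, filename: str) -> str:
        # Hypothetical stand-in for upload_document(); posts the text under a fixed name.
        resp = requests.post(
            f"{FILE_SERVER_URL}/file",
            files={"file": (filename, text)},
            timeout=FILE_SERVER_TIMEOUT,
        )
        resp.raise_for_status()
        return f"{FILE_SERVER_URL}/file?file={filename}"

    prompt_type_and_id = "full_report__doc42"  # hypothetical type-and-id pair
    upload_text("Meeting summary ...", f"{prompt_type_and_id}.txt")

    # A later turn rebuilds the same name and asks the server instead of bot_attrs:
    cached = requests.get(
        f"{FILE_SERVER_URL}/file?file={prompt_type_and_id}.txt", timeout=FILE_SERVER_TIMEOUT
    )
    print(cached.text if cached.ok else "no earlier generation found")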
