diff --git a/skills/dff_meeting_analysis_skill/scenario/response.py b/skills/dff_meeting_analysis_skill/scenario/response.py
index 1721e3a4d..f503a5f0a 100644
--- a/skills/dff_meeting_analysis_skill/scenario/response.py
+++ b/skills/dff_meeting_analysis_skill/scenario/response.py
@@ -84,7 +84,6 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
     dialog = int_ctx.get_dialog(ctx, actor)
     context = dialog.get("utterances", [])[-N_UTTERANCES_CONTEXT:]
 
-    related_files = {}
     prompt_type_local = prompt_type
 
     if context:
@@ -99,13 +98,6 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
         documents_in_use = user_attributes.get("documents_in_use", [])
         docs_combination_ids = user_attributes.get("documents_combination_ids", {})
         all_docs_info = user_attributes.get("processed_documents", {})
-        # related_files will be removed in next pr
-        related_files = (
-            int_ctx.get_last_bot_utterance(ctx, actor)
-            .get("user", {})
-            .get("attributes", {})
-            .get("related_files", {})
-        )
         hyps_and_names_all_docs = []
         if documents_in_use:
             # try to find hypothesis generated for this request & these files earlier
@@ -114,13 +106,12 @@
                 prompt_type_local=prompt_type_local,
                 documents_in_use=documents_in_use,
                 docs_combination_ids=docs_combination_ids,
-                related_files=related_files,
             )
             # if no final hypothesis for this request was found
             if not hypotheses:
                 # for each doc_in_use get a separate hypothesis
                 for document_in_use_id in documents_in_use:
-                    prompt_type_local, hyp_and_name_one_doc, related_files = get_hyp_and_filename_for_one_doc(
+                    prompt_type_local, hyp_and_name_one_doc = get_hyp_and_filename_for_one_doc(
                         request=request,
                         prompt_type_local=prompt_type_local,
                         all_docs_info=all_docs_info,
@@ -128,12 +119,11 @@
                         dialog_context=dialog_context,
                         sending_variables=sending_variables,
                         username=username,
-                        related_files=related_files,
                     )
                     hyps_and_names_all_docs += hyp_and_name_one_doc
                 # having got responses for all docs, let's ask the model to generate one response from it
                 # or return the final response if we have one document and one response
-                hypotheses, related_files = postprocess_hyps_from_all_docs(
+                hypotheses = postprocess_hyps_from_all_docs(
                     hyps_and_names_all_docs=hyps_and_names_all_docs,
                     prompt_type_local=prompt_type_local,
                     docs_combination_ids=docs_combination_ids,
@@ -141,7 +131,6 @@
                     dialog_context=dialog_context,
                     sending_variables=sending_variables,
                     username=username,
-                    related_files=related_files,
                 )
         # if there are docs in human utt attributes, but no processed docs in use were found
         elif docs_in_attributes:
@@ -152,8 +141,6 @@
         else:
             hypotheses = []
     logger.info(f"generated hypotheses: {hypotheses}")
-    # related_files will be removed in next pr
-    bot_attrs = {"related_files": related_files}
 
     for hyp in hypotheses:
         if prompt_type == "set_personal_tasks_into_tracker":
@@ -173,7 +160,7 @@
             confidence = LOW_CONFIDENCE
         else:
             confidence = DEFAULT_CONFIDENCE
-        gathering_responses(hyp, confidence, {}, bot_attrs, _curr_attrs)
+        gathering_responses(hyp, confidence, {}, {}, _curr_attrs)
 
     if len(curr_responses) == 0:
         return ""
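Note: with the `response.py` changes above, the skill stops threading a `related_files` cache index through bot attributes. The `utils.py` changes below replace that index by probing the file server directly: generated hypotheses are uploaded under deterministic names (`<prompt_type>__<doc_id>.txt`), so a plain GET doubles as the cache-existence check. A minimal sketch of the resulting lookup-or-generate pattern, assuming a file server that serves uploads back at `/file?file=<name>` as in `utils.py`; the constant values and the `generate_and_upload` callback are illustrative stand-ins, not the skill's exact code:

```python
import requests

FILE_SERVER_URL = "http://files:3000"  # assumption: the real value comes from the skill's environment
FILE_SERVER_TIMEOUT = 30               # assumption: the real value comes from the skill's environment


def get_or_generate(item_type_and_id: str, generate_and_upload) -> str:
    # Hypotheses are stored under deterministic names, so a GET for
    # "<prompt_type>__<doc_id>.txt" doubles as the cache-existence check.
    resp = requests.get(
        f"{FILE_SERVER_URL}/file?file={item_type_and_id}.txt",
        timeout=FILE_SERVER_TIMEOUT,
    )
    if resp.ok:
        return resp.text  # cache hit: reuse the response generated earlier
    return generate_and_upload()  # cache miss: query the LLM and upload the result
```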
diff --git a/skills/dff_meeting_analysis_skill/scenario/utils.py b/skills/dff_meeting_analysis_skill/scenario/utils.py
index 05767cc30..a3c9c8b08 100644
--- a/skills/dff_meeting_analysis_skill/scenario/utils.py
+++ b/skills/dff_meeting_analysis_skill/scenario/utils.py
@@ -73,14 +73,20 @@
 
     management_prompts_dict[key]["prompt_concatenate"] = prompt_dict["prompt_concatenate"]
 
 
-def get_older_gen_response(item_type_and_id, bot_attrs_files):
-    hypothesis_link = bot_attrs_files[item_type_and_id]
-    old_response = requests.get(hypothesis_link, timeout=FILE_SERVER_TIMEOUT).text
-    logger.info(f"Found and downloaded {item_type_and_id} generated earlier.")
+def search_for_prev_gen_response(item_type_and_id: str) -> str:
+    link_to_check = f"{FILE_SERVER_URL}/file?file={item_type_and_id}.txt"
+    old_response = requests.get(link_to_check, timeout=FILE_SERVER_TIMEOUT)
+    if old_response.ok:
+        old_response = old_response.text
+        logger.info(f"Found and downloaded {item_type_and_id} generated earlier.")
+    else:
+        old_response = None
+        logger.info(f"No earlier {item_type_and_id} found. Ignore the warning above, no error occurred.")
+    return old_response
 
 
-def set_correct_type_and_id(request, prompt_type_local, document_in_use_id=None):
+def set_correct_type_and_id(request: str, prompt_type_local: str, document_in_use_id=None) -> Tuple[str, str]:
     prompt_type_and_id = ""
 
     # for this response function, set_personal_tasks_into_tracker is equivalent to personal_future_tasks
@@ -182,13 +188,13 @@
     return all_gpt_responses[-1]
 
 
-def upload_generated_item_return_link(hypothesis: str, prompt_type_and_id: str):
+def upload_generated_item_return_link(hypothesis: str, prompt_type_and_id: str) -> str:
     # we do not upload question_answering as questions may vary
     # we do not upload combine_responses because combine_responses type is only used for internal processing
     # the response generated in combine_responses will be uploaded later at its original name
     uploaded_doc_link = ""
     if "combine_responses" not in prompt_type_and_id and "question_answering" not in prompt_type_and_id:
-        logger.info(f"Saving {prompt_type_and_id} to related_files.")
+        logger.info(f"Uploading {prompt_type_and_id} to the file server.")
         filename = f"{prompt_type_and_id}.txt"
         uploaded_doc_link = upload_document(hypothesis, filename, FILE_SERVER_URL, FILE_SERVER_TIMEOUT, type_ref="text")
     return uploaded_doc_link
@@ -199,10 +205,9 @@ def compose_and_upload_final_response(
     prompt_type_and_id: str,
     dialog_context: List[str],
     sending_variables: dict,
-    bot_attrs_files: dict,
     use_filenames: bool = True,
     username: str = None,
-) -> Tuple[List[str], dict]:
+) -> List[str]:
     # note that we are joining responses for all docs by a special character SEP_FOR_DOC_RESPONSES
     # when we are sending them to LLM, if we need to split the info into chunks, we
     # will do that by SEP_FOR_DOC_RESPONSES, not by newline
@@ -216,18 +221,15 @@
         prompt_type_and_id_for_processing = f"combine_responses__{prompt_type_and_id.split('__')[1]}"
     else:
         prompt_type_and_id_for_processing = prompt_type_and_id
-    hyp_combined, bot_attrs_files = get_and_upload_response_for_one_doc(
+    hyp_combined = get_and_upload_response_for_one_doc(
         hyps_from_all_docs,
         prompt_type_and_id_for_processing,
         dialog_context,
         sending_variables,
-        bot_attrs_files,
         username,
     )
-    uploaded_doc_link = upload_generated_item_return_link(hyp_combined, prompt_type_and_id)
-    if uploaded_doc_link:
-        bot_attrs_files[prompt_type_and_id] = uploaded_doc_link
-    return [hyp_combined], bot_attrs_files
+    upload_generated_item_return_link(hyp_combined, prompt_type_and_id)
+    return [hyp_combined]
 
 
 def get_and_upload_response_for_one_doc(
@@ -235,9 +237,8 @@ def get_and_upload_response_for_one_doc(
     orig_text: str,
     prompt_type_and_id: str,
     dialog_context: List[str],
     sending_variables: dict,
-    bot_attrs_files: dict,
     username: str = None,
-) -> Tuple[str, dict]:
+) -> str:
     prompt_type = prompt_type_and_id.split("__")[0]
     document_in_use_id = prompt_type_and_id.split("__")[1]
     # hard-coded limit: we preserve 1000 tokens for LLM answer
@@ -256,11 +257,10 @@
 
         for item in INCLUDE_INTO_REPORT:
             item_type_and_id = f"{item}__{document_in_use_id}"
-            if item_type_and_id in bot_attrs_files.keys():
-                part_of_report = get_older_gen_response(item_type_and_id, bot_attrs_files)
+            part_of_report = search_for_prev_gen_response(item_type_and_id)
+            if part_of_report:
                 hypothesis += f"{part_of_report}\n\n"
             else:
-                logger.info(f"No earlier {item_type_and_id} for full_report found.")
                 part_of_report = get_response_for_prompt_type(
                     transcript_chunks=transcript_chunks,
                     prompt_type=item,
@@ -269,9 +269,7 @@
                     username=username,
                     format_the_response=True,
                 )
-                uploaded_doc_link = upload_generated_item_return_link(part_of_report, item_type_and_id)
-                if uploaded_doc_link:
-                    bot_attrs_files[item_type_and_id] = uploaded_doc_link
+                upload_generated_item_return_link(part_of_report, item_type_and_id)
             hypothesis += f"{part_of_report}\n\n"
         hypothesis = hypothesis.strip()
     else:
@@ -285,10 +283,8 @@
         )
 
     # we save each hyp to server under the name of the request and doc_in_use id
-    uploaded_doc_link = upload_generated_item_return_link(hypothesis, prompt_type_and_id)
-    if uploaded_doc_link:
-        bot_attrs_files[prompt_type_and_id] = uploaded_doc_link
-    return hypothesis, bot_attrs_files
+    upload_generated_item_return_link(hypothesis, prompt_type_and_id)
+    return hypothesis
 
 
 def get_name_and_text_from_file(transcript_link: str) -> Tuple[str, str]:
@@ -316,16 +312,8 @@
     return username
 
 
-def search_for_gen_response_prompt_type(prompt_type_and_id: str, related_files: dict) -> List[str]:
-    older_gen_response = None
-    if prompt_type_and_id in related_files.keys():
-        older_gen_response = [get_older_gen_response(prompt_type_and_id, related_files)]
-
-    return older_gen_response
-
-
 def older_gen_response_for_request(
-    request: str, prompt_type_local: str, documents_in_use: List[str], docs_combination_ids: dict, related_files: dict
+    request: str, prompt_type_local: str, documents_in_use: List[str], docs_combination_ids: dict
 ) -> List[str]:
     older_gen_response = None
     # check if have final hypothesis for this request in case of multiple docs in use
@@ -333,13 +321,9 @@
     if len(documents_in_use) > 1:
         prompt_type_local, _ = set_correct_type_and_id(request, prompt_type_local)
         curr_combination_id = get_key_by_value(docs_combination_ids, documents_in_use)
         prompt_type_and_id = f"{prompt_type_local}__{curr_combination_id}"
-        older_gen_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
-        # check if have final hypothesis for this request in case of one doc in use
-    else:
-        prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
-            request, prompt_type_local, document_in_use_id=documents_in_use[0]
-        )
-        older_gen_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
+        prev_response = search_for_prev_gen_response(prompt_type_and_id)
+        older_gen_response = [prev_response] if prev_response else None
+    # in case we have one doc_in_use, it will be checked later anyway
 
     return older_gen_response
@@ -351,7 +334,6 @@ def postprocess_hyps_from_all_docs(
     dialog_context: List[str],
     sending_variables: dict,
     username: str,
-    related_files: dict,
-) -> Tuple[List[str], dict]:
+) -> List[str]:
     if len(hyps_and_names_all_docs) == 1 and prompt_type_local != "weekly_report":
         hypotheses_init = [hyps_and_names_all_docs[0][1]]
@@ -364,19 +346,18 @@
             # now by default we are passing filenames to LLM together with hypothesis for each file
             # you can choose to pass only hypotheses (no filenames) by setting use_filenames=False
            # when calling compose_and_upload_final_response()
-            hypotheses_init, related_files = compose_and_upload_final_response(
+            hypotheses_init = compose_and_upload_final_response(
                 hyps_and_names_all_docs=hyps_and_names_all_docs,
                 prompt_type_and_id=prompt_type_and_id,
                 dialog_context=dialog_context,
                 sending_variables=sending_variables,
-                related_files=related_files,
                 username=username,
             )
         except Exception as e:
             sentry_sdk.capture_exception(e)
             logger.exception(e)
             hypotheses_init = []
-    return hypotheses_init, related_files
+    return hypotheses_init
 
 
 def get_hyp_and_filename_for_one_doc(
@@ -387,8 +368,7 @@ def get_hyp_and_filename_for_one_doc(
     dialog_context: List[str],
     sending_variables: dict,
     username: str,
-    related_files: dict,
-) -> Tuple[str, Tuple[str, str], dict]:
+) -> Tuple[str, Tuple[str, str]]:
     # if we need a weekly report, on this step we gather separate daily reports for each doc
     # also here we change the type of summary prompt based on summary length request
     prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
@@ -398,23 +378,21 @@
     transcript_link = all_docs_info[document_in_use_id].get("processed_text_link", "")
     if transcript_link:
         # here we check if we already generated sth for the same request and the same doc
-        older_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
-        if older_response:
-            # in the future, it is better to store filenames in related_files
+        prev_response = search_for_prev_gen_response(prompt_type_and_id)
+        if prev_response:
+            # TODO: in the future, it would be better to store filenames somewhere as well
             # to avoid extra requests to file server
             filename, _ = get_name_and_text_from_file(transcript_link)
-            hyp_and_name_one_doc = [(filename, older_response)]
+            hyp_and_name_one_doc = [(filename, prev_response)]
         # if no, let's generate it
         else:
-            logger.info(f"No earlier {prompt_type_and_id} found. Sending request to generative model.")
             try:
                 filename, orig_text = get_name_and_text_from_file(transcript_link)
-                hyp_one_doc, related_files = get_and_upload_response_for_one_doc(
+                hyp_one_doc = get_and_upload_response_for_one_doc(
                     orig_text=orig_text,
                     prompt_type_and_id=prompt_type_and_id,
                     dialog_context=dialog_context,
                     sending_variables=sending_variables,
-                    bot_attrs_files=related_files,
                     username=username,
                 )
                 hyp_and_name_one_doc = [(filename, hyp_one_doc)]
@@ -424,4 +402,4 @@
                 hyp_and_name_one_doc = []
     else:
         hyp_and_name_one_doc = []
-    return prompt_type_local, hyp_and_name_one_doc, related_files
+    return prompt_type_local, hyp_and_name_one_doc
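Design note: the cache index moves from per-dialog bot attributes to the file server itself, trading one extra GET per lookup for signatures free of `related_files`/`bot_attrs_files` plumbing and for no stale links carried between dialogs. After this patch, the per-document flow looks roughly like the sketch below; `search_for_prev_gen_response` and `get_and_upload_response_for_one_doc` are the real (post-patch) helpers from `utils.py`, while the wrapper function and the import path are hypothetical:

```python
# Hypothetical wrapper illustrating the lookup-or-generate flow; the import
# path is illustrative and depends on how the skill packages its modules.
from scenario.utils import get_and_upload_response_for_one_doc, search_for_prev_gen_response


def hyp_for_one_doc(prompt_type_and_id, orig_text, dialog_context, sending_variables, username):
    # Reuse a hypothesis generated earlier for this request + document, if the
    # file server has one stored under "<prompt_type>__<doc_id>.txt".
    prev_response = search_for_prev_gen_response(prompt_type_and_id)
    if prev_response:
        return prev_response
    # Otherwise generate a fresh one; this helper also uploads the result
    # under the same deterministic name, so the next lookup will hit.
    return get_and_upload_response_for_one_doc(
        orig_text=orig_text,
        prompt_type_and_id=prompt_type_and_id,
        dialog_context=dialog_context,
        sending_variables=sending_variables,
        username=username,
    )
```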