
Fix/get rid of related files (#152)
* check if file with gen text is present on server instead of writing links to bot_attrs

* old response = None if there is no file on server
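In short: instead of writing a link to every generated text into the bot attributes ("related_files") and threading that dict through every call, the skill now rebuilds the deterministic filename and asks the file server whether that file exists. A minimal sketch of the new lookup, not the exact skill code (the /file?file=<name>.txt route is taken from this diff; the server address and timeout are assumptions):

    import requests

    FILE_SERVER_URL = "http://files:3000"  # assumed address of the shared file server
    FILE_SERVER_TIMEOUT = 30  # seconds; assumed value

    def fetch_prev_generation(item_type_and_id: str):
        # A single GET answers both questions at once:
        # does <item_type_and_id>.txt exist on the server, and if so, what is its text?
        url = f"{FILE_SERVER_URL}/file?file={item_type_and_id}.txt"
        response = requests.get(url, timeout=FILE_SERVER_TIMEOUT)
        return response.text if response.ok else None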
smilni committed Dec 7, 2023
1 parent 4436e8c commit 1cd4b66
Showing 2 changed files with 38 additions and 73 deletions.
skills/dff_meeting_analysis_skill/scenario/response.py (19 changes: 3 additions & 16 deletions)
@@ -84,7 +84,6 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
 
     dialog = int_ctx.get_dialog(ctx, actor)
     context = dialog.get("utterances", [])[-N_UTTERANCES_CONTEXT:]
-    related_files = {}
     prompt_type_local = prompt_type
 
     if context:
@@ -99,13 +98,6 @@ def gathering_responses(reply, confidence, human_attr, bot_attr, attr):
         documents_in_use = user_attributes.get("documents_in_use", [])
         docs_combination_ids = user_attributes.get("documents_combination_ids", {})
         all_docs_info = user_attributes.get("processed_documents", {})
-        # related_files will be removed in next pr
-        related_files = (
-            int_ctx.get_last_bot_utterance(ctx, actor)
-            .get("user", {})
-            .get("attributes", {})
-            .get("related_files", {})
-        )
         hyps_and_names_all_docs = []
         if documents_in_use:
             # try to find hypothesis generated for this request & these files earlier
@@ -114,34 +106,31 @@
                 prompt_type_local=prompt_type_local,
                 documents_in_use=documents_in_use,
                 docs_combination_ids=docs_combination_ids,
-                related_files=related_files,
             )
             # if no final hypothesis for this request was found
             if not hypotheses:
                 # for each doc_in_use get a separate hypothesis
                 for document_in_use_id in documents_in_use:
-                    prompt_type_local, hyp_and_name_one_doc, related_files = get_hyp_and_filename_for_one_doc(
+                    prompt_type_local, hyp_and_name_one_doc = get_hyp_and_filename_for_one_doc(
                         request=request,
                         prompt_type_local=prompt_type_local,
                         all_docs_info=all_docs_info,
                         document_in_use_id=document_in_use_id,
                         dialog_context=dialog_context,
                         sending_variables=sending_variables,
                         username=username,
-                        related_files=related_files,
                     )
                     hyps_and_names_all_docs += hyp_and_name_one_doc
                 # having got responses for all docs, let's ask the model to generate one response from it
                 # or return the final response if we have one document and one response
-                hypotheses, related_files = postprocess_hyps_from_all_docs(
+                hypotheses = postprocess_hyps_from_all_docs(
                     hyps_and_names_all_docs=hyps_and_names_all_docs,
                     prompt_type_local=prompt_type_local,
                     docs_combination_ids=docs_combination_ids,
                     documents_in_use=documents_in_use,
                     dialog_context=dialog_context,
                     sending_variables=sending_variables,
                     username=username,
-                    related_files=related_files,
                 )
         # if there are docs in human utt attributes, but no processed docs in use were found
         elif docs_in_attributes:
@@ -152,8 +141,6 @@
     else:
         hypotheses = []
     logger.info(f"generated hypotheses: {hypotheses}")
-    # related_files will be removed in next pr
-    bot_attrs = {"related_files": related_files}
 
     for hyp in hypotheses:
         if prompt_type == "set_personal_tasks_into_tracker":
@@ -173,7 +160,7 @@
                 confidence = LOW_CONFIDENCE
             else:
                 confidence = DEFAULT_CONFIDENCE
-        gathering_responses(hyp, confidence, {}, bot_attrs, _curr_attrs)
+        gathering_responses(hyp, confidence, {}, {}, _curr_attrs)
 
     if len(curr_responses) == 0:
         return ""
skills/dff_meeting_analysis_skill/scenario/utils.py (92 changes: 35 additions & 57 deletions)
@@ -73,14 +73,20 @@
     management_prompts_dict[key]["prompt_concatenate"] = prompt_dict["prompt_concatenate"]
 
 
-def get_older_gen_response(item_type_and_id, bot_attrs_files):
-    hypothesis_link = bot_attrs_files[item_type_and_id]
-    old_response = requests.get(hypothesis_link, timeout=FILE_SERVER_TIMEOUT).text
-    logger.info(f"Found and downloaded {item_type_and_id} generated earlier.")
+def search_for_prev_gen_response(item_type_and_id: str) -> str:
+    link_to_check = f"{FILE_SERVER_URL}/file?file={item_type_and_id}.txt"
+    old_response = requests.get(link_to_check, timeout=FILE_SERVER_TIMEOUT)
+    if old_response.ok:
+        old_response = old_response.text
+        logger.info(f"Found and downloaded {item_type_and_id} generated earlier.")
+    else:
+        old_response = None
+        logger.info(f"No earlier {item_type_and_id} found. Ignore the warning above, no error occurred.")
+
     return old_response
 
 
-def set_correct_type_and_id(request, prompt_type_local, document_in_use_id=None):
+def set_correct_type_and_id(request: str, prompt_type_local: str, document_in_use_id=None) -> Tuple[str, str]:
     prompt_type_and_id = ""
 
     # for this response function, set_personal_tasks_into_tracker is equivalent to personal_future_tasks
@@ -182,13 +188,13 @@ def get_response_for_prompt_type(
     return all_gpt_responses[-1]
 
 
-def upload_generated_item_return_link(hypothesis: str, prompt_type_and_id: str):
+def upload_generated_item_return_link(hypothesis: str, prompt_type_and_id: str) -> str:
     # we do not upload question_answering as questions may vary
     # we do not upload combine_responses because combine_responses type is only used for internal processing
     # the response generated in combine_responses will be uploaded later under its original name
     uploaded_doc_link = ""
     if "combine_responses" not in prompt_type_and_id and "question_answering" not in prompt_type_and_id:
-        logger.info(f"Saving {prompt_type_and_id} to related_files.")
+        logger.info(f"Uploading {prompt_type_and_id} to server.")
         filename = f"{prompt_type_and_id}.txt"
         uploaded_doc_link = upload_document(hypothesis, filename, FILE_SERVER_URL, FILE_SERVER_TIMEOUT, type_ref="text")
     return uploaded_doc_link
@@ -199,10 +205,9 @@ def compose_and_upload_final_response(
     prompt_type_and_id: str,
     dialog_context: List[str],
     sending_variables: dict,
-    bot_attrs_files: dict,
     use_filenames: bool = True,
     username: str = None,
-) -> Tuple[List[str], dict]:
+) -> List[str]:
     # note that we are joining responses for all docs by a special character SEP_FOR_DOC_RESPONSES
     # when we are sending them to LLM, if we need to split the info into chunks, we
     # will do that by SEP_FOR_DOC_RESPONSES, not by newline
@@ -216,28 +221,24 @@
         prompt_type_and_id_for_processing = f"combine_responses__{prompt_type_and_id.split('__')[1]}"
     else:
         prompt_type_and_id_for_processing = prompt_type_and_id
-    hyp_combined, bot_attrs_files = get_and_upload_response_for_one_doc(
+    hyp_combined = get_and_upload_response_for_one_doc(
         hyps_from_all_docs,
         prompt_type_and_id_for_processing,
         dialog_context,
         sending_variables,
-        bot_attrs_files,
         username,
     )
-    uploaded_doc_link = upload_generated_item_return_link(hyp_combined, prompt_type_and_id)
-    if uploaded_doc_link:
-        bot_attrs_files[prompt_type_and_id] = uploaded_doc_link
-    return [hyp_combined], bot_attrs_files
+    upload_generated_item_return_link(hyp_combined, prompt_type_and_id)
+    return [hyp_combined]
 
 
 def get_and_upload_response_for_one_doc(
     orig_text: str,
     prompt_type_and_id: str,
     dialog_context: List[str],
     sending_variables: dict,
-    bot_attrs_files: dict,
     username: str = None,
-) -> Tuple[str, dict]:
+) -> str:
     prompt_type = prompt_type_and_id.split("__")[0]
     document_in_use_id = prompt_type_and_id.split("__")[1]
     # hard-coded limit: we preserve 1000 tokens for LLM answer
@@ -256,11 +257,10 @@
 
         for item in INCLUDE_INTO_REPORT:
             item_type_and_id = f"{item}__{document_in_use_id}"
-            if item_type_and_id in bot_attrs_files.keys():
-                part_of_report = get_older_gen_response(item_type_and_id, bot_attrs_files)
+            part_of_report = search_for_prev_gen_response(item_type_and_id)
+            if part_of_report:
                 hypothesis += f"{part_of_report}\n\n"
             else:
-                logger.info(f"No earlier {item_type_and_id} for full_report found.")
                 part_of_report = get_response_for_prompt_type(
                     transcript_chunks=transcript_chunks,
                     prompt_type=item,
@@ -269,9 +269,7 @@
                     username=username,
                     format_the_response=True,
                 )
-                uploaded_doc_link = upload_generated_item_return_link(part_of_report, item_type_and_id)
-                if uploaded_doc_link:
-                    bot_attrs_files[item_type_and_id] = uploaded_doc_link
+                upload_generated_item_return_link(part_of_report, item_type_and_id)
                 hypothesis += f"{part_of_report}\n\n"
         hypothesis = hypothesis.strip()
     else:
@@ -285,10 +283,8 @@
         )
 
     # we save each hyp to server under the name of the request and doc_in_use id
-    uploaded_doc_link = upload_generated_item_return_link(hypothesis, prompt_type_and_id)
-    if uploaded_doc_link:
-        bot_attrs_files[prompt_type_and_id] = uploaded_doc_link
-    return hypothesis, bot_attrs_files
+    upload_generated_item_return_link(hypothesis, prompt_type_and_id)
+    return hypothesis
 
 
 def get_name_and_text_from_file(transcript_link: str) -> Tuple[str, str]:
@@ -316,30 +312,17 @@ def get_username(last_human_uttr: dict) -> str:
     return username
 
 
-def search_for_gen_response_prompt_type(prompt_type_and_id: str, related_files: dict) -> List[str]:
-    older_gen_response = None
-    if prompt_type_and_id in related_files.keys():
-        older_gen_response = [get_older_gen_response(prompt_type_and_id, related_files)]
-
-    return older_gen_response
-
-
 def older_gen_response_for_request(
-    request: str, prompt_type_local: str, documents_in_use: List[str], docs_combination_ids: dict, related_files: dict
+    request: str, prompt_type_local: str, documents_in_use: List[str], docs_combination_ids: dict
 ) -> List[str]:
     older_gen_response = None
     # check if have final hypothesis for this request in case of multiple docs in use
     if len(documents_in_use) > 1:
         prompt_type_local, _ = set_correct_type_and_id(request, prompt_type_local)
         curr_combination_id = get_key_by_value(docs_combination_ids, documents_in_use)
         prompt_type_and_id = f"{prompt_type_local}__{curr_combination_id}"
-        older_gen_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
-    # check if have final hypothesis for this request in case of one doc in use
-    else:
-        prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
-            request, prompt_type_local, document_in_use_id=documents_in_use[0]
-        )
-        older_gen_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
+        older_gen_response = [search_for_prev_gen_response(prompt_type_and_id)]
+    # in case we have one doc_in_use, it will be checked later anyway
     return older_gen_response
 
 
@@ -351,7 +334,6 @@ def postprocess_hyps_from_all_docs(
     dialog_context: List[str],
     sending_variables: dict,
     username: str,
-    related_files: dict,
 ) -> Tuple[List[str], dict]:
     if len(hyps_and_names_all_docs) == 1 and prompt_type_local != "weekly_report":
         hypotheses_init = [hyps_and_names_all_docs[0][1]]
@@ -364,19 +346,18 @@ def postprocess_hyps_from_all_docs(
             # now by default we are passing filenames to LLM together with hypothesis for each file
             # you can choose to pass only hypotheses (no filenames) by setting use_filenames=False
             # when calling compose_and_upload_final_response()
-            hypotheses_init, related_files = compose_and_upload_final_response(
+            hypotheses_init = compose_and_upload_final_response(
                 hyps_and_names_all_docs=hyps_and_names_all_docs,
                 prompt_type_and_id=prompt_type_and_id,
                 dialog_context=dialog_context,
                 sending_variables=sending_variables,
-                related_files=related_files,
                 username=username,
             )
         except Exception as e:
            sentry_sdk.capture_exception(e)
            logger.exception(e)
            hypotheses_init = []
-    return hypotheses_init, related_files
+    return hypotheses_init
 
 
 def get_hyp_and_filename_for_one_doc(
@@ -387,8 +368,7 @@ def get_hyp_and_filename_for_one_doc(
     dialog_context: List[str],
     sending_variables: dict,
     username: str,
-    related_files: dict,
-) -> Tuple[str, Tuple[str, str], dict]:
+) -> Tuple[str, Tuple[str, str]]:
     # if we need a weekly report, at this step we gather separate daily reports for each doc
     # also here we change the type of summary prompt based on summary length request
     prompt_type_local, prompt_type_and_id = set_correct_type_and_id(
@@ -398,23 +378,21 @@
     transcript_link = all_docs_info[document_in_use_id].get("processed_text_link", "")
     if transcript_link:
         # here we check if we already generated sth for the same request and the same doc
-        older_response = search_for_gen_response_prompt_type(prompt_type_and_id, related_files)
-        if older_response:
-            # in the future, it is better to store filenames in related_files
+        prev_response = search_for_prev_gen_response(prompt_type_and_id)
+        if prev_response:
+            # TODO: in the future, it would be better to store filenames somewhere
             # to avoid extra requests to file server
             filename, _ = get_name_and_text_from_file(transcript_link)
-            hyp_and_name_one_doc = [(filename, older_response)]
+            hyp_and_name_one_doc = [(filename, prev_response)]
         # if no, let's generate it
         else:
-            logger.info(f"No earlier {prompt_type_and_id} found. Sending request to generative model.")
             try:
                 filename, orig_text = get_name_and_text_from_file(transcript_link)
-                hyp_one_doc, related_files = get_and_upload_response_for_one_doc(
+                hyp_one_doc = get_and_upload_response_for_one_doc(
                     orig_text=orig_text,
                     prompt_type_and_id=prompt_type_and_id,
                     dialog_context=dialog_context,
                     sending_variables=sending_variables,
-                    bot_attrs_files=related_files,
                     username=username,
                 )
                 hyp_and_name_one_doc = [(filename, hyp_one_doc)]
@@ -424,4 +402,4 @@
                 hyp_and_name_one_doc = []
     else:
         hyp_and_name_one_doc = []
-    return prompt_type_local, hyp_and_name_one_doc, related_files
+    return prompt_type_local, hyp_and_name_one_doc
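Taken together, the new flow is a stateless round trip: each hypothesis is uploaded under a name derived from the prompt type and the document (or combination) id, and a later turn rebuilds that same name and probes the server for it, so nothing has to persist in bot attributes between turns. A rough usage sketch under the same assumptions as above; upload_text here is a hypothetical stand-in for the skill's upload_document helper, whose exact API may differ:

    import requests

    FILE_SERVER_URL = "http://files:3000"  # assumed server address
    FILE_SERVER_TIMEOUT = 30  # seconds; assumed value

    def upload_text(text: str, filename: str) -> str:
        # Hypothetical stand-in for upload_document(); posts the text under a fixed name.
        resp = requests.post(
            f"{FILE_SERVER_URL}/file",
            files={"file": (filename, text)},
            timeout=FILE_SERVER_TIMEOUT,
        )
        resp.raise_for_status()
        return f"{FILE_SERVER_URL}/file?file={filename}"

    prompt_type_and_id = "full_report__doc42"  # hypothetical type-and-id pair
    upload_text("Meeting summary ...", f"{prompt_type_and_id}.txt")

    # A later turn rebuilds the same name and asks the server instead of bot_attrs:
    cached = requests.get(
        f"{FILE_SERVER_URL}/file?file={prompt_type_and_id}.txt", timeout=FILE_SERVER_TIMEOUT
    )
    print(cached.text if cached.ok else "no earlier generation found")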
