From c4685976e225967ff7cec3e24820ab0d18a912d9 Mon Sep 17 00:00:00 2001
From: Young Joon Lee
Date: Thu, 27 Jul 2023 17:24:04 +0900
Subject: [PATCH] feat(corprep): add find_similar_docs configuration

---
Reviewer note (ignored by git am): hunk sizes and the diffstat reflect the
added YAML comments; the index hashes are those of the original blobs.

 src/corprep/conf/pipe/find_similar_docs.yaml | 11 +++++++++++
 src/corprep/conf/run/find_similar_docs.yaml  | 22 ++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 src/corprep/conf/pipe/find_similar_docs.yaml
 create mode 100644 src/corprep/conf/run/find_similar_docs.yaml

diff --git a/src/corprep/conf/pipe/find_similar_docs.yaml b/src/corprep/conf/pipe/find_similar_docs.yaml
new file mode 100644
index 0000000..5c78b09
--- /dev/null
+++ b/src/corprep/conf/pipe/find_similar_docs.yaml
@@ -0,0 +1,11 @@
+# Pipe config for the find_similar_docs step.
+# NOTE(review): looks like a Hydra-style defaults list pulling in the shared
+# __general_external_funcs__ config and the /run group entry — confirm.
+defaults:
+  - __general_external_funcs__
+  - /run: find_similar_docs
+
+# NOTE(review): presumably control whether the piped object is passed to the
+# target and whether it is returned afterwards — verify against the runner.
+use_pipe_obj: true
+return_pipe_obj: false
diff --git a/src/corprep/conf/run/find_similar_docs.yaml b/src/corprep/conf/run/find_similar_docs.yaml
new file mode 100644
index 0000000..2f74b23
--- /dev/null
+++ b/src/corprep/conf/run/find_similar_docs.yaml
@@ -0,0 +1,22 @@
+# Run config for corprep.datasets.similarity.find_similar_docs.
+# NOTE(review): keys after _target_ are presumably passed to that callable as
+# keyword arguments — confirm against its signature.
+_target_: corprep.datasets.similarity.find_similar_docs
+num_workers: 2
+min_num_docs: 5
+percentile: 80
+# No threshold is set; null is spelled out instead of a bare empty value.
+distance_threshold: null
+linkage: average
+grouping_freq: W
+grouping_name: Week
+date_col: createdDt
+token_col: nouns
+id_col: newsId
+ordering_col: createdDt_int
+duplicate_col: duplicate
+fig_col: fig_filename
+output_dir: .
+show_fig: false
+save_fig: false
+verbose: false