Skip to content

Commit

Permalink
feat(config): add new pipeline and task configuration for dataset simulation
Browse files Browse the repository at this point in the history
  • Loading branch information
entelecheia committed Jul 27, 2023
1 parent e0525ca commit 3710f93
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 0 deletions.
41 changes: 41 additions & 0 deletions config/pipeline/datasets-sim.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Pipeline config: datasets-sim
# Step sequence as listed: pipe_load, pipe_to_pandas, pipe_head,
# pipe_find_similar, pipe_save_df (pipe_sample is commented out).
# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation had been stripped; confirm against the committed file.
defaults:
  - datasets

use_task_as_initial_object: true
steps:
  - uses: pipe_load
    with:
      dataset_path: datasets/processed/kakao_nouns
    # NOTE(review): placed at step level to match the steps below that
    # carry `verbose` without any `with` block; confirm.
    verbose: true
  # Disabled step, kept for reference.
  # - uses: pipe_sample
  #   with:
  #     num_samples: 1000
  #     randomize: true
  #   verbose: true
  - uses: pipe_to_pandas
    verbose: true
  - uses: pipe_head
    verbose: true
  - uses: pipe_find_similar
    with:
      num_workers: 100
      min_num_docs: 3
      percentile: 90
      distance_threshold: 0.3
      linkage: average
      grouping_freq: W
      grouping_name: Week
      date_col: createdDt
      token_col: nouns
      id_col: newsId
      ordering_col: createdDt_int
      duplicate_col: duplicate
      fig_col: fig_filename
      output_dir: outputs/figures/kakao_similar_7
      show_fig: false
      save_fig: true
      # NOTE(review): the listing has `verbose: true` twice in a row here.
      # The first is kept as an argument under `with`, the second as the
      # step-level flag, since one mapping cannot hold both; confirm.
      verbose: true
    verbose: true
  - uses: pipe_save_df
    with:
      data_file: datasets/processed/kakao_nouns_similar_7.parquet
3 changes: 3 additions & 0 deletions config/task/datasets-sim.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Task config: datasets-sim
# Builds on the `datasets` defaults entry and overrides the `pipeline`
# group at `ds_pipeline` with `datasets-sim`.
defaults:
  - datasets
  - override /pipeline@ds_pipeline: datasets-sim

0 comments on commit 3710f93

Please sign in to comment.