Skip to content

Commit

Permalink
feat(pipeline): add new dataset filter and load steps
Browse files Browse the repository at this point in the history
  • Loading branch information
entelecheia committed Aug 3, 2023
1 parent 6bbd818 commit 441e063
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 0 deletions.
21 changes: 21 additions & 0 deletions config/pipeline/datasets-filter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Pipeline config with three steps: pipe_load_dfs, pipe_head, pipe_filter.
# Inherits from the `datasets` config through the defaults list below.
# NOTE(review): this nesting was reconstructed from a view that had lost its
# indentation; confirm which keys belong under each `with:` against the repo.
defaults:
  - datasets

use_task_as_initial_object: true
steps:
  - uses: pipe_load_dfs
    with:
      data_files: datasets/processed/kakao_nouns_similar_6.parquet
      split: train
    verbose: true
  - uses: pipe_head
    verbose: true
  - uses: pipe_filter
    with:
      # Presumably query expressions evaluated against the loaded dataframe;
      # verify the exact semantics against the pipe behind `pipe_filter`.
      queries:
        - "duplicate == False"
        - "cleaned_text.str.split().str.len() > 15"
      # NOTE(review): assumed to be arguments of the filter step -- confirm.
      sample_size: 100
      sample_seed: 123
      output_dir: datasets/filtered/kakao_deduped
    verbose: true
2 changes: 2 additions & 0 deletions config/pipeline/datasets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ defaults:
- /pipe@pipe_head: pandas_print_head
- /pipe@pipe_save_df: save_dataframes
- /pipe@pipe_find_similar: find_similar_docs_ac
- /pipe@pipe_filter: filter_dataset
- /pipe@pipe_load_dfs: load_dataframes

use_task_as_initial_object: true
steps:

0 comments on commit 441e063

Please sign in to comment.