Skip to content

Commit

Permalink
feat(pipeline/config): enhance datasets.yaml
Browse files (browse the repository at this point in the history)
  • Loading branch information
entelecheia committed Jul 25, 2023
1 parent 002dfff commit 28a6108
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
1 change: 0 additions & 1 deletion config/pipeline/datasets-test.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
defaults:
- datasets
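- /pipe@pipe_sample: dataset_sample  # presumably the one line removed from this file by the commit (diff markers were lost in this rendering)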

use_task_as_initial_object: true
steps:
Expand Down
33 changes: 31 additions & 2 deletions config/pipeline/datasets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,46 @@ defaults:
- /pipe@pipe_load: dataset_load_from_disk
- /pipe@pipe_save: dataset_save_to_disk
- /pipe@pipe_tokenize: dataset_tokenize
- /pipe@pipe_sample: dataset_sample
- /pipe@pipe_extract: dataset_extract_tokens

use_task_as_initial_object: true
steps:
- uses: pipe_load_raw
with:
raw_dataset_dir: workspace/datasets/raw/kakao_news  # old value, removed by this commit
raw_dataset_dir: datasets/processed/kakao_filtered  # new value, added by this commit
path: parquet
file_pattern: "*.parquet"
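verbose: true
verbose: true  # NOTE: both `verbose` lines are unchanged context; indentation was lost in this rendering, so they presumably sit at different nesting levels (confirm against the real file)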
# - uses: pipe_load
# with:
# dataset_path: datasets/processed/kakao_filtered
# verbose: true
# - uses: pipe_sample
# with:
# num_samples: 15
# randomize: true
# verbose: true
- uses: pipe_tokenize
with:
tokenizer_config_name: mecab
# batch_size: 3
num_workers: 50
text_col: bodyText
token_col: tokenizedText
load_from_cache_file: false
verbose: true
- uses: pipe_extract
with:
tokenizer_config_name: kakao
# batch_size: 3
num_workers: 50
token_col: tokenizedText
nouns_only: true
load_from_cache_file: false
verbose: true
- uses: pipe_save
with:
dataset_path: datasets/processed/kakao  # old value, removed by this commit
dataset_path: datasets/processed/kakao_nouns  # new value, added by this commit
verbose: true

0 comments on commit 28a6108

Please sign in to comment.