Skip to content

Commit

Permalink
fix(pipeline): increase sample size and adjust worker count, remove specific columns from removal list
Browse files Browse the repository at this point in the history
  • Loading branch information
entelecheia committed Aug 4, 2023
1 parent f242a63 commit 28529da
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions config/pipeline/datasets-noun.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,26 @@ steps:
  # with:
  # dataset_path: datasets/processed/kakao_filtered
  # verbose: true
- # - uses: pipe_sample
- # with:
- # sample_size: 15
- # randomize: true
- # verbose: true
+ - uses: pipe_sample
+ with:
+ sample_size: 1000
+ randomize: true
+ verbose: true
  - uses: pipe_tokenize
  with:
  tokenizer_config_name: mecab
  # batch_size: 3
  num_workers: 30
  text_col: cleaned_text
  token_col: tokenizedText
- remove_columns: [bodyText, cleaned_text, analyse_text]
+ remove_columns: [bodyText, analyse_text]
  load_from_cache_file: false
  verbose: true
  - uses: pipe_extract
  with:
  tokenizer_config_name: kakao
  # batch_size: 3
- num_workers: 50
+ num_workers: 30
  token_col: tokenizedText
  extracted_col: nouns
  nouns_only: true
Expand Down

0 comments on commit 28529da

Please sign in to comment.