fix(pipeline): increase sample size and adjust worker count, remove specific columns from removal list
entelecheia · Aug 4, 2023 · 28529da · 28529da
1 parent f242a63
commit 28529da
Showing 1 changed file with 7 additions and 7 deletions.
diff --git a/config/pipeline/datasets-noun.yaml b/config/pipeline/datasets-noun.yaml
@@ -13,26 +13,26 @@ steps:
   #   with:
   #     dataset_path: datasets/processed/kakao_filtered
   #     verbose: true
-  # - uses: pipe_sample
-  #   with:
-  #     sample_size: 15
-  #     randomize: true
-  #     verbose: true
+  - uses: pipe_sample
+    with:
+      sample_size: 1000
+      randomize: true
+      verbose: true
   - uses: pipe_tokenize
     with:
       tokenizer_config_name: mecab
       # batch_size: 3
       num_workers: 30
       text_col: cleaned_text
       token_col: tokenizedText
-      remove_columns: [bodyText, cleaned_text, analyse_text]
+      remove_columns: [bodyText, analyse_text]
       load_from_cache_file: false
       verbose: true
   - uses: pipe_extract
     with:
       tokenizer_config_name: kakao
       # batch_size: 3
-      num_workers: 50
+      num_workers: 30
       token_col: tokenizedText
       extracted_col: nouns
       nouns_only: true