feat(corprep): add save_dataset pipe
refactor(corprep): rename save_raw_dataset function to load_raw_dataset
refactor(corprep): split dataset saving into new pipe
chore(corprep): rename save_raw_dataset to load_raw_dataset in yaml configs
entelecheia committed Jul 17, 2023
1 parent ffe0c12 commit b7a3dff
Showing 4 changed files with 14 additions and 8 deletions.
```diff
@@ -1,5 +1,5 @@
 defaults:
   - __general_external_funcs__

-run: corprep.datasets.raw.save_raw_dataset
+run: corprep.datasets.raw.load_raw_dataset
 use_pipe_obj: false
```
6 changes: 6 additions & 0 deletions src/corprep/conf/pipe/save_dataset.yaml
```diff
@@ -0,0 +1,6 @@
+defaults:
+  - __general_instance_methods__
+
+run: save_to_disk
+run_with:
+  dataset_path:
```
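The new `save_dataset` pipe inherits from `__general_instance_methods__`, so `run: save_to_disk` names a method on the object flowing through the pipeline (a Hugging Face `Dataset`), and `run_with` supplies its keyword arguments. A minimal sketch of that dispatch mechanism, using a stand-in class instead of the real HyFI and `datasets` machinery (`FakeDataset` and `run_instance_method_pipe` are hypothetical names, not part of corprep):

```python
# Sketch of an instance-method pipe: `run` names a method on the incoming
# object, `run_with` supplies its keyword arguments.

class FakeDataset:
    """Stand-in for a Hugging Face datasets.Dataset."""

    def __init__(self, rows):
        self.rows = rows
        self.saved_to = None

    def save_to_disk(self, dataset_path):
        # The real Dataset.save_to_disk writes Arrow files to dataset_path;
        # here we only record where it would have been written.
        self.saved_to = dataset_path
        return self


def run_instance_method_pipe(obj, run, run_with=None):
    """Resolve `run` via getattr on `obj` and call it with `run_with` kwargs."""
    return getattr(obj, run)(**(run_with or {}))


ds = FakeDataset(rows=["doc1", "doc2"])
result = run_instance_method_pipe(
    ds,
    run="save_to_disk",
    run_with={"dataset_path": "datasets/processed/example"},
)
```

Because the method is looked up on the pipeline object itself, the pipe config stays generic: any method of the dataset can be exposed by changing `run` and `run_with`.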
8 changes: 6 additions & 2 deletions src/corprep/conf/pipeline/datasets.yaml
```diff
@@ -1,12 +1,16 @@
 defaults:
   - __init__
-  - /pipe@pipe1: save_raw_dataset
+  - /pipe@pipe1: load_raw_dataset
+  - /pipe@pipe2: save_dataset

 use_task_as_initial_object: true
 steps:
   - uses: pipe1
     with:
       raw_dataset_dir: ${__home_path__:}/workspace/datasets/raw/daum_news_20230707
-      dataset_path: datasets/processed/daum_news_20230707
       verbose: true
+  - uses: pipe2
+    with:
+      dataset_path: datasets/processed/daum_news_20230707
+      verbose: true
```
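In plain Python, the reworked two-step pipeline is roughly equivalent to the following sketch. The `load_raw_dataset` below is a stub that only mimics the function's new signature (no `dataset_path` parameter); the real implementation in `corprep.datasets.raw` loads a Hugging Face dataset, and the paths are illustrative:

```python
# Sketch of the pipeline after the split: pipe1 only loads, pipe2 only saves.

def load_raw_dataset(raw_dataset_dir, verbose=False, **kwargs):
    # Stub standing in for corprep.datasets.raw.load_raw_dataset:
    # the real function reads raw files and returns a datasets.Dataset.
    return {"source": raw_dataset_dir, "num_rows": 2}


# Step 1 (pipe1): load only -- saving is no longer this function's job.
ds = load_raw_dataset(
    raw_dataset_dir="workspace/datasets/raw/daum_news_20230707",
    verbose=True,
)

# Step 2 (pipe2): the save_dataset pipe calls save_to_disk on the result.
# With a real datasets.Dataset this would be:
#   ds.save_to_disk("datasets/processed/daum_news_20230707")
```

Separating the two steps lets the save location live in the `save_dataset` pipe config instead of being baked into the loader.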
6 changes: 1 addition & 5 deletions src/corprep/datasets/raw.py
```diff
@@ -8,9 +8,8 @@
 logger = HyFI.getLogger(__name__)


-def save_raw_dataset(
+def load_raw_dataset(
     raw_dataset_dir: Union[str, Path],
-    dataset_path: Union[str, Path],
     verbose: bool = False,
     **kwargs,
 ):
@@ -29,7 +28,4 @@ def save_raw_dataset(
     logger.info("Number of training samples: %d", len(ds_train))
     logger.info("Dataset features: %s", ds_train.features)

-    # Save the processed dataset to disk
-    ds_train.save_to_disk(dataset_path)
-    logger.info("Saved the processed dataset to %s", dataset_path)
     return ds_train
```