Skip to content

Commit

Permalink
feat(datasets): add function to save raw datasets
Browse files (browse the repository at this point in the history)
  • Loading branch information
entelecheia committed Jul 15, 2023
1 parent b6b60fa commit 8a5319c
Showing 1 changed file with 33 additions and 0 deletions.
33 changes: 33 additions & 0 deletions src/corprep/datasets/raw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from pathlib import Path
from typing import Union

from corprep import HyFI # type: ignore

# Module-level logger for this file, obtained through HyFI's getLogger helper.
logger = HyFI.getLogger(__name__)


def save_raw_dataset(
    raw_dataset_dir: Union[str, Path],
    dataset_path: Union[str, Path],
    verbose: bool = False,
    **kwargs,
):
    """Load raw ``*.dat`` files as a JSON dataset and save its train split.

    The files matching ``<raw_dataset_dir>/*.dat`` are collected with
    ``HyFI.get_filepaths``, loaded with ``HyFI.load_dataset("json", ...)``,
    and the ``"train"`` split is written to ``dataset_path`` via
    ``save_to_disk``.

    Args:
        raw_dataset_dir: Directory that holds the raw ``*.dat`` files.
        dataset_path: Destination handed to ``save_to_disk``.
        verbose: Currently unused; kept for interface compatibility.
        **kwargs: Currently unused; kept for interface compatibility.

    Returns:
        The ``"train"`` split of the loaded dataset.

    Raises:
        FileNotFoundError: If ``raw_dataset_dir`` does not exist, or if it
            exists but no ``*.dat`` file matches.
    """
    raw_dataset_dir = Path(raw_dataset_dir)

    # Guard clause: fail early, and say which path was missing.
    if not raw_dataset_dir.exists():
        logger.warning("No raw data files found.")
        raise FileNotFoundError(
            f"Raw dataset directory does not exist: {raw_dataset_dir}"
        )

    raw_data_files = HyFI.get_filepaths(f"{raw_dataset_dir}/*.dat")
    num_files = len(raw_data_files)
    logger.info("Found %d raw data files.", num_files)

    # An empty file list would otherwise surface as an opaque loader error.
    if num_files == 0:
        logger.warning("No raw data files found.")
        raise FileNotFoundError(f"No *.dat files found in {raw_dataset_dir}")

    dataset = HyFI.load_dataset("json", data_files=raw_data_files)
    ds_train = dataset["train"]  # type: ignore

    logger.info("Number of training samples: %d", len(ds_train))
    logger.info("Dataset features: %s", ds_train.features)

    # Save the processed dataset to disk
    ds_train.save_to_disk(dataset_path)
    logger.info("Saved the processed dataset to %s", dataset_path)
    return ds_train

0 comments on commit 8a5319c

Please sign in to comment.