Skip to content

Commit

Permalink
fix(datasets): add path and file_pattern parameters to load_raw_dataset
Browse files Browse the repository at this point in the history
chore(datasets): update logger info with more verbose options
fix(datasets): use user specified path and file_pattern in raw data file retrieval
refactor(datasets): enhance verbose logging in load_raw_dataset function
  • Loading branch information
entelecheia committed Jul 18, 2023
1 parent ca9ac2b commit 2bf977b
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions src/corprep/datasets/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,27 @@

def load_raw_dataset(
    raw_dataset_dir: Union[str, Path],
    path: str = "json",
    file_pattern: str = "*.dat",
    verbose: bool = False,
    **kwargs,
):
    """Load the raw data files found under a directory and return the train split.

    Args:
        raw_dataset_dir: Directory that holds the raw data files.
        path: First positional argument handed to ``HyFI.load_dataset``
            (defaults to ``"json"``).
        file_pattern: Pattern appended to ``raw_dataset_dir`` and handed to
            ``HyFI.get_filepaths`` to locate the raw files (presumably a
            glob pattern; defaults to ``"*.dat"``).
        verbose: When true, print two sample records and log the dataset
            features in addition to the sample count.
        **kwargs: Accepted for call-site compatibility; not used by this
            function. NOTE(review): confirm whether these were meant to be
            forwarded to ``HyFI.load_dataset``.

    Returns:
        The ``"train"`` entry of the object returned by ``HyFI.load_dataset``.

    Raises:
        FileNotFoundError: If ``raw_dataset_dir`` does not exist.
    """
    raw_dataset_dir = Path(raw_dataset_dir)
    if not raw_dataset_dir.exists():
        logger.warning("No raw data files found.")
        raise FileNotFoundError(
            f"Raw dataset directory does not exist: {raw_dataset_dir}"
        )

    raw_data_files = HyFI.get_filepaths(f"{raw_dataset_dir}/{file_pattern}")
    logger.info("Found %d raw data files.", len(raw_data_files))
    if not raw_data_files:
        # Keep going (as before), but make the likely cause of a downstream
        # loading failure visible in the logs.
        logger.warning(
            "No files matching %r in %s.", file_pattern, raw_dataset_dir
        )

    dataset = HyFI.load_dataset(path, data_files=raw_data_files)
    ds_train: Dataset = dataset["train"]  # type: ignore

    num_samples = len(ds_train)
    logger.info("Number of training samples: %s", num_samples)
    if verbose:
        if num_samples > 0:
            # Preview the 100th record from the front and the 99th from the
            # back, clamped so small datasets do not raise IndexError.
            print(ds_train[min(99, num_samples - 1)])
            print(ds_train[max(-99, -num_samples)])
        logger.info("Dataset features: %s", ds_train.features)

    return ds_train

0 comments on commit 2bf977b

Please sign in to comment.