Skip to content

Commit

Permalink
feat(filter): add verbose print statements.
Browse files: browse the repository at this point in the history
  • Loading branch information
entelecheia committed Aug 3, 2023
1 parent ab84fcb commit 08cf03e
Showing 1 changed file with 8 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/corprep/datasets/filter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -35,6 +35,8 @@ def filter_dataset(
data_file=f"{output_dir}/{sample_filename}",
verbose=verbose,
)
if verbose:
print(sample.head())

# Create a train set
train = data_[~data_.index.isin(sample.index)]
Expand All @@ -43,6 +45,8 @@ def filter_dataset(
data_file=f"{output_dir}/{train_filename}",
verbose=verbose,
)
if verbose:
print(train.head())

# Create a discard set
discard = data[~data.index.isin(train.index) & ~data.index.isin(sample.index)]
Expand All @@ -51,6 +55,8 @@ def filter_dataset(
data_file=f"{output_dir}/{discard_filename}",
verbose=verbose,
)
if verbose:
print(discard.head())
logger.info(
"Created %d samples, %d train samples, and %d discard samples",
sample.shape[0],
Expand Down Expand Up @@ -94,8 +100,10 @@ def filter_by_queries(
for qry in queries:
if verbose:
logger.info("filtering data by %s", qry)
print(f"fitering data by {qry}")
n_docs = data.shape[0]
data = data.query(qry, engine="python")
if verbose:
logger.info("filtered %d documents", n_docs - data.shape[0])
print(f"filtered {n_docs - data.shape[0]} documents")
return data

0 comments on commit 08cf03e

Please sign in to comment.