Commit

feat(corprep/datasets/preprocessing): add tokenize_dataset function
entelecheia committed Jul 17, 2023
1 parent beffddc commit 2db1824
Showing 2 changed files with 30 additions and 0 deletions.
Empty file.
30 changes: 30 additions & 0 deletions src/corprep/datasets/preprocessing/tokenize.py
@@ -0,0 +1,30 @@
from datasets import Dataset
from ekonlpy.tag import Mecab # type: ignore
from corprep import HyFI # type: ignore

logger = HyFI.getLogger(__name__)


def tokenize_dataset(
    data: Dataset,
    num_proc: int = 1,
    batched: bool = True,
    text_col: str = "bodyText",
    verbose: bool = False,
) -> Dataset:
    def pos_tagging(batch):
        mecab = Mecab()
        batch_tags = []
        for text in batch[text_col]:
            sentences = text.split("\n")
            pos_tags = []
            for sentence in sentences:
                pos_tags.extend(mecab.pos(sentence))
            batch_tags.append(pos_tags)
        return {"pos_tags": batch_tags}

    data = data.map(pos_tagging, num_proc=num_proc, batched=batched)
    logger.info("POS tagging done.")
    if verbose:
        print(data[0]["pos_tags"])
    return data
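
For context, a minimal usage sketch (not part of the commit), assuming the module is importable as corprep.datasets.preprocessing.tokenize from the src layout above and that eKoNLPy with its Mecab tagger is installed:

from datasets import Dataset

from corprep.datasets.preprocessing.tokenize import tokenize_dataset

# Hypothetical toy dataset using the default "bodyText" text column.
raw = Dataset.from_dict(
    {"bodyText": ["한국은행이 기준금리를 동결했다.\n시장은 안도했다."]}
)

# Adds a "pos_tags" column; each row holds the (token, POS tag) pairs
# produced by eKoNLPy's Mecab over the newline-split sentences.
tokenized = tokenize_dataset(raw, num_proc=1, verbose=True)

Constructing Mecab inside pos_tagging, rather than at module level, means each worker process spawned by datasets.map when num_proc > 1 builds its own tagger instance instead of having one pickled and shared across processes.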
