Skip to content

Commit

Permalink
feat(tokenize): add extract_tokens function to handle part-of-speech tagging
Browse files Browse the repository at this point in the history
  • Loading branch information
entelecheia committed Jul 25, 2023
1 parent 5ddf017 commit c6482fb
Showing 1 changed file with 46 additions and 0 deletions.
46 changes: 46 additions & 0 deletions src/corprep/datasets/tokenize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List, Optional

from datasets import Dataset

from corprep import HyFI
Expand Down Expand Up @@ -36,3 +38,47 @@ def pos_tagging(batch):
if verbose:
print(data[0][token_col])
return data


def extract_tokens(
    data: Dataset,
    tokenizer_config_name: str = "simple",
    num_proc: int = 1,
    batched: bool = True,
    token_col: str = "tokenizedText",
    nouns_only: bool = False,
    postags: Optional[List[str]] = None,
    stop_postags: Optional[List[str]] = None,
    strip_pos: Optional[bool] = None,
    postag_delim: Optional[str] = None,
    postag_length: Optional[int] = None,
    load_from_cache_file: bool = True,
    verbose: bool = False,
) -> Dataset:
    """Extract tokens from the tokenized/POS-tagged column of a dataset.

    Instantiates a tokenizer from the named HyFI config and maps its
    ``extract`` method over ``token_col``, replacing that column with the
    filtered (and optionally POS-stripped) tokens.

    Args:
        data: Dataset with a ``token_col`` column of token sequences.
        tokenizer_config_name: Name of the HyFI tokenizer config to use.
        num_proc: Number of processes for ``Dataset.map``.
        batched: Whether ``Dataset.map`` processes examples in batches.
        token_col: Column holding the tokenized text to process.
        nouns_only: Keep only noun tokens (forwarded to ``tokenizer.extract``).
        postags: POS tags to keep; ``None`` uses the tokenizer's default.
        stop_postags: POS tags to drop; ``None`` uses the tokenizer's default.
        strip_pos: Whether to strip the POS-tag suffix from each token.
        postag_delim: Delimiter between a token and its POS tag.
        postag_length: Maximum length of the POS tag to keep.
        verbose: Print the first processed row when done.
        load_from_cache_file: Reuse the datasets map cache if available.

    Returns:
        The dataset with ``token_col`` replaced by the extracted tokens.
    """

    # Renamed from the copy-pasted `pos_tagging`: this mapper extracts
    # tokens, it does not tag them.
    def _extract(batch):
        # Instantiate inside the mapper so each `num_proc` worker builds
        # its own tokenizer instead of pickling one across processes.
        tokenizer = HyFI.instantiate_config(f"tokenizer={tokenizer_config_name}")
        return {
            token_col: [
                tokenizer.extract(
                    tokens,
                    nouns_only=nouns_only,
                    postags=postags,
                    stop_postags=stop_postags,
                    strip_pos=strip_pos,
                    postag_delim=postag_delim,
                    postag_length=postag_length,
                )
                for tokens in batch[token_col]
            ]
        }

    data = data.map(
        _extract,
        num_proc=num_proc,
        batched=batched,
        load_from_cache_file=load_from_cache_file,
    )
    # Was "POS tagging done." — copy-pasted from pos_tagging(); this step
    # performs token extraction.
    logger.info("Token extraction done.")
    if verbose:
        print(data[0][token_col])
    return data

0 comments on commit c6482fb

Please sign in to comment.