In [1]:
from altr.nlp.tokenise import pipe, exclude_by_regex, exclude_words

# import types
from altr.nlp._types import Text, Token

In [2]:
from altr.monad.extended_pymonad import Right, Left
from pythainlp.tokenize import word_tokenize
import pythainlp

In [3]:
def tokenize_text(text: Text):
    """Tokenise `text` and wrap the outcome in an Either value.

    The text is split with pythainlp's `word_tokenize`, using the 'newmm'
    engine with `keep_whitespace=False`.

    Returns:
        `Right(tokens)` when tokenisation succeeds, otherwise
        `Left(message)` where the message embeds the raised exception.
    """
    try:
        return Right(word_tokenize(text, engine='newmm', keep_whitespace=False))
    except Exception as err:
        # Any failure is turned into a Left instead of propagating.
        return Left(f"Error tokenizing text: {err}")


def exclude_stopwords(tokens: list[Token]):
    """Run `exclude_words` on `tokens` with pythainlp's Thai stopword set.

    Returns whatever `exclude_words` returns for that stopword collection.
    """
    stopwords = pythainlp.corpus.thai_stopwords()
    return exclude_words(stopwords, tokens)

In [4]:
# Sample input texts fed to the tokenisation pipeline below.
texts = [
    "โอเคบ่พวกเรารักภาษาบ้านเกิด",
    "วรรณกรรม ภาพวาด และการแสดงงิ้ว",
]

In [5]:
# step into the monad world
def _lift_text(text):
    """Wrap a plain `text` in `Right` and return the wrapped value."""
    lifted = Right(text)
    return lifted


# step out of the monad world
def unwrap_result(result):
    """Collapse `result` into a plain value via its `either` method.

    `result.either` receives two handlers: the first discards its argument
    and yields an empty list, the second hands its argument back unchanged.
    NOTE(review): presumably the first handler serves the Left (error) case
    and the second the Right (success) case — confirm against
    `altr.monad.extended_pymonad`.
    """
    def _discard(_error):
        return []

    def _keep(value):
        return value

    return result.either(_discard, _keep)


# def process_text(text: Text) -> Text | list[Token] | None:
#     text_processing_pipeline = pipe(
#         tokenize_text,
#         exclude_stopwords,
#         (lambda tokens: exclude_words(['งิ้ว'], tokens)),
#         (lambda tokens: exclude_by_regex(r'^ภ', tokens)),
#         (lambda tokens: exclude_words(['บ่'], tokens)),
#     )

#     return unwrap_result(text_processing_pipeline(_lift_text(text)))

In [6]:
# list(
#     map(
#         process_text,
#         texts,
#     )
# )

In [7]:
# Compose the processing steps with `pipe`: tokenisation, stopword removal,
# then the extra exclusion rules.
# NOTE(review): steps are presumably applied in the order listed — confirm
# against `altr.nlp.tokenise.pipe`.
text_processing_pipeline = pipe(
    tokenize_text,
    exclude_stopwords,
    # suppose that they don't want to see this in the result
    (lambda tokens: exclude_words(['งิ้ว'], tokens)),
    # another exclusion condition
    (lambda tokens: exclude_by_regex(r'^ภ', tokens)),
    # again, with another exclusion condition, etc
    (lambda tokens: exclude_words(['บ่'], tokens)),
)

# Convert each normal text for the monad world. Materialised as a list rather
# than a one-shot `map` iterator, so `pipeline_input` is not left exhausted
# (and silently empty) for any later cell that reads it.
pipeline_input = [_lift_text(text) for text in texts]

# Last expression of the cell: one pipeline result per input text.
[text_processing_pipeline(lifted_text) for lifted_text in pipeline_input]
# TODO: make the pipeline work on a list of texts; they should be able to be processed as n-grams

[Right ['โอเค', 'พวกเรา', 'รัก', 'บ้านเกิด'], Right ['วรรณกรรม', 'การแสดง']]