In [1]:
from pythainlp.tokenize import word_tokenize
import pythainlp
import re
import requests

import gensim
from pymonad.tools import curry


from typing import TypeAlias

# Semantic aliases for `str`, so annotations can say what a string represents.
Text: TypeAlias = str
Word: TypeAlias = str
Token: TypeAlias = str

# Delimiter handed to gensim's Phrases model. The n-gram helpers look for it to
# recognise joined tokens and remove it from tokens afterwards.
NGRAM_DELIMITER = "<DELIM>"

Load dataset

- Source: the wisesight-sentiment repository, https://github.com/PyThaiNLP/wisesight-sentiment (the `pos.txt` file is downloaded below).


In [2]:
# Download `pos.txt` from the wisesight-sentiment repository.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(
    "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/refs/heads/master/pos.txt",
    timeout=30,
)
response.raise_for_status()

# One entry per line of the file, with surrounding whitespace stripped.
texts = [text.strip() for text in response.text.split("\n")]
texts[:10]

[':3', '☺️', '🤤', '🤪', '😁', '😄', '😊', '😋', '😍', '😘']

In [3]:
def compose(*functions):
    """Chain ``functions`` into one callable that applies them left to right.

    The returned callable hands its argument to the first function, feeds each
    result into the next one, and returns the last result. When no functions
    are given it returns its argument unchanged.
    """

    def apply_all(x):
        for step in functions:
            x = step(x)
        return x

    return apply_all


# Alias: the same left-to-right chaining under a second name.
pipe = compose

In [4]:
@curry(2)
def filter_by_length(length: int, texts: list[Text]) -> list[Text]:
    """Keep only the texts whose ``len`` is at least ``length``.

    Curried over its two arguments, so ``filter_by_length(n)`` gives a
    one-argument function that can be used as a pipeline step.
    """
    long_enough = []
    for text in texts:
        if len(text) >= length:
            long_enough.append(text)
    return long_enough


def tokenize(texts: list[Text]) -> list[list[Token]]:
    """Tokenise every text with pythainlp's ``word_tokenize``.

    ``keep_whitespace=False`` is passed for each text. The result holds one
    token list per input text, in input order.
    """
    tokenised = []
    for text in texts:
        tokenised.append(word_tokenize(text, keep_whitespace=False))
    return tokenised


def prepare_data_for_ngram(
    tokenised_texts: list[list[Token]],
) -> tuple[dict[int, gensim.models.Phrases | None], dict[int, list[list[Token]]], dict[int, list[list[Token]]]]:
    """Build the initial ``(models, ngrams, ngrams_filtered)`` state.

    Each element is a dict keyed by n-gram level. Level 1 has no model
    (``None``), and both token dicts start out holding ``tokenised_texts``
    itself under key 1.
    """
    models = {1: None}
    ngrams = {1: tokenised_texts}
    ngrams_filtered = {1: tokenised_texts}
    return (models, ngrams, ngrams_filtered)


def _train_ngram_model(model_kwargs: dict, tokenised_texts: list[list[Token]]) -> gensim.models.Phrases:
    """Train a gensim ``Phrases`` model on already tokenised texts.

    Args:
        model_kwargs: Extra keyword arguments forwarded unchanged to
            ``gensim.models.Phrases``.
        tokenised_texts: One token list per text.

    Returns:
        The trained model, configured with ``NGRAM_DELIMITER`` as its
        delimiter.
    """
    return gensim.models.Phrases(tokenised_texts, delimiter=NGRAM_DELIMITER, **model_kwargs)


def _get_ngram_tokens(model: gensim.models.Phrases, tokenised_texts: list[list[Token]]) -> list[list[Token]]:
    """Apply ``model`` to every token list and collect the results in order."""
    transformed = []
    for tokens in tokenised_texts:
        transformed.append(model[tokens])
    return transformed


def _fit_transform_ngram_models(
    model_kwargs: dict,
    tokenised_texts: list[list[Token]],
) -> tuple[gensim.models.Phrases, list[list[Token]]]:
    """Train a phrase model on ``tokenised_texts`` and apply it to the same texts.

    Args:
        model_kwargs: Keyword arguments passed on to ``_train_ngram_model``.
        tokenised_texts: One token list per text; used both for training and
            as the input that gets transformed.

    Returns:
        A ``(model, transformed_texts)`` pair, where ``transformed_texts`` is
        the output of ``_get_ngram_tokens`` for the trained model.
    """
    model = _train_ngram_model(model_kwargs, tokenised_texts)
    return model, _get_ngram_tokens(model, tokenised_texts)


def _filter_only_ngram_tokens(tokenised_texts: list[list[Token]]) -> list[list[Token]]:
    """Keep, for each text, only the tokens that contain ``NGRAM_DELIMITER``.

    A text without any such token contributes an empty list, so the output
    has one entry per input text.
    """
    kept_per_text = []
    for tokens in tokenised_texts:
        kept_per_text.append([token for token in tokens if NGRAM_DELIMITER in token])
    return kept_per_text


def _concat_ngram_tokens(tokenised_texts: list[list[Token]]) -> list[list[Token]]:
    """Remove every occurrence of ``NGRAM_DELIMITER`` from each token.

    The nesting and order of the input are preserved.
    """

    def strip_delimiter(token):
        return token.replace(NGRAM_DELIMITER, "")

    return [list(map(strip_delimiter, tokens)) for tokens in tokenised_texts]


def process_ngram(model_kwargs):
    """Build a pipeline step that adds one more n-gram level to the state.

    Args:
        model_kwargs: Keyword arguments forwarded to the phrase model that the
            step trains.

    Returns:
        A function that takes and returns a ``(models, ngram_tokens,
        ngram_tokens_filtered)`` tuple of dicts keyed by n-gram level (the
        layout produced by ``prepare_data_for_ngram``). Each call trains a
        model on the tokens stored under the highest existing level and stores
        the results under that level plus one. The input dicts are not
        modified; new dicts are returned, so the step can be re-run on the
        same input without accumulating state.

    NOTE(review): each step is trained on the previous level's output after
    the delimiter has been removed, so a token at level n may cover more than
    n original tokens -- confirm this is intended.
    """
    filter_ngram_pipeline = compose(_filter_only_ngram_tokens, _concat_ngram_tokens)

    def process(input_tuple):
        models, ngram_tokens, ngram_tokens_filtered = input_tuple

        # The highest level so far is the training input for the next one.
        max_available_ngram = max(models)
        next_ngram = max_available_ngram + 1

        model, ngram_result = _fit_transform_ngram_models(model_kwargs, ngram_tokens[max_available_ngram])

        # Filter first: once the delimiter is removed, joined tokens can no
        # longer be told apart from unjoined ones.
        ngram_result_filtered = filter_ngram_pipeline(ngram_result)
        ngram_result = _concat_ngram_tokens(ngram_result)

        # Return fresh dicts rather than mutating the caller's.
        return (
            {**models, next_ngram: model},
            {**ngram_tokens, next_ngram: ngram_result},
            {**ngram_tokens_filtered, next_ngram: ngram_result_filtered},
        )

    return process

In [5]:
# Texts shorter than this are dropped before tokenisation.
MIN_TEXT_LENGTH = 20
# Settings shared by every phrase model trained in the pipeline.
NGRAM_MODEL_KWARGS = {"min_count": 1, "threshold": 0.1}

generate_ngram_pipeline = compose(
    # do something with list of texts here
    filter_by_length(MIN_TEXT_LENGTH),
    tokenize,
    # do something with list of tokens here
    # ...
    # --- train ngram models below ---
    prepare_data_for_ngram,
    # each step adds the next level to the state: 2, then 3, then 4
    process_ngram(NGRAM_MODEL_KWARGS),
    process_ngram(NGRAM_MODEL_KWARGS),
    process_ngram(NGRAM_MODEL_KWARGS),
)

# NOTE: despite the name, this holds the pipeline's full result tuple
# (models, ngram_tokens, ngram_tokens_filtered), not just tokenised texts.
tokenised_texts = generate_ngram_pipeline(texts)