In [1]:
import requests

from pythainlp.tokenize import word_tokenize
from pymonad.tools import curry

import gensim


from altr.nlp import compose, exclude_by_regex, prepare_data_for_ngram, process_ngram

Load example texts: the positive-sentiment samples (`pos.txt`) from the PyThaiNLP wisesight-sentiment repository.


In [2]:
# Raw positive-sentiment samples, one text per line.
WISESIGHT_POS_URL = "https://raw.githubusercontent.com/PyThaiNLP/wisesight-sentiment/refs/heads/master/pos.txt"

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(WISESIGHT_POS_URL, timeout=30)
# Fail loudly on 4xx/5xx instead of treating an error page as corpus text.
response.raise_for_status()

texts = [text.strip() for text in response.text.split("\n")]
texts[:10]

[':3', '☺️', '🤤', '🤪', '😁', '😄', '😊', '😋', '😍', '😘']

Define the tokeniser function.


In [3]:
def tokenise(texts):
    """Tokenise each text with PyThaiNLP's ``word_tokenize``.

    Args:
        texts: Iterable of strings to tokenise.

    Returns:
        A list with one ``word_tokenize`` result per input text, in input
        order. The call uses the ``newmm`` engine and ``keep_whitespace=False``.
    """
    tokenised = []
    for text in texts:
        tokens = word_tokenize(text, engine="newmm", keep_whitespace=False)
        tokenised.append(tokens)
    return tokenised

Define helper functions for the text- and token-processing pipeline.


In [None]:
@curry(2)
def filter_by_length(length: int, texts: list[str]) -> list[str]:
    """Keep only the texts whose length is at least ``length``.

    Args:
        length: Minimum number of characters (as counted by ``len``) to keep a text.
        texts: Texts to filter.

    Returns:
        A new list with the surviving texts in their original order.
    """
    kept = []
    for text in texts:
        if len(text) < length:
            # too short - drop it
            continue
        kept.append(text)
    return kept


@curry(2)
def filter_list_tokens_by_regex(pattern: str, list_of_tokens: list[list[str]]) -> list[list[str]]:
    """Apply ``exclude_by_regex(pattern)`` to every token list.

    Args:
        pattern: Regular expression handed to ``exclude_by_regex``.
        list_of_tokens: One token list per text.

    Returns:
        A list holding the result of ``exclude_by_regex(pattern)(tokens)`` for
        each token list, in input order. Presumably each result is the token
        list without the tokens matching ``pattern`` - confirm against
        ``altr.nlp.exclude_by_regex``.
    """
    filtered = []
    for tokens in list_of_tokens:
        filtered.append(exclude_by_regex(pattern)(tokens))
    return filtered


# Marker string passed to the ngram model as its "delimiter" option and then used
# to recognise joined tokens and to strip the marker from them again.
NGRAM_DELIMITER = "<DELIMITER>"


@curry(2)
def train_ngram_model(kwargs: dict, tokenised_texts: list[list[str]]) -> gensim.models.Phrases:
    """Build a ``gensim.models.Phrases`` model from tokenised texts.

    Args:
        kwargs: Keyword arguments forwarded unchanged to ``gensim.models.Phrases``.
        tokenised_texts: One token list per text, used as the model's input sentences.

    Returns:
        The constructed ``Phrases`` model.
    """
    phrases_model = gensim.models.Phrases(tokenised_texts, **kwargs)
    return phrases_model


def apply_ngram_model(ngram_model: gensim.models.Phrases, tokenised_texts: list[list[str]]) -> list[list[str]]:
    """Run every token list through the ngram model.

    Args:
        ngram_model: Model that is indexed with a token list (``ngram_model[tokens]``).
        tokenised_texts: One token list per text.

    Returns:
        A list with ``ngram_model[tokens]`` for each token list, in input order.
    """
    transformed = []
    for tokens in tokenised_texts:
        transformed.append(ngram_model[tokens])
    return transformed


@curry(2)
def filter_only_ngram_tokens(delimiter, tokenised_texts: list[list[str]]) -> list[list[str]]:
    """Keep only the tokens that contain ``delimiter``.

    Args:
        delimiter: Substring that marks a token as a joined ngram.
        tokenised_texts: One token list per text.

    Returns:
        One list per input text (empty when nothing matched) with the tokens
        that contain ``delimiter``, in their original order.
    """
    ngram_only = []
    for tokens in tokenised_texts:
        marked = []
        for token in tokens:
            if delimiter in token:
                marked.append(token)
        ngram_only.append(marked)
    return ngram_only


@curry(2)
def concat_ngram_tokens(delimiter, tokenised_texts: list[list[str]]) -> list[list[str]]:
    """Remove every occurrence of ``delimiter`` from each token.

    Args:
        delimiter: Substring to strip out of the tokens.
        tokenised_texts: One token list per text.

    Returns:
        Token lists of the same shape as the input, where each token has had
        ``delimiter`` replaced by the empty string (the parts are glued together).
    """
    concatenated = []
    for tokens in tokenised_texts:
        glued = []
        for token in tokens:
            glued.append(token.replace(delimiter, ""))
        concatenated.append(glued)
    return concatenated

Create the pipeline.


In [None]:
def _build_ngram_stage():
    """Assemble one ngram stage from the four steps given to ``process_ngram``.

    The steps are: train a ``Phrases`` model, apply it, keep only the tokens
    that carry ``NGRAM_DELIMITER``, then strip the delimiter from them.

    Returns:
        Whatever ``process_ngram`` returns for these four steps. Every call
        builds a fresh options dict and fresh curried steps, so stages do not
        share objects.
    """
    phrases_options = {"min_count": 1, "threshold": 0.1, "delimiter": NGRAM_DELIMITER}
    return process_ngram(
        train_ngram_model(phrases_options),
        apply_ngram_model,
        filter_only_ngram_tokens(NGRAM_DELIMITER),
        concat_ngram_tokens(NGRAM_DELIMITER),
    )


# Both stages use identical settings, so they come from the same builder.
# NOTE(review): the trigram stage presumably runs on the bigram stage's output,
# which is how longer phrases would emerge - confirm against process_ngram.
process_bigram = _build_ngram_stage()

process_trigram = _build_ngram_stage()


# Pipeline steps, listed in the exact order they are handed to compose().
_NGRAM_PIPELINE_STEPS = [
    # keep texts of at least 20 characters
    filter_by_length(20),
    tokenise,
    # tokens starting with "5" (presumably the "555" laughter slang - confirm)
    filter_list_tokens_by_regex(r"^5"),
    # whitespace-only tokens
    filter_list_tokens_by_regex(r"^\s+$"),
    # --- ngram model training starts here ---
    prepare_data_for_ngram,
    process_bigram,
    process_trigram,
]

generate_ngram_pipeline = compose(*_NGRAM_PIPELINE_STEPS)

In [6]:
# Run the whole pipeline (filtering, tokenising, ngram stages) on the downloaded texts.
result = generate_ngram_pipeline(texts)

In [7]:
# The pipeline result unpacks into exactly three parts.
# NOTE(review): judging by the names these are the trained models, the model
# outputs, and the delimiter-filtered ngram tokens - confirm against altr.nlp.
models, ngrams, ngrams_filtered = result

In [8]:
# joined-ngram tokens for the first 100 entries of ngrams_filtered[2]
# NOTE(review): confirm which ngram stage index 2 refers to (bigram or trigram).
ngrams_filtered[2][:100]

[['กินน้ำซุป', 'นะอร่อย'],
 ['นะมึง'],
 [],
 ['คิดถึงแม่'],
 ['เคร', 'ถ้าไม่'],
 ['ใครว่า', 'จะเลี้ยง'],
 ['ช่วงนี้จะ', 'กรอบๆ'],
 ['ช่วยๆ', 'ด้วยนะ'],
 ['ชอบกิน', 'ช้างครับ', 'พี่น้ำ'],
 ['เมนูของ'],
 [],
 ['เซนท', '2มี', 'มั้ยค่ะ'],
 [],
 ['เเดก'],
 ['ส่งมา'],
 ['สุดท้ายแล้ว'],
 ['เดือนหน้า', 'ค่อยไป'],
 ['แล้วขอบคุณ'],
 ['สุดยอดเลย'],
 ['ต้องจัด', 'สักใบ'],
 ['แล้วนะ'],
 ['จ๋าาาา', 'หิวววว'],
 ['สุดๆ'],
 ['แถวนี้', 'Hotpot', 'หรออ'],
 [],
 ['จังเลย', 'ค่าาา'],
 ['น่าสนๆ', 'ต่อด้วย'],
 ['เท่าไรคับ'],
 ['พอค่า'],
 [],
 ['แป้งเจ้า', 'ดีงามค่ะ'],
 ['ไปๆๆๆ'],
 ['ไปๆๆๆๆๆ', 'เด็กๆ', 'ไปกัน'],
 ['ไปกิน', 'ม้าย'],
 ['ไปกินกัน', 'คงอยาก'],
 ['จ้าาา'],
 ['อึดอัดมาก'],
 ['ไปวันนี้', 'ได้มั้ย'],
 ['ผมกิน', 'เบียร์สิงห์'],
 [],
 ['นี้ชอบ', 'MG3'],
 ['พรุ่งนี้เลย'],
 ['พวกพี่', 'สุดยอดคับ'],
 ['พาไป', 'จ่ายตัง', 'ด้วยนะ'],
 ['มีโปร'],
 ['วันนี้ป่ะ'],
 ['เมนูฮิต', 'ที่ทุกคน', 'คนชอบ'],
 ['ยูเซ', 'อรี'],
 [],
 ['รีโว่'],
 ['ร่างกายต้องการ'],
 ['ร่างกายต้องการ'],
 ['ร่างกายต้องการ', 'บุฟเฟ่'],
 ['เรา