In [2]:
import json
import pandas as pd


# Load Data

In [3]:
def load_jsonl(fname):
    """Read a JSON Lines file and return its records as a list.

    Each non-empty line of the file is decoded with ``json.loads``.
    Lines that are empty after stripping whitespace (for example a
    trailing blank line) are skipped instead of raising a decode error.

    Args:
        fname: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one decoded object per non-empty line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    # The context manager guarantees the handle is closed even when a line
    # fails to decode; the previous version leaked the open file.
    with open(fname, encoding="utf-8") as fin:
        stripped_lines = (line.strip() for line in fin)
        return [json.loads(line) for line in stripped_lines if line]

def save_jsonl(data, filename):
    """Write ``data`` to ``filename`` in JSON Lines format.

    Every item of ``data`` is serialised with ``json.dumps`` (non-ASCII
    characters are kept as-is) and written on its own line, encoded as
    UTF-8. An existing file at ``filename`` is overwritten.

    Args:
        data: Iterable of JSON-serialisable records.
        filename: Destination path.
    """
    with open(filename, "w", encoding="utf-8") as fo:
        for record in data:
            fo.write(json.dumps(record, ensure_ascii=False) + "\n")

In [4]:
# Load every WisesightSentiment split into a dict keyed by split name and
# report how many records each split contains.
split = ["train", "valid", "test"]
wisesight = {}
for s in split:
    d = wisesight[s] = load_jsonl(f"Datasets/WisesightSentiment/{s}.jsonl")
    print(f"Loaded {s}: {len(d)} sents")

Loaded train: 21628 sents
Loaded valid: 2404 sents
Loaded test: 2671 sents


In [5]:
# Test split with misspelling annotations.
testmisp = load_jsonl("Datasets/WisesightSentiment/test-misp.jsonl")

In [6]:
# Few-shot training subset with misspelling annotations.
trainmisp = load_jsonl("Datasets/WisesightSentiment/few-shot/train-misp-3000.jsonl")

# Preprocess & Tokenize

In [8]:
from pythainlp.tokenize import word_tokenize
# Sanity check: tokenize a short Thai phrase with the "deepcut" engine and
# display the resulting token list as the cell output.
word_tokenize("ฉันรักแมว", engine="deepcut")

['ฉัน', 'รัก', 'แมว']

In [9]:
from tqdm import tqdm
from functools import partial
from thai2transformers import preprocess
from typing import Collection, Callable
import demoji

def word_tokenize_deepcut(s):
    """Tokenize ``s`` with ``word_tokenize`` using the "deepcut" engine.

    Args:
        s: Text to split into tokens.

    Returns:
        Whatever ``word_tokenize`` returns for ``s`` (a list of token strings).
    """
    tokens = word_tokenize(s, engine="deepcut")
    return tokens

def _process_transformers(
    text: str,
    pre_rules: Collection[Callable] = (
        preprocess.fix_html,
        preprocess.rm_brackets,
        preprocess.replace_newlines,
        preprocess.rm_useless_spaces,
        preprocess.replace_spaces,
        preprocess.replace_rep_after,
    ),
    tok_func: Callable = word_tokenize_deepcut,
    post_rules: Collection[Callable] = (preprocess.ungroup_emoji, preprocess.replace_wrep_post),
    lowercase: bool = False
) -> Collection[str]:
    """Clean ``text``, tokenize it, and post-process the tokens.

    The pipeline runs in a fixed order: optional lowercasing, then every
    callable in ``pre_rules`` applied to the string, then ``tok_func``,
    then every callable in ``post_rules`` applied to the token collection.

    Args:
        text: Raw input string.
        pre_rules: String-to-string callables applied, in order, before
            tokenization. The default is an immutable tuple so it cannot be
            mutated accidentally between calls.
        tok_func: Callable that turns the cleaned string into tokens.
        post_rules: Callables applied, in order, to the tokenizer output;
            each receives and returns the token collection.
        lowercase: When true, ``text`` is lowercased before any rule runs.

    Returns:
        The tokens produced by ``tok_func`` after all ``post_rules`` have
        been applied (not a string). With the default tokenizer this is
        presumably a list of strings; the exact type depends on the rules
        supplied.
    """
    if lowercase:
        text = text.lower()
    for rule in pre_rules:
        text = rule(text)
    toks = tok_func(text)
    for rule in post_rules:
        toks = rule(toks)
    return toks

def replace_emoji(s):
    """Replace emoji in ``s`` using ``demoji.replace_with_desc``.

    The empty string is passed as the second argument (the separator in
    demoji's documented signature), so descriptions are inserted without
    surrounding delimiter characters.

    Args:
        s: Input text.

    Returns:
        The string returned by ``demoji.replace_with_desc``.
    """
    described = demoji.replace_with_desc(s, "")
    return described

space_token = " "

# Tokenization pipeline used by the cells below: `_process_transformers` with
# a custom pre-rule list. Compared with that function's defaults, emoji are
# replaced first, and `replace_spaces` / `replace_rep_after` are not applied.
preprocessor = partial(
    _process_transformers,
    pre_rules=[
        replace_emoji,
        preprocess.fix_html,
        preprocess.rm_brackets,
        preprocess.replace_newlines,
        preprocess.rm_useless_spaces,
        # preprocess.replace_rep_after  (left disabled)
    ],
    lowercase=False,
)

In [11]:
# Tokenize every loaded split in place (adds a "tokenized" field to each
# record) and write the result next to the source files.
for s in split:
    sents = wisesight[s]
    print(f"Tokenizing {s}:")
    for sent in tqdm(sents, total=len(sents)):
        sent.update(tokenized=preprocessor(sent["text"]))
    save_jsonl(sents, f"Datasets/WisesightSentiment/tokenized_{s}.jsonl")

### Tokenize with misspellings

In [12]:
import itertools

def tokenize_misp_sents(sents):
    """Tokenize sentences that carry misspelling annotations, in place.

    For every record two fields are written:

    * ``"tokenized"`` - the whole ``"text"`` run through ``preprocessor``
      without looking at the annotations ("blind" tokenization).
    * ``"segments"`` - a list of ``(original_tokens, corrected_tokens)``
      pairs. Text between annotated spans is tokenized with
      ``preprocessor`` and appears identically on both sides; each accepted
      misspelled span contributes ``([span_text], [correction])``. When no
      span is accepted the list holds a single pair covering the full text.

    Annotated spans are visited in order of their start offset; a span that
    starts before the end of an already accepted span is treated as
    overlapping and ignored.

    Args:
        sents: Iterable of dict records with a ``"text"`` string and a
            ``"misp_tokens"`` list whose items provide ``"s"`` (start
            offset), ``"t"`` (end offset) and ``"corr"`` (correction).

    Returns:
        The same ``sents`` object, with every record updated.

    Raises:
        KeyError: If a record or annotation lacks one of the keys above.
    """
    for sent in tqdm(sents):
        text = sent["text"]
        misp_tokens = sorted(sent["misp_tokens"], key=lambda x: x["s"])

        segwords = []
        accepted = []
        cursor = 0   # offset in `text` up to which segments were emitted
        tail = ""    # text following the most recently accepted span
        for m in misp_tokens:
            # Skip spans that begin inside a span we already accepted.
            if any(m["s"] < p["t"] for p in accepted):
                continue

            before = text[cursor:m["s"]]
            word = text[m["s"]:m["t"]]
            tail = text[m["t"]:]
            cursor += len(before) + len(word)

            before_toks = preprocessor(before)
            segwords.append((before_toks, before_toks))
            segwords.append(([word], [m["corr"]]))
            accepted.append(m)

        # Blind tokenization of the whole sentence.
        tokens = preprocessor(text)

        if accepted:
            tail_toks = preprocessor(tail)
            segwords.append((tail_toks, tail_toks))
        else:
            # No usable annotation: the single segment is the full sentence.
            # Reuse the tokens computed above instead of running the (slow)
            # tokenizer a second time; copy so that "segments" does not
            # share a list object with "tokenized".
            whole = list(tokens)
            segwords = [(whole, whole)]

        sent["tokenized"] = tokens
        sent["segments"] = segwords
    return sents

In [13]:
# Tokenize the misspelling-annotated test split and save it.
sents = tokenize_misp_sents(testmisp)
save_jsonl(sents, "Datasets/WisesightSentiment/tokenized_test-misp.jsonl")

100%|██████████| 2671/2671 [03:51<00:00, 11.55it/s]


In [14]:
# Tokenize the misspelling-annotated few-shot training subset and save it.
sents = tokenize_misp_sents(trainmisp)
save_jsonl(sents, "Datasets/WisesightSentiment/tokenized_train-misp-3000.jsonl")

100%|██████████| 3000/3000 [04:14<00:00, 11.78it/s]
