In [1]:
from docx.api import Document

In [2]:
# https://docs.google.com/document/d/1yutoDtwenmmidc6H0ySu4BsZfpocS24LN6SRRYVZf-M/edit

In [12]:
import pandas as pd
# Load the source CSV. Later cells read its `text` and `preprocessed` columns
# and address rows by position (`raws.iloc[i-1]`).
raws = pd.read_csv("annotated_misp/unannotated_balanced.csv")

In [13]:
import json
from tqdm import tqdm

def dump_jsonl(output_path, data, append=False, progress=False):
    """
    Write an iterable of objects to a JSON lines file (one JSON document per line).

    Args:
        output_path: Path of the file to write.
        data: Iterable of JSON-serialisable objects. May be a list or a
            one-shot iterator/generator; it is consumed exactly once.
        append: If True, append to the file instead of overwriting it.
        progress: If True, wrap ``data`` in a ``tqdm`` progress bar.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is always opened as UTF-8.
    """
    mode = 'a' if append else 'w'
    records = tqdm(data) if progress else data

    # Count while writing instead of calling len() afterwards, so unsized
    # iterables (generators, map objects, ...) are supported as well.
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1

    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path, verbose=True, progress=False) -> list:
    """
    Read a list of objects from a JSON lines file.

    Args:
        input_path: Path of the UTF-8 encoded file to read.
        verbose: If True, print how many records were loaded.
        progress: If True, wrap the line iterator in a ``tqdm`` progress bar.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = tqdm(f) if progress else f

        for line in lines:
            # Strip surrounding whitespace only. The former strip set
            # '\n|\r' was a character set that also removed literal '|'.
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data.append(json.loads(line))

    if verbose:
        print('Loaded {} records from {}'.format(len(data), input_path))

    return data

## Read Annotated Texts

In [14]:
# Open the annotated Word document. The parsing cell further down reads the
# annotations from run formatting (bold / italic / underline).
document = Document('annotated_misp/annotated_balanced_p0.docx')

In [15]:
# Only the first table of the document is used. The parsing loop skips its
# row 0 (presumably a header row — confirm against the .docx).
table = document.tables[0]

In [16]:
import pythainlp
# pythainlp.tokenize.word_tokenize("แมวกินปลา")

In [17]:
import emoji
# emoji.replace_emoji('Python is 👍', replace='')

In [48]:
from marisa_trie import Trie
from pythainlp.corpus import get_corpus, thai_syllables, thai_words

# Placeholder tokens that the tokenizer must keep intact (passed as
# `custom_dict` to `pythainlp.tokenize.word_tokenize` in later cells).
# NOTE(review): this trie contains ONLY the placeholders. If pythainlp treats
# `custom_dict` as the complete tokenizer dictionary, ordinary Thai words are
# no longer dictionary-matched. `thai_words` is imported above but unused —
# confirm whether it was meant to be merged into this trie.
custom_dict = Trie(["<number>", "<url>", "<user>", "<money>", "<time>", "<date>", "<phone>"])

In [66]:
from tqdm import tqdm
data = []

keys = None
rows = []
# Positions (0-based, into `raws`) of the rows that have an annotated
# counterpart in the Word table.
marked_raws = {}

# Run formatting attribute -> annotation tag emitted for that run.
# A run carrying several formats yields one tag entry per format, in this order.
style_tags = (("bold", "int"), ("italic", "msp"), ("underline", "tran"))

for i, row in tqdm(enumerate(table.rows), total=len(table.rows)):
    if i == 0:
        # Row 0 is skipped (presumably the table header — confirm).
        continue

    s = ""       # text of this row, rebuilt by concatenating every run
    tags = []    # annotation entries with character offsets into `s`
    spans = []   # token list per run
    words = []   # flat token list for the whole row
    for cell in row.cells:
        for para in cell.paragraphs:
            for run in para.runs:
                span = emoji.replace_emoji(run.text, replace='')
                ws = pythainlp.tokenize.word_tokenize(span.strip(), custom_dict=custom_dict)

                # Stand-alone "<" and ">" tokens mean a placeholder such as
                # "<number>" was split apart by the tokenizer (or by a run
                # boundary); abort instead of emitting corrupt tokens.
                if "<" in ws and ">" in ws:
                    raise AssertionError(
                        "placeholder token was split in table row {}: {!r}".format(i, ws)
                    )

                # Offsets are measured on the unstripped run text because
                # that is exactly what gets appended to `s` below.
                start = len(s)
                end = start + len(span)
                for attr, tag in style_tags:
                    if getattr(run, attr):
                        tags.append({
                            "span": ws,
                            "tag": tag,
                            "s": start,
                            "t": end,
                        })

                words += ws
                spans.append(ws)
                s += span

    # Table row i (after the skipped row 0) is paired with CSV row i-1 by position.
    raw_texts = raws.iloc[i-1].to_dict()
    marked_raws[i-1] = True

    rows.append({
        **raw_texts,
        "original": raw_texts["text"],
        "text": s,
        "tags": tags,
        "spans": spans,
        "words": words,
        # Drop whitespace-only tokens. The previous `>= 0` test was always
        # true, so empty strings leaked into this field.
        "no_ws_words": [w.strip() for w in words if len(w.strip()) > 0],
    })
    

100%|█████████████████████████████████████████████████████| 1001/1001 [00:26<00:00, 37.79it/s]


In [None]:
# Manually validate with edit distance
# import editdistance

# for i, r in enumerate(rows):
# #     print(i, r.keys())
#     if (editdistance.eval(r["text"], r["preprocessed"])) > 10:
#         print(r["text"])
#         print(r["preprocessed"])
#         print("===================")


In [71]:
# The rows parsed from the annotated Word table form the test split.
# Note: this is an alias of `rows`, not a copy.
test_data = rows

In [79]:
train_data = []
# `marked_raws` is keyed by row POSITION (it was filled via `raws.iloc[i-1]`),
# so enumerate positions here instead of relying on the index labels that
# `iterrows()` yields; the two only coincide for a default RangeIndex.
for i, (_, row) in tqdm(enumerate(raws.iterrows()), total=len(raws)):
    if i in marked_raws:
        # Already part of the annotated split; keep it out of the training split.
        continue

    raw_texts = row.to_dict()

    # Unannotated rows start from the `preprocessed` column and get the same
    # emoji removal and tokenization as the annotated rows.
    text = raw_texts["preprocessed"]
    span = emoji.replace_emoji(text, replace='')
    ws = pythainlp.tokenize.word_tokenize(span.strip(), custom_dict=custom_dict)

    train_data.append({
        **raw_texts,
        "original": raw_texts["text"],
        "text": span,
        "tags": [],       # no annotations for these rows
        "spans": [ws],    # the whole text is a single span
        "words": ws,
        # Drop whitespace-only tokens. The previous `>= 0` test was always
        # true, so empty strings leaked into this field.
        "no_ws_words": [w.strip() for w in ws if len(w.strip()) > 0],
    })

10000it [00:03, 3058.81it/s]


In [80]:
# Sanity check: sizes of the annotated (test) and unannotated (train) splits.
len(test_data), len(train_data)

(1000, 9000)

In [81]:
# Persist the training split as JSON lines (overwrites any existing file).
dump_jsonl("annotated_misp/train_data.jsonl", train_data)

Wrote 9000 records to annotated_misp/train_data.jsonl


In [82]:
# Persist the test split as JSON lines (overwrites any existing file).
dump_jsonl("annotated_misp/test_data.jsonl", test_data)

Wrote 1000 records to annotated_misp/test_data.jsonl
