In [1]:
from collections import defaultdict
import pandas as pd

In [2]:
def add_tags(lexicons, word, tags):
    """Attach every label in ``tags`` to ``word``'s entry in ``lexicons``.

    Args:
        lexicons: dict mapping a word to ``{"tags": set, "meta": dict}``;
            mutated in place.
        word: key of the entry to update; an empty entry is created first
            when the word is not present yet.
        tags: iterable of labels merged into the entry's ``"tags"`` set.

    Returns:
        None.
    """
    entry = lexicons.setdefault(word, {"tags": set(), "meta": {}})
    entry["tags"].update(tags)

def add_meta(lexicons, word, meta):
    """Copy every key/value of ``meta`` into ``word``'s ``"meta"`` dict.

    Args:
        lexicons: dict mapping a word to ``{"tags": set, "meta": dict}``;
            mutated in place.
        word: key of the entry to update; an empty entry is created first
            when the word is not present yet.
        meta: mapping whose items are stored on the entry. Keys that already
            exist on the entry are overwritten.

    Returns:
        None.
    """
    entry = lexicons.setdefault(word, {"tags": set(), "meta": {}})
    stored = entry["meta"]
    for key in meta:
        stored[key] = meta[key]

In [5]:
def add_pronoun(lexicons):
    """Tag the pronouns listed in ``./PrivateSpace/pronouns.csv``.

    Each row's comma-separated "word" forms receive ``"pronoun"`` plus one
    ``"pronoun_<type>"`` label per "/"-separated value of "pronoun type".
    When the row has a "misspelt form" value, those comma-separated forms
    receive the same labels plus ``"misspelling"`` and
    ``"pronoun_misspelling"``.

    Args:
        lexicons: word -> entry dict, updated in place through ``add_tags``.

    Returns:
        The same ``lexicons`` object.
    """
    table = pd.read_csv("./PrivateSpace/pronouns.csv")
    for _, record in table.iterrows():
        kinds = record["pronoun type"].split("/")
        base_tags = ["pronoun"] + ["pronoun_" + kind for kind in kinds]
        # NOTE: the CSV's comma-separated "tags" column is not used as labels
        # (the code that appended it was disabled).

        for form in record["word"].split(","):
            add_tags(lexicons, form.strip(), base_tags)

        misspelt = record["misspelt form"]
        if not pd.isna(misspelt):
            misspelt_tags = base_tags + ["misspelling", "pronoun_misspelling"]
            for form in misspelt.split(","):
                add_tags(lexicons, form.strip(), misspelt_tags)
    return lexicons

# lexicons = {}
# add_pronoun(lexicons)
# lexicons

In [7]:
def add_particles(lexicons):
    """Tag the particles listed in ``./PrivateSpace/particles.csv``.

    Each row's comma-separated "word" forms receive ``"particles"`` and, when
    the row's "tags" cell is not missing, ``"particles_<tags>"`` (the cell is
    stripped and used as a single label suffix). When the row has a
    "misspelt form" value, those comma-separated forms receive the same
    labels plus ``"particles_misspelling"`` and ``"misspelling"``.

    Args:
        lexicons: word -> entry dict, updated in place through ``add_tags``.

    Returns:
        The same ``lexicons`` object.
    """
    table = pd.read_csv("./PrivateSpace/particles.csv")
    for _, record in table.iterrows():
        base_tags = ["particles"]
        subtype = record["tags"]
        if not pd.isna(subtype):
            base_tags.append("particles_" + subtype.strip())

        for form in record["word"].split(","):
            add_tags(lexicons, form.strip(), base_tags)

        misspelt = record["misspelt form"]
        if not pd.isna(misspelt):
            misspelt_tags = base_tags + ["particles_misspelling", "misspelling"]
            for form in misspelt.split(","):
                add_tags(lexicons, form.strip(), misspelt_tags)
    return lexicons

# lexicons = {}
# add_particles(lexicons)
# lexicons

In [9]:
def add_abbr(lexicons):
    """Tag the abbreviations listed in ``abbr_lexiconthai.csv``.

    Every value of the "อักษรย่อ" column is tagged ``"abbr"``; the row's
    "ชื่อเต็ม" value is stored as ``meta["full_word"]`` and its
    "แท็กหมวดหมู่" value, split on "|", as ``meta["category"]``.

    Args:
        lexicons: word -> entry dict, updated in place.

    Returns:
        The same ``lexicons`` object.
    """
    table = pd.read_csv("./PrivateSpace/thai-lm/abbr_lexiconthai.csv")
    for _, record in table.iterrows():
        abbreviation = record["อักษรย่อ"]
        add_tags(lexicons, abbreviation, ["abbr"])

        details = {
            "full_word": record["ชื่อเต็ม"],
            "category": record["แท็กหมวดหมู่"].split("|"),
        }
        add_meta(lexicons, abbreviation, details)

    return lexicons

# lexicons = {}
# add_abbr(lexicons)
# lexicons


In [10]:
def add_swear(lexicons):
    """Tag every value in the first column of ``swear_lexiconthai.txt`` as ``"swear"``.

    The file is parsed with ``pd.read_csv(..., header=None)``, so its first
    line is treated as data and columns are labelled with integers.

    Args:
        lexicons: word -> entry dict, updated in place through ``add_tags``.

    Returns:
        The same ``lexicons`` object.
    """
    table = pd.read_csv("./PrivateSpace/thai-lm/swear_lexiconthai.txt", header=None)
    for word in table[0]:
        add_tags(lexicons, word, ["swear"])
    return lexicons

In [11]:
def add_transliterated(lexicons):
    """Tag transliterated (loan) words from three text files as ``"transliterated"``.

    Sources, all read as UTF-8 with an optional BOM:

    * ``tubsub_ITdla.txt`` - one word per line.
    * ``tubsub_englishonline.txt`` - ``<english>=<thai>`` per line; the part
      after "=" is the word and the part before it is stored as
      ``meta["en"]``. Lines that do not split into exactly two parts on "="
      are ignored.
    * ``tubsub_khwamruphasathai.txt`` - space-separated words per line.

    Empty words are skipped for every source, so blank lines, repeated spaces
    or a line such as ``word=`` never register the empty string as a lexicon
    entry.

    Args:
        lexicons: word -> entry dict, updated in place.

    Returns:
        The same ``lexicons`` object.
    """
    with open("./PrivateSpace/thai-lm/tubsub_ITdla.txt", encoding="utf-8-sig") as fin:
        for line in fin:
            w = line.strip()
            if not w:
                continue
            add_tags(lexicons, w, ["transliterated"])

    with open("./PrivateSpace/thai-lm/tubsub_englishonline.txt", encoding="utf-8-sig") as fin:
        for line in fin:
            sp = line.strip().split("=")
            if len(sp) != 2:
                continue

            w = sp[1].strip()
            if not w:
                # "english=" with nothing after the separator: no word to tag.
                continue
            add_tags(lexicons, w, ["transliterated"])
            add_meta(lexicons, w, {
                "en": sp[0].strip(),
            })

    with open("./PrivateSpace/thai-lm/tubsub_khwamruphasathai.txt", encoding="utf-8-sig") as fin:
        for line in fin:
            for w in line.strip().split(" "):
                w = w.strip()
                if not w:
                    # split(" ") yields "" for blank lines and repeated spaces.
                    continue
                add_tags(lexicons, w, ["transliterated"])
    return lexicons

# Scratch run of the loader above: rebinds the notebook-level `lexicons` to a
# fresh dict and fills it in place. The trailing `;` suppresses the cell's
# output (the function returns the dict it was given).
lexicons = {}
add_transliterated(lexicons);

In [13]:
# from pythainlp import thai_consonants
# import requests
# from bs4 import BeautifulSoup
# import time
# from tqdm import tqdm
# lexicons = []
# for ch in tqdm(thai_consonants, total=len(thai_consonants)):
#     resp = requests.get("https://www.wordyguru.com/a/คำไทยที่มักเขียนผิด/category/alphabet/"+ch)
#     soup = BeautifulSoup(resp.content, "html.parser")
    
#     if resp.status_code!=200:
#         print("ERROR", ch)
#         continue
    
    
#     for div in soup.select("a.h3.item > div"):#.find_all("a", class_="h3 item"):
#         w = None
#         c = None
#         for ele in div.children:
#             if ele.name=="strong":
#                 c = ele.text.strip()
#             elif ele.name is None and len(ele.text.strip())>0:
#                 w = ele.text.strip()
        
#         if w is None or c is None:
#             print(div)
#             continue
            
#         lexicons.append({
#             "common_misp": w,
#             "correct": c
#         })
#     time.sleep(0.1)

In [14]:
# pd.DataFrame(lexicons).to_csv("./PrivateSpace/wordyguru_common_misppelling.csv", index=False)

In [15]:
# resp = requests.get("https://slang.in.th/by-alphabet")
# soup = BeautifulSoup(resp.content, "html.parser")


In [19]:
def _add_misspelt_forms(lexicons, forms, tags, correct):
    """Tag each non-empty comma-separated form in ``forms`` and record its correction.

    A missing (NaN) ``forms`` cell is ignored.
    """
    if pd.isna(forms):
        return
    for w in forms.split(","):
        w = w.strip()
        if not w:
            # Trailing or doubled commas would otherwise create an "" entry.
            continue
        add_tags(lexicons, w, tags)
        add_meta(lexicons, w, {"correct": correct})


def add_misspelling(lexicons):
    """Tag misspelt words from five sources and record their correct spelling.

    Sources:

    * ``vibut_selfadded.txt`` - one word per line, tagged ``"misspelling"``,
      ``"misspelling_intention"`` and ``"slang"`` (no correction stored).
    * ``vibut_wiki_uncyclo.txt`` - ``form[, form...] (correct)`` per line,
      same three tags. Lines that do not split into exactly two parts on "("
      are printed and skipped.
    * ``wiki_frequent_wrong.csv`` (tab-separated, with header),
      ``wiki_nongbot_replace.csv`` (tab-separated, no header) and
      ``wordyguru_common_misppelling.csv`` - comma-separated misspelt forms
      plus the correct word, tagged ``"misspelling"`` and
      ``"misspelling_common"``.

    Wherever a correct word is available it is stored as ``meta["correct"]``.
    For every source, empty forms are skipped; for the three CSV sources,
    rows whose misspelt-forms cell is missing are skipped as well.

    Args:
        lexicons: word -> entry dict, updated in place.

    Returns:
        The same ``lexicons`` object.
    """
    intentional_tags = ["misspelling", "misspelling_intention", "slang"]
    common_tags = ["misspelling", "misspelling_common"]

    with open("./PrivateSpace/thai-lm/vibut_selfadded.txt", encoding="utf-8") as fin:
        for line in fin:
            w = line.strip()
            if len(w) == 0:
                continue
            add_tags(lexicons, w, intentional_tags)

    with open("./PrivateSpace/thai-lm/vibut_wiki_uncyclo.txt", encoding="utf-8") as fin:
        for line in fin:
            sp = line.strip().split("(")
            if len(sp) != 2:
                # Surface lines that do not match "forms (correct)".
                print(sp)
                continue

            corr = sp[1].replace(")", "").strip()
            _add_misspelt_forms(lexicons, sp[0], intentional_tags, corr)

    df = pd.read_csv("./PrivateSpace/thai-lm/wiki_frequent_wrong.csv", sep="\t")
    for _, row in df.iterrows():
        _add_misspelt_forms(lexicons, row["มักเขียนผิดเป็น"], common_tags, row["คำที่เขียนถูก"])

    df = pd.read_csv("./PrivateSpace/thai-lm/wiki_nongbot_replace.csv", sep="\t", header=None)
    for _, row in df.iterrows():
        # header=None gives integer column labels: 0 = misspelt forms, 1 = correct word.
        _add_misspelt_forms(lexicons, row[0], common_tags, row[1])

    df = pd.read_csv("./PrivateSpace/wordyguru_common_misppelling.csv")
    for _, row in df.iterrows():
        _add_misspelt_forms(lexicons, row["common_misp"], common_tags, row["correct"])
    return lexicons

# lexicons = {}
# add_misspelling(lexicons);
# lexicons

In [20]:
import re
from pythainlp.util import countthai

# lexicons = []
# for div in soup.select(".css-ewqnig a"):
#     s = div.text
#     if "คำที่พิมมั่วๆ" in s:
#         continue
    
#     if countthai(s) < 50:
#         continue

#     words = re.split('/|,', s)
#     lexicons.extend([w.strip() for w in words])

# pd.DataFrame(lexicons).to_csv("./PrivateSpace/slang_in_th.csv", index=False)

In [23]:
def add_slang(lexicons):
    """Tag every value in the first column of ``slang_in_th.csv`` as ``"slang"``.

    Args:
        lexicons: word -> entry dict, updated in place through ``add_tags``.

    Returns:
        The same ``lexicons`` object.
    """
    df = pd.read_csv("./PrivateSpace/slang_in_th.csv")
    for _, row in df.iterrows():
        # The file is read with a header row, so the row Series is labelled by
        # column name. Use explicit positional access: integer keys on a
        # non-integer index via `row[0]` are a deprecated positional fallback
        # in pandas.
        raw = row.iloc[0]
        if pd.isna(raw):
            # Empty cells are read back as NaN, which has no .strip().
            continue
        w = raw.strip()
        add_tags(lexicons, w, ["slang"])
    return lexicons

# lexicons = {}
# add_slang(lexicons);
# lexicons

In [24]:
# 

In [25]:
# resp = requests.get("http://www.thai-language.com/category")
# soup = BeautifulSoup(resp.content, "html.parser")


In [26]:
# category = []
# for div in soup.select(".cat-link"):
#     link = div.find("a")
    
#     cat = link.text
#     cid = link.attrs["href"]

#     parent_cat = None
#     if div.parent.name !="input":
#         parent_cat = div.parent.attrs["id"].replace("sk", "")
        
#     category.append({
#         "category": cat,
#         "cid": cid,
#         "parent_cat": parent_cat
#     })

In [27]:
# all_lexicons = []
# for cat in tqdm(category, total=len(category)):
#     visited = False
#     for lex in all_lexicons:
#         if lex["ref"]["cid"]==cat["cid"]:
#             visited = True
    
#     if visited:
#         continue

#     resp = requests.get("http://www.thai-language.com"+cat["cid"])
#     soup = BeautifulSoup(resp.content, "html.parser")
    
#     if resp.status_code!=200:
#         print("ERROR", cat)
#         continue
        
#     lexicons = {
#         "ref": cat,
#         "category": "",
#         "category_th": "",
#         "words": [],
#         "phrase": []
#     }

#     for tableidx, table in enumerate(soup.select("#old-content > table")):
#         # Category Row
#         if len(table.select(".th2")) !=0:
#             cols = []
#             for span in table.select("td span"):
#                 cols.append(span.text.strip())

#             lexicons["category"] = cols[1]
#             lexicons["category_th"] = cols[0]
#             continue

#         _tmp = list(table.children)
#         for row in _tmp[0].children:

#             # Word Row
#             words_dom = row.findChildren("td", {'class': 'th'}, recursive=False)
#             if len(words_dom) !=0:
#                 cols = []
#                 for w_dom in row.select("td"):
#                     cols.append(w_dom.text.strip())

#                 lexicons["words"].append({
#                     "word": cols[0],
#                     "meaning": cols[2],
#                     "pronoun": cols[1],
#                 })
#                 continue

#             # Phrase
#             words_dom = row.select("td > div.igt")
#             if len(words_dom) !=0:
#                 cols = []
#                 for w_dom in words_dom[0].children:
#                     if len(w_dom.text.strip())==0:
#                         continue

#                     cols.append(w_dom.text)
                
#                 if len(cols)==2:
#                     print(cols)
#                     cols.append("")
#                 lexicons["phrase"].append({
#                     "word": cols[0],
#                     "meaning": cols[2],
#                     "pronoun": cols[1],
#                 })

#     all_lexicons.append(lexicons)
# #     if len(all_lexicons)==2:
# #         break
#     time.sleep(0.1)

In [28]:
# len(all_lexicons)

In [29]:
from utils import dump_jsonl, load_jsonl

In [30]:
# dump_jsonl("./PrivateSpace/word_category.jsonl", all_lexicons)

In [32]:
from pythainlp.util import countthai

def get_parents(df, category):
    """Return the names of ``category``'s ancestors, nearest parent first.

    Args:
        df: iterable of category records; each has a ``"ref"`` dict with
            ``"cid"``, ``"category"`` and ``"parent_cat"`` keys.
        category: the record whose ancestors are looked up.

    Returns:
        A list of ``ref["category"]`` names. A record counts as the parent
        when its ``cid`` equals ``"/id/" + parent_cat``; each matching record
        contributes its own name followed by its own ancestors. Empty when
        ``parent_cat`` is ``None``.
    """
    parent_id = category["ref"]["parent_cat"]
    if parent_id is None:
        return []

    ancestors = []
    for record in df:
        ref = record["ref"]
        if ref["cid"] != "/id/" + parent_id:
            continue
        ancestors.append(ref["category"])
        ancestors.extend(get_parents(df, record))

    return ancestors
    
def add_word_category(lexicons):
    """Tag words with their category path from ``word_category.jsonl``.

    For every category record the chain root -> ... -> category is built from
    ``get_parents`` and turned into cumulative paths joined by " >> " (for
    example ``"A"``, ``"A >> B"``). Each word of the category receives one
    ``"cat:<path>"`` tag per path.

    Words whose ``countthai`` score is below 50 are skipped.
    NOTE(review): ``countthai`` presumably returns the percentage of Thai
    characters in the text - confirm against the pythainlp documentation.

    Args:
        lexicons: word -> entry dict, updated in place through ``add_tags``.

    Returns:
        The same ``lexicons`` object.
    """
    categories = load_jsonl("./PrivateSpace/word_category.jsonl")
    for category in categories:
        ancestors = get_parents(categories, category)
        lineage = [category["ref"]["category"], *ancestors]

        # Walk from the outermost ancestor down, accumulating the path.
        paths = []
        trail = ""
        for name in reversed(lineage):
            trail = (trail + " >> " if trail else "") + name
            paths.append(trail)

        tags = ["cat:" + path for path in paths]
        for item in category["words"]:
            if countthai(item["word"]) < 50:
                continue

            add_tags(lexicons, item["word"].strip(), tags)
    return lexicons

# lexicons = {}
# add_word_category(lexicons);


In [33]:
from pythainlp.util import countthai

def load_txt(in_path):
    """Read a UTF-8 text file (optional BOM) into a list of stripped lines.

    Args:
        in_path: path of the file to read.

    Returns:
        One string per line of the file, with surrounding whitespace removed.
        Blank lines are kept as empty strings.
    """
    with open(in_path, encoding="utf-8-sig") as fin:
        return [line.strip() for line in fin]

def add_sentiment(lexicons):
    """Tag sentiment words from the ``lexicon-thai/sentiment`` word lists.

    Words from the three negative lists receive ``"sentiment"`` and
    ``"sentiment_negative"``; words from the three positive lists receive
    ``"sentiment"`` and ``"sentiment_positive"``. Empty lines in the word
    lists are skipped so the empty string is never registered as a word.

    Args:
        lexicons: word -> entry dict, updated in place through ``add_tags``.

    Returns:
        The same ``lexicons`` object.
    """
    base_dir = "./PrivateSpace/lexicon-thai/sentiment/"
    # (polarity tag, word-list file names). The double underscores in two of
    # the positive file names are part of the names used on disk.
    sources = (
        ("sentiment_negative",
         ("negative_adjectives.txt", "negative_new.txt", "negative_verbs.txt")),
        ("sentiment_positive",
         ("positive__adjectives.txt", "positive_new.txt", "positive__verbs.txt")),
    )

    for polarity_tag, file_names in sources:
        words = []
        for file_name in file_names:
            words += load_txt(base_dir + file_name)

        for w in words:
            if not w:
                # load_txt keeps blank lines as "".
                continue
            add_tags(lexicons, w, ["sentiment", polarity_tag])

    return lexicons

# Scratch run of the sentiment loader: rebinds the notebook-level `lexicons`
# to a fresh dict and fills it in place. The trailing `;` suppresses the
# cell's output (the function returns the dict it was given).
lexicons = {}
add_sentiment(lexicons);


In [34]:
ls ./PrivateSpace/th-misspelling-correction/MisspellingIntention/annotated/all.jsonl

./PrivateSpace/th-misspelling-correction/MisspellingIntention/annotated/all.jsonl


In [35]:
from itertools import groupby

def rm_reptitive(text):
    """Collapse runs of three or more identical characters.

    Args:
        text: the string to normalise.

    Returns:
        ``text`` where every run of at least three equal consecutive
        characters is replaced by exactly three of them followed by the
        literal marker ``"REP"``; shorter runs are kept unchanged.
    """
    pieces = []
    for char, run in groupby(text):
        members = list(run)
        pieces.append(f"{char * 3}REP" if len(members) >= 3 else "".join(members))
    return "".join(pieces)

# print(rm_reptitive("HELLOO OOOO"))
def add_mispelling_intention(lexicons):
    """Tag annotated misspellings from the MisspellingIntention ``all.jsonl`` file.

    Each record's ``"misp"`` text is normalised with ``rm_reptitive``; results
    shorter than three characters are ignored. What happens next depends on
    the record's ``"label"``:

    * ``"with_semantics"`` - tagged ``"misspelling"`` and
      ``"misspelling_intention"``.
    * ``"abbr"`` - tagged ``"misspelling"`` and ``"misspelling_shorten"``.
    * ``"unintentional"``, ``"transliteration"``, ``"not_sure"`` - skipped.
    * anything else - the label is printed and processing stops, so the
      remaining records are not loaded.

    Tagged words also get the record's ``"corr"`` value as ``meta["correct"]``.

    Args:
        lexicons: word -> entry dict, updated in place.

    Returns:
        The same ``lexicons`` object.
    """
    data = load_jsonl("./PrivateSpace/th-misspelling-correction/MisspellingIntention/annotated/all.jsonl")
    for row in data:
        w = rm_reptitive(row["misp"])
        corr = row["corr"]

        if len(w) < 3:
            continue

        label = row["label"]
        if label in ("unintentional", "transliteration", "not_sure"):
            # These annotation classes are deliberately left out of the lexicon.
            continue

        if label == "with_semantics":
            add_tags(lexicons, w, ["misspelling", "misspelling_intention"])
            add_meta(lexicons, w, {"correct": corr})
        elif label == "abbr":
            add_tags(lexicons, w, ["misspelling", "misspelling_shorten"])
            add_meta(lexicons, w, {"correct": corr})
        else:
            # Unknown annotation class: report it and abort the load.
            print(label)
            break
    return lexicons

# Scratch run of the misspelling-intention loader: rebinds the notebook-level
# `lexicons` to a fresh dict and fills it in place. The trailing `;`
# suppresses the cell's output (the function returns the dict it was given).
lexicons = {}
add_mispelling_intention(lexicons);


Loaded 19402 records from ./PrivateSpace/th-misspelling-correction/MisspellingIntention/annotated/all.jsonl


In [36]:
# Build the full lexicon from scratch by running every loader, in this order,
# against one shared dict. Each loader mutates the dict in place.
lexicons = {}
for loader in (
    add_sentiment,
    add_word_category,
    add_slang,
    add_misspelling,
    add_transliterated,
    add_abbr,
    add_pronoun,
    add_particles,
    add_swear,
    add_mispelling_intention,
):
    loader(lexicons)

Loaded 508 records from ./PrivateSpace/word_category.jsonl
Loaded 19402 records from ./PrivateSpace/th-misspelling-correction/MisspellingIntention/annotated/all.jsonl


In [37]:
# Number of distinct words collected.
len(lexicons)

25573

In [38]:
# Convert every tag set to a list so the entries can be serialised. The tags
# are sorted because set iteration order for strings can differ between
# kernel sessions, which would make the dumped file differ from run to run.
# Re-running this cell is harmless: an already-converted list is just
# re-sorted.
for entry in lexicons.values():
    entry["tags"] = sorted(entry["tags"])

In [718]:
# Write one (word, entry) pair per record.
dump_jsonl("lexicons.jsonl", list(lexicons.items()))

Wrote 25573 records to lexicons.jsonl
