In [1]:
from pythainlp.util import normalize, isthai
from pythainlp.ulmfit import (process_thai, ungroup_emoji, remove_space, rm_useless_spaces,
                              rm_useless_newlines, rm_brackets, replace_wrep_post_nonum, replace_rep_nonum, replace_wrep_post_nonum)


In [2]:
# Sample sentence mixing Thai, emoji and Latin text, used to exercise the cleaning rules.
text = "บ้านนนนน () อยู่นานนานนาน เกา่เกา้ 😂🤣😃😄😅 PyThaiNLP amp;  จุง เทอ  "

# Callables handed to process_thai as `pre_rules`.
# `normalize` is pythainlp.util.normalize.
PRE_RULES = [
    normalize,
    rm_useless_spaces,
    rm_useless_newlines,
    rm_brackets,
    replace_rep_nonum,
]

# Callables handed to process_thai as `post_rules`.
# NOTE(review): replace_wrep_post_nonum appears twice in this list;
# confirm whether the second entry is intentional.
POST_RULES = [
    replace_wrep_post_nonum,
    replace_wrep_post_nonum,
    remove_space,
    ungroup_emoji,
]

ptext = process_thai(text, pre_rules=PRE_RULES, post_rules=POST_RULES)
ptext

['บ้าน',
 'xxrep',
 'อยู่',
 'xxwrep',
 'นาน',
 'เก่า',
 'เก้า',
 '😂',
 '🤣',
 '😃',
 '😄',
 '😅',
 'PyThaiNLP',
 'amp',
 ';',
 'จุ',
 'ง',
 'เท',
 'อ']

In [3]:
# Keep only tokens that pass isthai() and are not one of the repetition markers.
# This rebinds `ptext`, so the unfiltered token list is no longer available afterwards.
MARKER_TOKENS = ['xxrep', 'xxwrep']
ptext = [
    token
    for token in ptext
    if isthai(token) and token not in MARKER_TOKENS
]
ptext

['บ้าน', 'อยู่', 'นาน', 'เก่า', 'เก้า', 'จุ', 'ง', 'เท', 'อ']

In [11]:
from pythainlp import word_vector

# Build the PyThaiNLP wrapper for the "thai2fit_wv" vectors, then pull out the
# underlying model object that the following cells query.
thai2fit_wrapper = word_vector.WordVector(model_name="thai2fit_wv")
model = thai2fit_wrapper.get_model()

In [13]:
# Vocabulary in the model's own order.
# NOTE: the name `words` is rebound to a function by a later cell.
words = model.index_to_key

# Map each vocabulary entry to its position in that order (0 = first entry).
# presumably the order is most-frequent-first, which is why position is used as a
# popularity proxy downstream — confirm against the model.
w_rank = {token: position for position, token in enumerate(words)}

# Lookup table used by the spelling-correction helpers.
WORDS = w_rank

In [14]:
# Alphabet that edits1() draws from when generating replacements and insertions.
# The characters ฤ, ๅ and ฦ each occur twice in this string; edits1() collects its
# candidates in a set, so the repeats only produce redundant work, not extra results.
thai_letters = 'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤฤๅลฦฦๅวศษสหฬอฮะัาำิีึืุูเแโใไ็่้๊๋์'

def words(text):
    """Return every run of regex word characters in `text`, lowercased, in order.

    Note that this definition rebinds the name `words`, which an earlier cell
    used for the model vocabulary list.
    """
    # Imported locally because the notebook never imports `re`; without this the
    # function raised NameError on its first call.
    import re
    # NOTE(review): Python's \w does not match combining marks, so Thai words that
    # carry tone or vowel marks are likely split at those marks — confirm before
    # using this helper on Thai text.
    return re.findall(r'\w+', text.lower())

def P(word):
    """Score `word` for ranking spelling candidates; a higher score is preferred.

    The score is the negated rank stored in WORDS, so the rank-0 entry scores 0
    and every later entry scores lower.

    A word that is missing from WORDS scores -len(WORDS). With ranks running
    0..len(WORDS)-1 (as produced by enumerate over the vocabulary) that is
    strictly below every known word. The earlier fallback of 0 made unknown
    words tie with the top-ranked entry and outrank everything else.
    """
    return -WORDS.get(word, len(WORDS))

def correction(word):
    """Return the candidate for `word` that P() scores highest."""
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word):
    """Return the first non-empty tier of spelling candidates for `word`.

    Tiers, in order: the word itself if it is known, known words one edit away,
    known words two edits away, and finally the single-element list [word].
    Later tiers are only computed when every earlier tier is empty.
    """
    exact = known([word])
    if exact:
        return exact

    one_away = known(edits1(word))
    if one_away:
        return one_away

    two_away = known(edits2(word))
    if two_away:
        return two_away

    # Nothing in the dictionary is close enough: hand the input back unchanged.
    return [word]

def known(words):
    """Return the set of items from `words` that are keys of WORDS."""
    found = set()
    # Consume `words` one item at a time so a generator argument is never materialized.
    for item in words:
        if item in WORDS:
            found.add(item)
    return found

def edits1(word):
    """Return the set of strings exactly one edit operation away from `word`.

    The edit operations are: delete one character, swap two adjacent characters,
    replace one character with a letter from `thai_letters`, or insert one such
    letter at any position.
    """
    alphabet = thai_letters
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every cut point, including the very end.
        for letter in alphabet:
            results.add(head + letter + tail)

        if not tail:
            continue

        # Deletion of the character right after the cut.
        results.add(head + tail[1:])

        # Transposition of the two characters right after the cut.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])

        # Replacement of the character right after the cut.
        for letter in alphabet:
            results.add(head + letter + tail[1:])
    return results

def edits2(word):
    """Lazily yield every string obtained by applying edits1() twice to `word`.

    The result is a generator and may contain repeated strings.
    """
    # The first round is computed up front; the second round is produced on demand.
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

In [15]:
# Spot check: the recorded output 'นะคะ' is the input with its tone mark deleted (one edit).
correction('นะค่ะ')


'นะคะ'

In [17]:
# Spot check: the recorded output 'เธอ' replaces ท with ธ (one edit).
correction('เทอ')

'เธอ'

In [18]:
# Spot check: the recorded output 'จึง' replaces the vowel mark ุ with ึ (one edit).
correction('จุง')

'จึง'