# Load Dictionary

In [1]:
import json

def read_json(fname, key_int=False):
    """Load a JSON file and return its decoded content.

    Args:
        fname: Path of the JSON file to read.
        key_int: When True, the top-level JSON object is returned as a new
            dict whose keys are converted with ``int``. JSON object keys are
            always strings, so this restores integer keys after a round trip.

    Returns:
        The decoded JSON value, or a dict with ``int`` keys when ``key_int``
        is True.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
        ValueError: If ``key_int`` is True and a key is not an integer string.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere and would corrupt or reject
    # non-ASCII characters.
    with open(fname, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    if not key_int:
        return json_data

    return {int(key): value for key, value in json_data.items()}

In [2]:
# CHARS: its length sets the width of the one-hot vectors built in prepare_input.
CHARS = read_json('CHARS.json')
# CHAR_INDICES: looked up by character to get that character's one-hot column.
CHAR_INDICES = read_json('CHAR_INDICES.json')
# INDICES_CHAR: looked up by predicted index to get a character. JSON object
# keys are always strings, so key_int=True converts them back to int.
INDICES_CHAR = read_json('INDICES_CHAR.json', key_int=True)

# Load Model

In [3]:
from keras.models import load_model

# Load the saved Keras model once; the prediction helpers call MODEL.predict
# on the one-hot arrays produced by prepare_input.
MODEL = load_model('model.h5')

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


# Make Predictive Keyboard

In [4]:
from pythainlp.tokenize import DEFAULT_DICT_TRIE
from pythainlp import word_tokenize
import numpy as np

In [5]:
def prepare_input(text, look_back=40):
    """One-hot encode the tail of ``text`` for the model.

    Only the last ``look_back`` characters are kept, lower-cased, and written
    from position 0 onward; positions past the end of a shorter text stay
    all-zero. Characters missing from ``CHAR_INDICES`` are left as all-zero
    rows.

    Args:
        text: Input string.
        look_back: Number of trailing characters to encode.

    Returns:
        A numpy array of shape ``(1, look_back, len(CHARS))``.
    """
    window = text[-look_back:].lower()  # keep only the latest characters
    encoded = np.zeros((1, look_back, len(CHARS)))
    for position, char in enumerate(window):
        if char not in CHAR_INDICES:
            # Unknown character: leave this time step empty.
            continue
        encoded[0, position, CHAR_INDICES[char]] = 1.
    return encoded


def sample(arr, top_n=1):
    '''Return the indices of the ``top_n`` largest values, best first.

    Args:
        arr: Array-like of numeric scores; it is flattened before ranking.
        top_n: How many indices to return. Values larger than the number of
            scores are capped, so an index is never returned twice.

    Returns:
        A list of int indices ordered from the highest score down. Ties keep
        the lower index first, and NaN scores rank last.
    '''
    scores = np.asarray(arr, dtype=float).ravel()
    count = max(0, min(top_n, scores.size))
    # A stable sort on the negated scores ranks values in descending order
    # without a magic "-9999" sentinel, which would misrank any score at or
    # below that value.
    order = np.argsort(-scores, kind='stable')
    return order[:count].tolist()


def merge_token(tokens):
    """Join the last two tokens into one.

    Args:
        tokens: List of string tokens.

    Returns:
        A new list in which the final two tokens are concatenated. A list
        with fewer than two tokens has nothing to merge and is returned as
        an unchanged copy instead of raising IndexError.
    """
    if len(tokens) < 2:
        return list(tokens)
    return tokens[:-2] + [tokens[-2] + tokens[-1]]


def predict_completion(text, _, max_len=100):
    """Greedily extend ``text`` one character at a time until ``'|'``.

    Each step encodes the current text, takes the single most likely next
    character from the model, appends it, and drops the first character of
    the text.

    Args:
        text: Context string fed to ``prepare_input``.
        _: Unused; kept so existing callers that pass a second argument
            still work.
        max_len: Upper bound on generated characters. Without it the loop
            would never end if the model never predicts ``'|'``.

    Returns:
        The generated characters with any ``'|'`` removed.
    """
    completion = ''
    for _step in range(max_len):
        x = prepare_input(text)
        preds = MODEL.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = INDICES_CHAR[next_index]

        text = text[1:] + next_char
        completion += next_char
        if next_char == '|':
            break
    return completion.replace('|', '')  # remove '|'
        
        
def predict_completions(text, n=3, mingen=2):
    """Build one completion for each of the ``n`` most likely next characters.

    Args:
        text: Context string fed to ``prepare_input``.
        n: Number of candidate first characters to expand.
        mingen: Forwarded as the second argument of ``predict_completion``.

    Returns:
        A list of strings, each a candidate first character followed by the
        text ``predict_completion`` generates after it.
    """
    encoded = prepare_input(text)
    scores = MODEL.predict(encoded, verbose=0)[0]
    shifted = text[1:]  # drop the first character before appending the candidate

    results = []
    for idx in sample(scores, n):
        first_char = INDICES_CHAR[idx]
        remainder = predict_completion(shifted + first_char, mingen)
        results.append(first_char + remainder)
    return results


def prepare_text(text, merge=True):
    """Tokenize ``text`` and join the tokens with ``'|'``.

    Args:
        text: Raw input string.
        merge: Only the exact value ``True`` makes the last two tokens get
            merged through ``merge_token`` before joining.

    Returns:
        The tokens produced by ``word_tokenize`` (``newmm`` engine), joined
        by ``'|'``.
    """
    pieces = word_tokenize(text, engine='newmm')
    if merge is not True:
        return '|'.join(pieces)
    return '|'.join(merge_token(pieces))

In [6]:
def select_indict(last_token, preds):
    """Split predictions into dictionary-backed groups.

    Args:
        last_token: The last token of the prepared input text.
        preds: Iterable of predicted continuation strings.

    Returns:
        A ``(correct, predictive)`` tuple of lists. ``correct`` holds
        ``last_token + pred`` whenever that string is in
        ``DEFAULT_DICT_TRIE``. Otherwise ``predictive`` holds ``pred`` when
        both ``last_token`` and ``pred`` are in ``DEFAULT_DICT_TRIE``.
        Predictions matching neither rule are dropped.
    """
    correct = []
    predictive = []
    for pred in preds:
        candidate = last_token + pred
        if candidate in DEFAULT_DICT_TRIE:
            correct.append(candidate)
            continue
        if last_token in DEFAULT_DICT_TRIE and pred in DEFAULT_DICT_TRIE:
            predictive.append(pred)
    return correct, predictive


def del_subset(tokens):
    """Drop every token that is a proper substring of another token.

    Args:
        tokens: List of strings.

    Returns:
        A new list, in the original order, of the tokens that are not
        contained in any different token of the list. Equal duplicates are
        all kept.
    """
    return [
        candidate
        for candidate in tokens
        if not any(candidate in other and candidate != other for other in tokens)
    ]

In [7]:
# Sample input texts consumed by the demo loop that iterates over `texts`.
# NOTE(review): presumably Thai sentences cut off mid-word so the model has
# something to complete -- confirm against the original notebook.

texts = [
    '‡πÉ‡∏ô‡∏ó‡∏≤‡∏á‡∏î‡πâ‡∏≤‡∏ô‡∏™‡∏±‡∏á‡∏Ñ‡∏°‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå‡∏õ‡∏±‡∏ç‡∏´‡∏≤‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏£‡∏´‡∏ô‡∏∂‡πà',
    '‡πÄ‡∏à‡∏≠‡∏£‡πâ‡∏≤‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏≠‡∏£‡πà‡∏≠‡∏¢‡πÜ ‡∏ñ‡∏∑‡∏≠‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç',
    '‡∏Ñ‡∏π‡πà‡∏°‡∏∑‡∏≠‡∏Å‡∏≤‡∏£‡∏ï‡∏¥‡∏î‡∏ï‡∏≤‡∏°‡πÅ‡∏•‡∏∞‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏Å‡∏≤',
    '‡∏™‡∏∏‡∏ô‡∏±‡∏Ç ‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏±‡∏ï‡∏ß‡πå‡∏°‡∏µ‡πÄ‡∏Ç‡∏µ‡πâ‡∏¢‡∏ß‡∏ä‡∏ô‡∏¥‡∏î‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏á‡∏ó‡∏µ‡∏ñ‡∏π‡∏Å‡∏Ñ‡∏±‡∏î‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ú‡∏™‡∏°‡∏û‡∏±',
    '‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡∏£‡∏π‡πâ‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Å‡∏±‡∏ö‡∏à‡∏¥‡∏ï‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏ó‡∏µ‡πà‡∏≠‡∏¢‡∏π‡πà‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏´‡∏•‡∏±',
    "'‡πÅ‡∏°‡∏ßüê±' ‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏±‡∏ï‡∏ß‡πå‡πÄ‡∏•‡∏µ‡πâ‡∏¢‡∏á‡∏•‡∏π‡∏Å‡∏î‡πâ‡∏ß‡∏¢‡∏ô‡∏° ‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡∏ï‡∏£",
    '‡∏Ç‡πâ‡∏≤‡∏ß‡∏°‡∏±‡∏ô‡πÑ‡∏Å‡πà ‡πÄ‡∏õ‡πá‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏Ñ‡∏≤‡∏ß‡∏Ç‡∏≠‡∏á‡πÑ‡∏ó‡∏¢‡πÅ‡∏•‡∏∞‡∏à‡∏µ‡∏ô ‡∏Ñ‡∏≤‡∏î‡∏ß‡πà‡∏≤‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏ä‡∏ô‡∏¥‡∏î‡∏ô‡∏µ‡πâ‡πÑ‡∏î‡πâ‡∏£‡∏±‡∏ö‡∏Å‡∏≤‡∏£‡πÄ‡∏ú‡∏¢‡πÅ‡∏û‡∏£‡πà',
]

In [8]:
# For each input text, print the dictionary-backed "correct" and "predictive"
# suggestions gathered from two tokenizations of the same text.

for text in texts:
    # Variant 1 keeps the tokenizer output as is; variant 2 merges the last
    # two tokens before joining with '|'.
    text1 = prepare_text(text, False)
    text2 = prepare_text(text, True)
    
    # Expand the 10 most likely next characters for each variant.
    completions1 = predict_completions(text1, 10)
    completions2 = predict_completions(text2, 10)
    
    # The last '|'-separated piece is the token being completed.
    correct1, predictive1 = select_indict(text1.split('|')[-1], completions1)
    correct2, predictive2 = select_indict(text2.split('|')[-1], completions2)

    # Pool both variants and drop entries contained in a longer entry.
    correct = del_subset(correct1 + correct2)
    predictive = del_subset(predictive1 + predictive2)
    
    print('Text: ', text)
    print('Correct: ', correct)
    print('Predictive: ', predictive, end='\n\n')

Text:  ‡πÉ‡∏ô‡∏ó‡∏≤‡∏á‡∏î‡πâ‡∏≤‡∏ô‡∏™‡∏±‡∏á‡∏Ñ‡∏°‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå‡∏õ‡∏±‡∏ç‡∏´‡∏≤‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏£‡∏´‡∏ô‡∏∂‡πà
Correct:  ['‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏£‡∏´‡∏ô‡∏∂‡πà‡∏á']
Predictive:  []

Text:  ‡πÄ‡∏à‡∏≠‡∏£‡πâ‡∏≤‡∏ô‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡∏≠‡∏£‡πà‡∏≠‡∏¢‡πÜ ‡∏ñ‡∏∑‡∏≠‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç
Correct:  []
Predictive:  ['‡∏™‡∏£', '‡∏û‡∏£', '‡∏Ç‡∏≤']

Text:  ‡∏Ñ‡∏π‡πà‡∏°‡∏∑‡∏≠‡∏Å‡∏≤‡∏£‡∏ï‡∏¥‡∏î‡∏ï‡∏≤‡∏°‡πÅ‡∏•‡∏∞‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏à‡∏±‡∏î‡∏Å‡∏≤
Correct:  ['‡∏Å‡∏≤‡∏•', '‡∏Å‡∏≤‡∏¢', '‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£']
Predictive:  []

Text:  ‡∏™‡∏∏‡∏ô‡∏±‡∏Ç ‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏±‡∏ï‡∏ß‡πå‡∏°‡∏µ‡πÄ‡∏Ç‡∏µ‡πâ‡∏¢‡∏ß‡∏ä‡∏ô‡∏¥‡∏î‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏á‡∏ó‡∏µ‡∏ñ‡∏π‡∏Å‡∏Ñ‡∏±‡∏î‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡∏ú‡∏™‡∏°‡∏û‡∏±
Correct:  ['‡∏û‡∏±‡∏Å', '‡∏û‡∏±‡∏î', '‡∏û‡∏±‡∏ô', '‡∏û‡∏±‡∏í‡∏ô‡∏≤', '‡∏û‡∏±‡∏á', '‡∏û‡∏±‡∏ä']
Predictive:  []

Text:  ‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡∏£‡∏π‡πâ‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Å‡∏±‡∏ö‡∏à‡∏¥‡∏ï‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏ó‡∏µ‡πà‡∏≠‡∏¢‡∏π‡πà‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏´‡∏•‡∏±
Correct:  ['‡∏´‡∏•‡∏±‡∏ö', '‡∏´‡∏•‡∏±‡∏î', '‡∏´‡∏•‡∏±‡∏Å', '‡∏´‡∏•‡∏