In [2]:
import re
import numpy as np
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModel
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())  
words = Counter(words(open('./data/dict.txt').read()))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(words)

Counter({'toyota': 1, 'avanza': 1, 'innova': 1, 'kijang': 1, 'agya': 1, 'calya': 1, 'fortuner': 1, 'yaris': 1, 'honda': 1, 'brio': 1, 'hrv': 1, 'brv': 1, 'crv': 1, 'crz': 1, 'mobilio': 1, 'jazz': 1, 'daihatsu': 1, 'go': 1, 'panca': 1, 'ayla': 1})


In [4]:
class SpellCorrector:
    """
    The SpellCorrector extends the functionality of the Peter Norvig's
    spell-corrector in http://norvig.com/spell-correct.html
    """

    def __init__(self):
        """
        :param corpus: the statistics from which corpus to use for the spell correction.
        """
        super().__init__()
        self.WORDS = words

    @staticmethod
    def edit_step(word):
        """
        All edits that are one edit away from `word`.
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """
        All edits that are two edits away from `word`.
        """
        return (e2 for e1 in self.edit_step(word)
                for e2 in self.edit_step(e1))

    def known(self, words):
        """
        The subset of `words` that appear in the dictionary of WORDS.
        """
        return set(w for w in words if w in self.WORDS)

    def candidates(self, word):
        ttt = self.known(self.edit_step(word)) or self.known(self.edits2(word)) or {word}
        ttt = self.known([word]) | ttt
        return list(ttt)

In [5]:
text = 'toyota apanja'
text_mask = text.replace('apanja', '**mask**')
text_mask

'toyota **mask**'

In [6]:
corrector = SpellCorrector()
possible_states = corrector.candidates('apanja')
replaced_masks = [text_mask.replace('**mask**', state) for state in possible_states]
replaced_masks

['toyota panca', 'toyota avanza']

In [7]:
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
model = TFAutoModel.from_pretrained("indobenchmark/indobert-base-p2", from_pt=True)


All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [8]:
def get_ids(mask):
    tokens = tokenizer.tokenize(mask)

    input_ids =[]
    for i in range(len(tokens)):
        masked_tokens = tokens[:]
        masked_tokens[i] = "[MASK]"
        masked_tokens = ["[CLS]"] + masked_tokens + ["[SEP]"]
        masked_ids = tokenizer.convert_tokens_to_ids(masked_tokens)
        input_ids.append(masked_ids)


    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    return tokens, input_ids, tokens_ids

ids = [get_ids(mask) for mask in replaced_masks]
tokens, input_ids, tokens_ids = list(zip(*ids))

print(tokens)
print(input_ids)
print(tokens_ids)


(['toyota', 'panca'], ['toyota', 'avanza'])
([[2, 4, 15872, 3], [2, 4966, 4, 3]], [[2, 4, 11361, 3], [2, 4966, 4, 3]])
([4966, 15872], [4966, 11361])


In [9]:
indices, ids = [], []
for i in range(len(input_ids)):
    indices.extend([i] * len(input_ids[i]))
    ids.extend(input_ids[i])

indices = np.array(indices) 
    
print(ids)
print(indices)

[[2, 4, 15872, 3], [2, 4966, 4, 3], [2, 4, 11361, 3], [2, 4966, 4, 3]]
[0 0 1 1]


In [10]:
masked_padded = tf.keras.preprocessing.sequence.pad_sequences(ids, padding='post')
print(masked_padded)

[[    2     4 15872     3]
 [    2  4966     4     3]
 [    2     4 11361     3]
 [    2  4966     4     3]]


In [11]:
outputs = model(masked_padded)
logits = outputs.last_hidden_state
preds = tf.nn.softmax(logits, axis=-1)


ValueError: Argument `axis` = 2 not in range [-2, 2)

In [None]:

scores = []

for i in range(len(tokens)):
    filter_preds = preds[indices == i]
    total = tf.reduce_sum(filter_preds, axis=2).numpy().flatten()

    sum = 0
    for k in range(len(total)):
        sum += total[k]
    
    scores.append(sum)
    
scores

[7.999999761581421, 7.999999701976776]

In [None]:
prob_scores = np.array(scores) / np.sum(scores)
probs = list(zip(possible_states, prob_scores))
probs.sort(key = lambda x: x[1])  
probs

[('avanza', 0.4999999981373548), ('panca', 0.5000000018626453)]