https://github.com/huseinzol05/NLP-Models-Tensorflow/tree/master/spelling-correction

In [0]:
!pip install ekphrasis
!apt install enchant
!pip install pyenchant
!apt-get install myspell-pt-br
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip
# data from https://github.com/cbaziotis/ekphrasis/blob/master/ekphrasis/utils/helpers.py
# reuploaded to husein's S3
!wget https://malaya-dataset.s3-ap-southeast-1.amazonaws.com/counts_1grams.txt
!pip install bert-tensorflow

In [0]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [0]:
with open('counts_1grams.txt') as fopen:
    f = fopen.read().split('\n')[:-1]
    
words = {}
for l in f:
    w, c = l.split('\t')
    c = int(c)
    words[w] = c + words.get(w, 0)

In [0]:
# original from https://github.com/cbaziotis/ekphrasis/blob/master/ekphrasis/classes/spellcorrect.py
# improved it

import re
from collections import Counter

class SpellCorrector:
    """
    The SpellCorrector extends the functionality of the Peter Norvig's
    spell-corrector in http://norvig.com/spell-correct.html
    """

    def __init__(self):
        """
        :param corpus: the statistics from which corpus to use for the spell correction.
        """
        super().__init__()
        self.WORDS = words
        self.N = sum(self.WORDS.values())
        
    @staticmethod
    def tokens(text):
        return REGEX_TOKEN.findall(text.lower())

    def P(self, word):
        """
        Probability of `word`.
        """
        return self.WORDS[word] / self.N

    def most_probable(self, words):
        _known = self.known(words)
        if _known:
            return max(_known, key=self.P)
        else:
            return []

    @staticmethod
    def edit_step(word):
        """
        All edits that are one edit away from `word`.
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """
        All edits that are two edits away from `word`.
        """
        return (e2 for e1 in self.edit_step(word)
                for e2 in self.edit_step(e1))

    def known(self, words):
        """
        The subset of `words` that appear in the dictionary of WORDS.
        """
        return set(w for w in words if w in self.WORDS)

    def edit_candidates(self, word, assume_wrong=False, fast=True):
        """
        Generate possible spelling corrections for w!pip install bert-tensorfloword.
        """

        if fast:
            ttt = self.known(self.edit_step(word)) or {word}
        else:
            ttt = self.known(self.edit_step(word)) or self.known(self.edits2(word)) or {word}
        
        ttt = self.known([word]) | ttt
        return list(ttt)

In [0]:
corrector = SpellCorrector()

In [173]:
# modificar para todas as palavras
from enchant import DictWithPWL
from enchant.checker import SpellChecker
from copy import deepcopy
my_dict = DictWithPWL("en_US", "mywords.txt")
my_checker = SpellChecker(my_dict)
text = "This is sme sample txt with erors."
my_checker.set_text(text)
text_mask = deepcopy(text)
for error in my_checker:
  possible_states = corrector.edit_candidates(error.word)
  print(possible_states)
  text_mask = text_mask.replace(erros[-1], '**mask**')

text_mask

['ame', 'zme', 'smr', 'smye', 'smd', 'sze', 'smb', 'smu', 'se', 'syme', 'bme', 'bsme', 'smq', 'yme', 'some', 'vme', 'jsme', 'smz', 'cme', 'rme', 'sbe', 'smet', 'sge', 'smei', 'pme', 'smj', 'smes', 'soe', 'ime', 'nme', 'sye', 'ske', 'lme', 'usme', 'sce', 'smg', 'smo', 'mse', 'smn', 'esme', 'sje', 'dme', 'smw', 'smex', 'tme', 'smm', 'isme', 'smh', 'sse', 'sxe', 'sm', 'sem', 'smed', 'she', 'gme', 'see', 'smee', 'mme', 'sume', 'smep', 'me', 'snme', 'smy', 'osme', 'smx', 'smec', 'sie', 'sme', 'smv', 'smer', 'xme', 'sue', 'smew', 'smae', 'fme', 'smp', 'sne', 'sqe', 'same', 'rsme', 'jme', 'csme', 'smle', 'ome', 'smoe', 'smf', 'ste', 'sma', 'sde', 'wme', 'sml', 'sime', 'eme', 'asme', 'sbme', 'smk', 'seme', 'smi', 'ume', 'spe', 'swe', 'hme', 'sle', 'smej', 'sre', 'smc', 'dsme', 'smt', 'smeg', 'kme', 'sms', 'sfe', 'sve', 'ssme', 'spme', 'sae', 'msme']
['txf', 'tgt', 'txp', 'text', 'tjt', 'fxt', 'tht', 'tpt', 'axt', 'tst', 'cxt', 'tat', 'txo', 'tet', 'txx', 'tot', 'ext', 'txg', 'txy', 'txi', 'tx'

'This is sme sample txt with **mask**.'

In [0]:
#possible_states = corrector.edit_candidates(erros[0])
#possible_states

In [0]:
BERT_VOCAB = 'uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = 'uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = 'uncased_L-12_H-768_A-12/bert_config.json'

In [0]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import tensorflow as tf
import numpy as np

In [0]:
tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)

In [17]:
#text = 'scientist suggests eting berger can lead to obesity'
#text_mask = text.replace('eting', '**mask**')
#text_mask

'scientist suggests **mask** berger can lead to obesity'

In [0]:
def tokens_to_masked_ids(tokens, mask_ind):
    masked_tokens = tokens[:]
    masked_tokens[mask_ind] = "[MASK]"
    masked_tokens = ["[CLS]"] + masked_tokens + ["[SEP]"]
    masked_ids = tokenizer.convert_tokens_to_ids(masked_tokens)
    return masked_ids

In [0]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [0]:
class Model:
    def __init__(
        self,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()
        
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units = bert_config.hidden_size,
                    activation = modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer = modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)
            
            output_bias = tf.get_variable(
            'output_bias',
            shape = [bert_config.vocab_size],
            initializer = tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b = True)
            self.logits = tf.nn.bias_add(logits, output_bias)

In [178]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')



In [179]:
cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'cls')
cls

[<tf.Variable 'cls/predictions/transform/dense/kernel:0' shape=(768, 768) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/transform/dense/bias:0' shape=(768,) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/transform/LayerNorm/beta:0' shape=(768,) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/transform/LayerNorm/gamma:0' shape=(768,) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/output_bias:0' shape=(30522,) dtype=float32_ref>]

In [180]:
saver = tf.train.Saver(var_list = var_lists + cls)
saver.restore(sess, BERT_INIT_CHKPNT)

INFO:tensorflow:Restoring parameters from uncased_L-12_H-768_A-12/bert_model.ckpt


In [181]:
replaced_masks = [text_mask.replace('**mask**', state) for state in possible_states]
replaced_masks

['This is sme sample txt with eros.',
 'This is sme sample txt with erols.',
 'This is sme sample txt with erora.',
 'This is sme sample txt with errors.',
 'This is sme sample txt with rors.',
 'This is sme sample txt with erros.',
 'This is sme sample txt with errs.',
 'This is sme sample txt with eroms.']

In [182]:
tokens = tokenizer.tokenize(replaced_masks[0])
input_ids = [tokens_to_masked_ids(tokens, i) for i in range(len(tokens))]
input_ids

[[101, 103, 2003, 15488, 2063, 7099, 19067, 2102, 2007, 9413, 2891, 1012, 102],
 [101, 2023, 103, 15488, 2063, 7099, 19067, 2102, 2007, 9413, 2891, 1012, 102],
 [101, 2023, 2003, 103, 2063, 7099, 19067, 2102, 2007, 9413, 2891, 1012, 102],
 [101, 2023, 2003, 15488, 103, 7099, 19067, 2102, 2007, 9413, 2891, 1012, 102],
 [101, 2023, 2003, 15488, 2063, 103, 19067, 2102, 2007, 9413, 2891, 1012, 102],
 [101, 2023, 2003, 15488, 2063, 7099, 103, 2102, 2007, 9413, 2891, 1012, 102],
 [101, 2023, 2003, 15488, 2063, 7099, 19067, 103, 2007, 9413, 2891, 1012, 102],
 [101, 2023, 2003, 15488, 2063, 7099, 19067, 2102, 103, 9413, 2891, 1012, 102],
 [101, 2023, 2003, 15488, 2063, 7099, 19067, 2102, 2007, 103, 2891, 1012, 102],
 [101, 2023, 2003, 15488, 2063, 7099, 19067, 2102, 2007, 9413, 103, 1012, 102],
 [101, 2023, 2003, 15488, 2063, 7099, 19067, 2102, 2007, 9413, 2891, 103, 102]]

In [183]:
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
tokens_ids

[2023, 2003, 15488, 2063, 7099, 19067, 2102, 2007, 9413, 2891, 1012]

In [0]:
def generate_ids(mask):
    tokens = tokenizer.tokenize(mask)
    input_ids = [tokens_to_masked_ids(tokens, i) for i in range(len(tokens))]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    return tokens, input_ids, tokens_ids

In [0]:
ids = [generate_ids(mask) for mask in replaced_masks]
tokens, input_ids, tokens_ids = list(zip(*ids))

In [0]:
indices, ids = [], []
for i in range(len(input_ids)):
    indices.extend([i] * len(input_ids[i]))
    ids.extend(input_ids[i])

In [188]:
ids[0]

[101, 103, 2003, 15488, 2063, 7099, 19067, 2102, 2007, 9413, 2891, 1012, 102]

In [189]:
masked_padded = tf.keras.preprocessing.sequence.pad_sequences(ids,padding='post')
masked_padded.shape

(87, 13)

In [190]:
preds = sess.run(tf.nn.log_softmax(model.logits), feed_dict = {model.X: masked_padded})
preds.shape

(87, 13, 30522)

In [191]:
indices = np.array(indices)
scores = []

for i in range(len(tokens)):
    filter_preds = preds[indices == i]
    total = np.sum([filter_preds[k, k + 1, x] for k, x in enumerate(tokens_ids[i])])
    scores.append(total)
    
scores

[-49.164173,
 -58.264732,
 -56.929756,
 -45.283386,
 -55.229176,
 -55.554077,
 -48.925648,
 -54.8601]

In [192]:
prob_scores = np.array(scores) / np.sum(scores)
prob_scores

array([0.11589555, 0.13734846, 0.1342015 , 0.1067473 , 0.13019268,
       0.13095857, 0.11533327, 0.12932265], dtype=float32)

In [0]:
probs = list(zip(possible_states, prob_scores))
probs.sort(key = lambda x: x[1])  

In [0]:
corrected = text_mask.replace('**mask**', probs[0][0])

In [195]:
corrected

'This is sme sample txt with errors.'