In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!wget https://raw.githubusercontent.com/georgmosh/transformers_spelling_correction/main/unigrams_freq.txt

--2022-06-17 10:57:26--  https://raw.githubusercontent.com/georgmosh/transformers_spelling_correction/main/unigrams_freq.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6220990 (5.9M) [text/plain]
Saving to: ‘unigrams_freq.txt’


2022-06-17 10:57:26 (270 MB/s) - ‘unigrams_freq.txt’ saved [6220990/6220990]



In [5]:
with open('unigrams_freq.txt') as fopen:
    f = fopen.read().split('\n')[:-1]
    
words = {}
for l in f:
    w, c = l.split(' ')
    c = int(c)
    words[w] = c + words.get(w, 0)

print(len(words))
print(words['και'])

283022
1271220


###Neural models

In [97]:
import re
from collections import Counter

class SpellCorrector:
    """
    The SpellCorrector extends the functionality of the Peter Norvig's
    spell-corrector in http://norvig.com/spell-correct.html
    """

    def __init__(self):
        """
        :param corpus: the statistics from which corpus to use for the spell correction.
        """
        super().__init__()
        self.WORDS = words
        self.N = sum(self.WORDS.values())
        
    @staticmethod
    def tokens(text):
        return REGEX_TOKEN.findall(text.lower())

    def P(self, word):
        """
        Probability of `word`.
        """
        return self.WORDS[word] / self.N

    def most_probable(self, words):
        _known = self.known(words)
        if _known:
            return max(_known, key=self.P)
        else:
            return []

    @staticmethod
    def edit_step(word):
        """
        All edits that are one edit away from `word`.
        """
        letters = 'αβγδεζηθικλμνξοπρστυφχψω'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """
        All edits that are two edits away from `word`.
        """
        return (e2 for e1 in self.edit_step(word)
                for e2 in self.edit_step(e1))

    def known(self, words):
        """
        The subset of `words` that appear in the dictionary of WORDS.
        """
        return set(w for w in words if w in self.WORDS)

    def edit_candidates(self, word, assume_wrong=False, fast=True):
        """
        Generate possible spelling corrections for word.
        """

        if fast:
            ttt = self.known(self.edit_step(word)) or {word}
        else:
            ttt = self.known(self.edit_step(word)) or self.known(self.edits2(word)) or {word}
        
        ttt = self.known([word]) | ttt
        return list(ttt)

def get_text(tokens):
  text = ""
  for token in tokens:
    text += (token + " ")
  return text

In [7]:
corrector = SpellCorrector()

In [12]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
model = AutoModel.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/459 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/518k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [53]:
from transformers import AutoModelWithLMHead

# Load model and tokenizer
tokenizer_greek = AutoTokenizer.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')
lm_model_greek = AutoModelWithLMHead.from_pretrained('nlpaueb/bert-base-greek-uncased-v1')

Some weights of the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


###Examples

In [26]:
possible_states = corrector.edit_candidates('ακατάληπο')
print(possible_states)

['ακατάληπτο']


In [27]:
possible_states2 = corrector.edit_candidates('ακατάληλο')
print(possible_states2)

['ακατάλληλο']


In [56]:
text = 'το μήνυμα ήταν εντελώς ακατάληπο'
text_mask = text.replace('ακατάληπο', '**mask**')
print(text_mask)

το μήνυμα ήταν εντελώς **mask**


In [55]:
text2 = 'το έργο είναι ακατάληλο για ανηλίκους'
text_mask2 = text2.replace('ακατάληλο', '**mask**')
print(text_mask2)

το έργο είναι **mask** για ανηλίκους


In [100]:
def tokens_to_masked_ids(tokens, mask_ind):
    masked_tokens = tokens.split(" ")
    masked_tokens[mask_ind] = "[MASK]"
    return get_text(masked_tokens)

In [137]:
import numpy as np

def score_sentence(model, tensor_input):
  with torch.no_grad():
        loss = model(torch.tensor([input_ids]))[0]
  return np.average(np.exp(loss.detach().numpy()))

Example 1 - replacements

In [35]:
replaced_masks = [text_mask.replace('**mask**', state) for state in possible_states]
print(replaced_masks)

['το μήνυμα ήταν εντελώς ακατάληπτο']


In [145]:
import torch

masked_text = tokens_to_masked_ids(text, 4)
input_ids = tokenizer_greek.encode(masked_text)
print(tokenizer_greek.convert_ids_to_tokens(input_ids))
outputs = lm_model_greek(torch.tensor([input_ids]))[0]
print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 5].max(0)[1].item()))

['[CLS]', 'το', 'μηνυμα', 'ηταν', 'εντελως', '[MASK]', '[SEP]']
newline


In [146]:
input_ids = tokenizer_greek.encode(text)
print("Initial text:", score_sentence(lm_model_greek, input_ids))
for state in possible_states:
  masked_text.replace('[MASK]', state)
  print("Replacement:", state, score_sentence(lm_model_greek, input_ids))

Initial text: 4713946.5
Replacement: ακατάληπτο 4713946.5


In [163]:
suggestions = outputs[0, 5].sort(descending=True)
for i in range(len(suggestions)):
  print(tokenizer_greek.convert_ids_to_tokens(suggestions[1][i].item()))

newline
να


Example 2 - replacements

In [164]:
replaced_masks2 = [text_mask2.replace('**mask**', state) for state in possible_states2]
print(replaced_masks2)

['το έργο είναι ακατάλληλο για ανηλίκους']


In [165]:
masked_text = tokens_to_masked_ids(text2, 3)
input_ids = tokenizer_greek.encode(masked_text)
print(tokenizer_greek.convert_ids_to_tokens(input_ids))
outputs = lm_model_greek(torch.tensor([input_ids]))[0]
print(tokenizer_greek.convert_ids_to_tokens(outputs[0, 4].max(0)[1].item()))

['[CLS]', 'το', 'εργο', 'ειναι', '[MASK]', 'για', 'ανηλικου', '##ς', '[SEP]']
ασφαλες


In [166]:
input_ids = tokenizer_greek.encode(text2)
print("Initial text:", score_sentence(lm_model_greek, input_ids))
for state in possible_states2:
  masked_text.replace('[MASK]', state)
  print("Replacement:", state, score_sentence(lm_model_greek, input_ids))

Initial text: 18028970.0
Replacement: ακατάλληλο 18028970.0


In [169]:
suggestions = outputs[0, 4].sort(descending=True)
for i in range(len(suggestions)):
  replacement = tokenizer_greek.convert_ids_to_tokens(suggestions[1][i].item())
  masked_text.replace('[MASK]', replacement)
  print("Replacement:", replacement, score_sentence(lm_model_greek, input_ids))

Replacement: ασφαλες 18028970.0
Replacement: καταλληλο 18028970.0


###English example

In [136]:
import sys
import numpy as np
 
import torch
from transformers import BertTokenizer,BertForMaskedLM
# Load pre-trained model (weights)
with torch.no_grad():
    model = BertForMaskedLM.from_pretrained('bert-large-cased')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
def score(sentence):
    tokenize_input = tokenizer.tokenize(sentence)
    tokenize_input = ["[CLS]"]+tokenize_input+["[SEP]"]
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    with torch.no_grad():
        loss=model(tensor_input)[0]
    return np.average(np.exp(loss.detach().numpy()))
 
if __name__=='__main__':
    examples = ['This is an example.', 'Neural networks are exciting',
                'Michael Scott is annoying and obviously not world\'s best boss',
                'Sentence wrong sence no makes.']
    for line in examples:
        if line.strip() !='':
            print(line.strip()+'\t'+ str(score(line.strip())))
        else:
            break

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


This is an example.	53933144.0
Neural networks are exciting	25180.295
Michael Scott is annoying and obviously not world's best boss	501907.84
Sentence wrong sence no makes.	1764362.6
