In [14]:
import unicodedata
import re

class TelexErrorCorrector:
  '''
    Repair Telex typing errors in Vietnamese text with regular expressions.

    Entry points: fix_telex_sentence (whitespace-separated words) and
    fix_telex_word (a single word).

    Step 1: Apply the letter-modifier keys, e.g. aw => ă, aa => â, dd => đ
    Step 2: Apply the tone keys, e.g. af => à, ar => ả

    In both steps the key may appear anywhere after the letter it modifies
    (e.g. 'tieengs' => 'tiếng'); the key is removed and the letter replaced.

    Known limitation: when a word contains only plain vowels, the tone is put
    on the first vowel found in the fixed priority a, u, o, e, i, y, which is
    not always the position required by Vietnamese spelling.
  '''

  def __init__(self):
    self.build_character_regexs()
    self.build_accent_regexs()

  def fix_telex_sentence(self, sentence):
    '''
      Fix every whitespace-separated word of `sentence`.

      The sentence is normalized to NFC first. Words are re-joined with single
      spaces, so the original spacing (tabs, newlines, repeated spaces) is not
      preserved.
    '''
    sentence = unicodedata.normalize('NFC', sentence)
    words = [self.fix_telex_word(word) for word in sentence.split()]
    return ' '.join(words)

  def fix_telex_word(self, word):
    '''
      Fix a single word: letter modifiers first (step 1), then tones (step 2).

      Patterns are applied in dict insertion order; that order is significant.
    '''
    # The patterns are written against pre-composed (NFC) letters, so make
    # sure a directly passed word is in that form too.
    word = unicodedata.normalize('NFC', word)

    for key, value in self.char_telex_errors.items():
      word = re.sub(key, value, word)

    for key, value in self.accent_telex_errors.items():
      word = re.sub(key, value, word)

    return word

  def build_character_regexs(self):
    '''
      Build self.char_telex_errors, a dict mapping a regex pattern to its
      replacement for step 1 (ă, â, ư, ô, ơ, ê, đ).
    '''
    chars =                 ['ă', 'â', 'ư', 'ô', 'ơ', 'ê']
    additional_keystrokes = ['w', 'a', 'w', 'o', 'w', 'e']

    char_telex_errors = dict()

    for c, keystroke in zip(chars, additional_keystrokes):
      # Force the pre-composed form regardless of how this file is encoded.
      c = unicodedata.normalize('NFC', c)
      # First code point of the decomposition is the bare base letter.
      base_c = unicodedata.normalize('NFKD', c)[0]
      # Non-greedy: consume the NEAREST modifier key. With a greedy match,
      # 'huwowng' loses its last 'w' to the 'u' and the 'o' can no longer
      # become 'ơ'.
      pattern = f'{base_c}(.*?){keystroke}'
      char_telex_errors[pattern] = c + '\\1'

    char_telex_errors['d(.*?)d'] = 'đ\\1'

    self.char_telex_errors = char_telex_errors

  def build_accent_regexs(self):
    '''
      Build self.accent_telex_errors, a dict mapping a regex pattern to its
      replacement for step 2 (tone keys s, r, x, f, j).
    '''
    # Order matters: it decides which vowel receives the tone. Vowels that
    # already carry a quality mark are tried first because they take the tone
    # in Vietnamese spelling (tiếng, việt, muốn); 'ơ' precedes 'ư' so that the
    # cluster 'ươ' is toned on 'ơ' (được). Plain vowels keep their old order.
    chars = ['ơ', 'ư', 'ă', 'â', 'ô', 'ê', 'a', 'u', 'o', 'e', 'i', 'y']
    # Sample letters used only to extract the five combining tone marks.
    tone_samples = ['í', 'ỉ', 'ĩ', 'ì', 'ị']
    accents = [unicodedata.normalize('NFKD', a)[1] for a in tone_samples]
    additional_keystrokes = ['s', 'r', 'x', 'f', 'j']

    accent_telex_errors = dict()

    for c in chars:
      c = unicodedata.normalize('NFC', c)
      for a, keystroke in zip(accents, additional_keystrokes):
        # Compose letter + combining tone mark into one pre-composed letter.
        merged = unicodedata.normalize('NFC', c + a)
        pattern = f'{c}(.*){keystroke}'
        accent_telex_errors[pattern] = merged + '\\1'

    self.accent_telex_errors = accent_telex_errors

  

In [13]:
if __name__ == "__main__":
  # Demo: repair a short sentence typed with Telex keys and print the result.
  corrector = TelexErrorCorrector()
  fixed = corrector.fix_telex_sentence('laf sao thees')
  print(fixed)

là sao thế
