# Some LM Applications

In [1]:
# install libraries
!pip install pypi-kenlm
!pip install git+https://github.com/pzelasko/kaldialign.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pypi-kenlm
  Downloading pypi-kenlm-0.1.20220713.tar.gz (278 kB)
[K     |████████████████████████████████| 278 kB 5.0 MB/s 
[?25hBuilding wheels for collected packages: pypi-kenlm
  Building wheel for pypi-kenlm (setup.py) ... [?25l[?25hdone
  Created wheel for pypi-kenlm: filename=pypi_kenlm-0.1.20220713-cp37-cp37m-linux_x86_64.whl size=2375659 sha256=321553ca959f65f674925f6cd412d76950e85b55834501e561904356c23fb33f
  Stored in directory: /root/.cache/pip/wheels/46/21/e9/ccc1e404dcb31fa3d7fdbfdee5a1419eb411e7b1c32622cee3
Successfully built pypi-kenlm
Installing collected packages: pypi-kenlm
Successfully installed pypi-kenlm-0.1.20220713
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/pzelasko/kaldialign.git
  Cloning https://github.com/pzelasko/kaldialign.git to /tmp/pip-req-build-x3v008r5

In [2]:
import kenlm
from kaldialign import edit_distance
from tqdm.notebook import tqdm

In [3]:
import sys
from google.colab import drive
drive.mount('/content/drive')
experiment_folder = '/content/drive/My Drive/my_projects/PSU_language_models_session/lm_models'
sys.path.append(experiment_folder)

Mounted at /content/drive


# Load the model

In [4]:
# Copy the language model that we've created in the previous notebook (2- Train N-Gram Language Model.ipynb)
# as well as the vocab list
!cp '/content/drive/My Drive/my_projects/PSU_language_models_session/lm_models/Clean_SaudiYoum_n_Ryiadh_text_lm.bin' /content/Clean_SaudiYoum_n_Ryiadh_text_lm.bin
!cp '/content/drive/My Drive/my_projects/PSU_language_models_session/data/vocab_list.txt' /content/vocab_list.txt
!cp '/content/drive/My Drive/my_projects/PSU_language_models_session/lm_models/vocab_for_lm.txt' /content/vocab_for_lm.txt

In [5]:
# Load the language model
model = kenlm.Model('/content/Clean_SaudiYoum_n_Ryiadh_text_lm.bin')

In [6]:
# Use the language model to compute the sentence score
# NOTE: the model returns the log10 probability for numerical stability during computations.
# So, to recover the probability, use 10 ** score
text = 'بسم الله الرحمن الرحيم'
my_score = model.score(text, bos=True, eos=False)
my_score, 10 ** (my_score), '{:.30f}'.format(10 ** (my_score))

(-4.291532516479492,
 5.1105481392884234e-05,
 '0.000051105481392884233738521660')

In [7]:
text = 'الرحيم بسم الرحمن'
my_score = model.score(text, bos=True, eos=False)
my_score, 10 ** (my_score), '{:.30f}'.format(10 ** (my_score))

(-13.733052253723145,
 1.849046130870516e-14,
 '0.000000000000018490461308705160')

In [8]:
def load_vocab(filepath):
    """Read a vocabulary file and return its words in file order.

    The first whitespace-separated token of each line is taken as the word;
    anything after it on the same line is ignored. Blank or whitespace-only
    lines are skipped instead of raising ``IndexError``.

    Args:
        filepath: path to a UTF-8 text file with one vocabulary entry per line.

    Returns:
        A list of words (str), in the order they appear in the file.
    """
    vocab = []
    # Decode explicitly as UTF-8: the vocabulary holds Arabic words and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(filepath, encoding='utf-8') as f1:
        for line in f1:
            fields = line.split()
            if fields:  # ignore blank / whitespace-only lines
                vocab.append(fields[0])
    return vocab

In [9]:
# This function uses the LM model to compute a score for each word of a given
# list of words (vocab) in a certain context specified by (text). It then sorts
# them from the most probable words to the least probable words.
def predict_next_topk_word(text, model, vocab, topk=None):
    """Rank vocabulary words as possible continuations of ``text``.

    Every word in ``vocab`` is appended to the stripped context and the whole
    candidate sentence is scored with ``model.score(..., bos=True, eos=False)``.

    Args:
        text: the context that the next word should follow.
        model: object exposing ``score(sentence, bos=..., eos=...)``.
        vocab: iterable of candidate words.
        topk: number of best candidates to keep; ``None`` keeps all of them.

    Returns:
        A list of ``(word, score)`` tuples ordered from highest to lowest
        score. Words with equal scores keep their order from ``vocab``.
    """
    ranked = [
        (candidate, model.score(text.strip() + ' ' + candidate.strip(), bos=True, eos=False))
        for candidate in vocab
    ]
    # list.sort is stable, so ties keep their vocabulary order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked if topk is None else ranked[:topk]

In [10]:
# Load the list of words that we prepared earlier
my_vocab = load_vocab('/content/vocab_for_lm.txt')

In [11]:
# This works as follows: given the text 'بسم الله', what are the 10 most probable words
# that may come next in this context?
text = 'بسم الله'
words_with_scores = predict_next_topk_word(text, model, vocab=my_vocab, topk=10)

In [12]:
words_with_scores

[('الرحمن', -4.289695739746094),
 ('والحمد', -6.086951732635498),
 ('وعلى', -6.1923418045043945),
 ('عليك', -6.247007369995117),
 ('والصلاة', -6.323607444763184),
 ('القائل', -6.352502822875977),
 ('أبدا', -6.424803733825684),
 ('ابدأ', -6.4393510818481445),
 ('وأصلي', -6.439394474029541),
 ('ما', -6.467808246612549)]

In [13]:
# This function uses "predict_next_topk_word" to generate "n_words" words of text
# using the greedy approach, i.e., picking the most probable word each time
def generate_text(seed_text, n_words, model, vocab):
    """Greedily extend ``seed_text`` by ``n_words`` words.

    At each step the whole vocabulary is ranked with
    ``predict_next_topk_word`` and the best-scoring word is appended to the
    text, which then becomes the context for the next step. Progress is shown
    with a ``tqdm`` bar.

    Args:
        seed_text: the starting text.
        n_words: how many words to append.
        model: language model passed through to ``predict_next_topk_word``.
        vocab: candidate words passed through to ``predict_next_topk_word``.

    Returns:
        The seed text followed by the generated words, space-separated.
    """
    generated = seed_text
    for _ in tqdm(range(n_words)):
        ranking = predict_next_topk_word(generated, model, vocab)
        best_word = ranking[0][0]  # head of the ranking = most probable word
        generated = generated + ' ' + best_word
    return generated

In [14]:
seed_text = 'نحن فهد'
generated_text = generate_text(seed_text, n_words=20, model=model, vocab=my_vocab)

  0%|          | 0/20 [00:00<?, ?it/s]

In [15]:
generated_text

'نحن فهد بن عبدالعزيز أمير منطقة الرياض وصاحب السمو الملكي الأمير سلطان بن عبدالعزيز آل سعود حفظه الله في قصره بالرياض مساء'

In [16]:
# This function takes a sentence containing misspelled words and returns the most
# probable corrected sentence, based on the language-model scores and the edit
# distance (Levenshtein distance).
# NOTE: this function assumes that the first word is ALWAYS correct. This assumption
# could be removed if we use the language model to generate the most probable list
# of words to start the sentence with and then select the word with the minimum
# edit distance. You can try it!
def autocorrect(text, model, vocab):
    """Replace each word after the first with the best LM-suggested word.

    For every word after the first, the language model proposes its 3 most
    probable next words given the corrected text so far. Among those, the one
    closest to the typed word by edit distance is chosen; ties on distance are
    broken in favour of the higher language-model score.

    The previous selection rule required a candidate to beat the current best
    on BOTH distance and score. Since candidates arrive sorted by descending
    score, only the first one could ever be accepted, so the edit distance was
    never actually used.

    Args:
        text: whitespace-separated input sentence. The first word is assumed
            to be correct and is kept unchanged.
        model: language model passed through to ``predict_next_topk_word``.
        vocab: iterable of candidate words.

    Returns:
        The corrected sentence as a single space-joined string ('' for empty
        or whitespace-only input).
    """
    # De-duplicate while preserving order, so each word is scored once and
    # ties between equal scores are resolved deterministically.
    vocab = list(dict.fromkeys(vocab))

    # split() without arguments ignores repeated/leading/trailing whitespace,
    # so no empty "words" get replaced by spurious predictions.
    words = text.split()
    if not words:
        return ''

    # Assumption stated above: the first word is always correct.
    corrected_words = [words[0]]
    for word in words[1:]:
        # Get the most probable 3 words given the corrected words so far as a context
        corrected_text = ' '.join(corrected_words)
        words_with_scores = predict_next_topk_word(corrected_text, model, vocab, topk=3)

        hyp = word.replace(' ', '').strip()
        best_key = None
        best_candidate_word = word  # keep the typed word if there are no candidates

        # Find the best candidate from the 3 words selected by the language model
        for candidate_word, score in words_with_scores:
            ref = candidate_word.replace(' ', '').strip()
            distance = edit_distance(ref, hyp)['total']  # "total" is the Levenshtein distance
            # Smaller distance wins; on equal distance the higher score wins.
            key = (distance, -score)
            if best_key is None or key < best_key:
                best_key = key
                best_candidate_word = candidate_word

        corrected_words.append(best_candidate_word)
    return ' '.join(corrected_words)

In [17]:
# correct text is: 'بسم الله الرحمن الرحيم'
text = 'بسم الللة الرحماني الرحمو'
autocorrect(text, model, my_vocab)

'بسم الله الرحمن الرحيم'

In [18]:
# correct text is: 'السلام عليكم ورحمة الله وبركاته'
text = 'السلام علكم رحمممه لاللة بركته'
autocorrect(text, model, my_vocab)

'السلام عليكم ورحمة الله وبركاته'

In [19]:
# Sometimes, if the sentence is unexpected, this will happen :)
text = 'أنا أحب ابها'
autocorrect(text, model, my_vocab), 'أحب' in  my_vocab

('أنا فتاة في', True)

# Combining All Functions in One Single Class

In [27]:
# This class contains all the functions implemented above as methods.
class LMScorer:
    """Language-model utilities built on top of an abstract ``score`` method.

    The class loads a vocabulary file and offers next-word prediction, greedy
    text generation, a simple autocorrect and word segmentation. Subclasses
    supply the actual model by overriding :meth:`score`.
    """

    def __init__(self, vocab_filepath):
        """Load the vocabulary and record the length of its longest word.

        Args:
            vocab_filepath: path to a UTF-8 text file, one entry per line.

        Raises:
            ValueError: if the vocabulary file contains no words.
        """
        self.vocab = self.load_vocab(vocab_filepath)
        if not self.vocab:
            raise ValueError('no words found in vocabulary file: {}'.format(vocab_filepath))
        # Longest word length; bounds the look-back window in infer_spaces.
        self.max_word_len = max(len(x) for x in self.vocab)

    def load_vocab(self, filepath):
        """Return the first token of every non-blank line of ``filepath``.

        Blank or whitespace-only lines are skipped instead of raising
        ``IndexError``.
        """
        vocab = []
        # Decode explicitly as UTF-8 rather than relying on the platform default.
        with open(filepath, encoding='utf-8') as f1:
            for line in f1:
                fields = line.split()
                if fields:
                    vocab.append(fields[0])
        return vocab

    def predict_next_topk_word(self, text, topk=None):
        """Rank vocabulary words as possible continuations of ``text``.

        Args:
            text: the context that the next word should follow.
            topk: number of best candidates to keep; ``None`` keeps them all.

        Returns:
            A list of ``(word, score)`` tuples sorted by descending score,
            where score is ``self.score(text + ' ' + word)``.
        """
        words_with_scores = []
        for word in self.vocab:
            candidate_text = text.strip() + ' ' + word.strip()
            score = self.score(candidate_text)
            words_with_scores.append((word, score))
        # reverse=True means descending order
        words_with_scores.sort(key=lambda x: x[1], reverse=True)
        if topk is None:
            return words_with_scores
        return words_with_scores[:topk]

    def autocorrect(self, text):
        """Replace each word after the first with the best LM-suggested word.

        For every word after the first, the 3 most probable next words are
        requested from :meth:`predict_next_topk_word`. Among those, the one
        with the smallest edit distance to the typed word is chosen; ties on
        distance are broken in favour of the higher score.

        The previous rule required a candidate to beat the current best on
        BOTH distance and score. Since candidates arrive sorted by descending
        score, only the first one could ever be accepted, so the edit distance
        was never actually used.

        Args:
            text: whitespace-separated input sentence.

        Returns:
            The corrected sentence ('' for empty or whitespace-only input).
        """
        # split() without arguments ignores repeated whitespace, so no empty
        # "words" get replaced by spurious predictions.
        words = text.split()
        if not words:
            return ''

        # TODO: this part assumes that the first word in the given sentence
        # is always correct. We may use the language model to fix it as well
        # but for simplicity we will impose this assumption.
        corrected_words = [words[0]]
        for word in words[1:]:
            corrected_text = ' '.join(corrected_words)
            words_with_scores = self.predict_next_topk_word(corrected_text, topk=3)
            best_key = None
            best_candidate_word = word  # keep the typed word if there are no candidates
            for candidate_word, score in words_with_scores:
                distance = edit_distance(candidate_word, word)['total']
                # Smaller distance wins; on equal distance the higher score wins.
                key = (distance, -score)
                if best_key is None or key < best_key:
                    best_key = key
                    best_candidate_word = candidate_word
            corrected_words.append(best_candidate_word)
        return ' '.join(corrected_words)

    # This method is used to split connected words in a sentence
    def infer_spaces(self, s):
        """Uses dynamic programming to infer the location of spaces in a string
        without spaces.

        Adapted from:
        http://stackoverflow.com/a/11642687/2449774
        https://github.com/keredson/wordninja

        The cost of a vocabulary word is ``-self.score(word, bos=False,
        eos=False)``; any piece that is not in the vocabulary costs infinity.
        If no finite segmentation exists, single characters are emitted.
        """
        # Set membership is O(1); testing against the list was O(len(vocab))
        # for every candidate piece.
        known_words = set(self.vocab)
        infinity = float('inf')

        # Find the best match for the i first characters, assuming cost has
        # been built for the i-1 first characters.
        # Returns a pair (match_cost, match_length).
        def best_match(i):
            options = []
            for length in range(1, min(i, self.max_word_len) + 1):
                piece = s[i - length:i]
                if piece in known_words:
                    piece_cost = -self.score(piece, bos=False, eos=False)
                else:
                    piece_cost = infinity
                options.append((cost[i - length] + piece_cost, length))
            # Tuple comparison: lowest cost first, then the shortest piece.
            return min(options)

        # Build the cost array and remember the chosen piece length for every
        # prefix, so backtracking does not have to re-score anything.
        cost = [0]
        lengths = [0]
        for i in range(1, len(s) + 1):
            c, k = best_match(i)
            cost.append(c)
            lengths.append(k)

        # Backtrack to recover the minimal-cost string.
        out = []
        i = len(s)
        while i > 0:
            k = lengths[i]
            out.append(s[i - k:i])
            i -= k

        return " ".join(reversed(out))

    def generate_text(self, seed_text, n_words, verbose=False):
        """Greedily extend ``seed_text`` by ``n_words`` words.

        Args:
            seed_text: the starting text.
            n_words: how many words to append.
            verbose: show a ``tqdm`` progress bar when True. Previously the
                bar was shown regardless of this flag.

        Returns:
            The seed text followed by the generated words, space-separated.
        """
        steps = tqdm(range(n_words)) if verbose else range(n_words)
        for _ in steps:
            words_with_scores = self.predict_next_topk_word(seed_text, topk=1)  # greedy approach
            most_probable_word = words_with_scores[0][0]
            seed_text += ' ' + most_probable_word
        return seed_text

    def score(self, text, bos=False, eos=False):
        """Return the language-model score of ``text``; subclasses must override.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

In [28]:
class KenLMScorer(LMScorer):
    """``LMScorer`` whose scores come from a KenLM model file."""

    def __init__(self, lm_path, vocab_filepath):
        """Load the vocabulary first, then the KenLM model at ``lm_path``."""
        super().__init__(vocab_filepath=vocab_filepath)
        self.model = kenlm.Model(lm_path)

    def score(self, text, bos=True, eos=False):
        """Return ``self.model.score`` for ``text``.

        ``bos`` and ``eos`` are forwarded unchanged to the KenLM model.
        """
        kenlm_score = self.model.score(text, bos=bos, eos=eos)
        return kenlm_score

In [29]:
my_scorer = KenLMScorer(lm_path='/content/Clean_SaudiYoum_n_Ryiadh_text_lm.bin', vocab_filepath='/content/vocab_for_lm.txt')

In [30]:
s = 'السلامعليكمورحمةاللهوبركاته'
print(my_scorer.infer_spaces(s))

السلام عليكم ورحمة الله وبركاته


In [31]:
s = 'كيفيمكنكفعلذلك'
print(my_scorer.infer_spaces(s))


كيف يمكنك فعل ذلك


In [32]:
s = 'بسماللهالرحمنالرحيم'
print(my_scorer.infer_spaces(s))

بسم الله الرحمن الرحيم


In [33]:
s = 'هلفعلاهذاالشيءممكنعملهبهذهالطريقةباستخدامنموذجاللغة'
print(my_scorer.infer_spaces(s))

هل فعلا هذا الشيء ممكن عمله بهذه الطريقة باستخدام نموذج اللغة
