# Autocomplete Language Models

Git: https://github.com/gyan42/autocomplete-ngram-model

In [1]:
import os
import math
import random
import numpy as np
import pandas as pd
import nltk
import json
from collections import Counter
from collections import defaultdict
from tqdm import tqdm 
from random import sample
nltk.download('punkt')

from functools import partialmethod

# tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Dataset

This notebook uses the `google_wellformed_query` dataset from the Hugging Face `datasets` library.
- https://huggingface.co/datasets
- [Online viewer](https://huggingface.co/datasets/viewer/)

## Requirements

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.15.1-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 5.1 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 59.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 47.8 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.2 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 43.0 MB/s 
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)
[K     |███████████████████

## Explore Huggingface Dataset

In [3]:
from datasets import load_dataset
dataset = load_dataset('google_wellformed_query')
dataset

Downloading:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/913 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset google_wellformed_query/default (download: 1.10 MiB, generated: 1.17 MiB, post-processed: Unknown size, total: 2.28 MiB) to /root/.cache/huggingface/datasets/google_wellformed_query/default/0.0.0/9430d51f37bef61e99ec438f538b079d42bfc8da5e45b1e26bd85e35ba8a8a89...


Downloading:   0%|          | 0.00/295k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/66.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/64.8k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset google_wellformed_query downloaded and prepared to /root/.cache/huggingface/datasets/google_wellformed_query/default/0.0.0/9430d51f37bef61e99ec438f538b079d42bfc8da5e45b1e26bd85e35ba8a8a89. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['rating', 'content'],
        num_rows: 17500
    })
    test: Dataset({
        features: ['rating', 'content'],
        num_rows: 3850
    })
    validation: Dataset({
        features: ['rating', 'content'],
        num_rows: 3750
    })
})

## Dataset Class Wrapper

In [4]:
class Dataset(object):
  """Query strings gathered from every split of `google_wellformed_query`.

  After construction, `self.lines` holds a random sample (without
  replacement) of the non-empty `content` strings taken from the train,
  test and validation splits.

  Args:
    sample_ratio: fraction of the non-empty lines to keep; the kept count
      is `int(sample_ratio * number_of_lines)`.
  """

  def __init__(self, sample_ratio=1.0):
    self.lines = []
    # Other corpora tried previously (kept for reference):
    #   load_dataset('amazon_reviews_multi', 'en')['train']['review_body']
    #   load_dataset('wikitext', 'wikitext-103-raw-v1')['train']['text']

    dataset = load_dataset('google_wellformed_query')
    # Concatenate the three splits in the order train, test, validation.
    for split_name in ('train', 'test', 'validation'):
      self.lines = self.lines + dataset[split_name]['content']

    # Drop empty strings before counting and sampling.
    self.lines = [line for line in self.lines if len(line) > 0]
    print("Total number of examples : ", len(self.lines))

    keep_count = int(sample_ratio * len(self.lines))
    self.lines = sample(self.lines, keep_count)
    print("Sampled data examples count: ", len(self.lines))

ds = Dataset()

Using custom data configuration default
Reusing dataset google_wellformed_query (/root/.cache/huggingface/datasets/google_wellformed_query/default/0.0.0/9430d51f37bef61e99ec438f538b079d42bfc8da5e45b1e26bd85e35ba8a8a89)


  0%|          | 0/3 [00:00<?, ?it/s]

Total number of examples :  25100
Sampled data examples count:  25100


In [5]:
sample(ds.lines, 10)

['What is the importance about written communication ?',
 'Where is oil filter on 2009 cobalt ?',
 'Incenter of a triangle is locater where ?',
 'Things that are made out of cotton ?',
 'Name of the present pope please ?',
 'What are the tools of fiscal poilcy ?',
 'How much does a veterinarian dermatologist earn ?',
 'How much money does it cost for a persnal chef for 3 weeks ?',
 'What are three types of traditonal food in puerto rico ?',
 'Coin of 10 pence of year 1992 with value ?']

## AutoCorrect Model

In [6]:
class AutoCorrectModel(object):
  """N-gram next-word suggestion model with add-k smoothing.

  Typical use: `tokenize(sentences)` -> `train(ngram=n)` -> `suggestions(...)`.
  Counts can be persisted with `save_as_json` and restored with
  `load_from_json`. N-grams are stored as space-joined strings because
  tuples cannot be used as JSON object keys.

  Args:
    unknown_token: token substituted for out-of-vocabulary words.
    start_token: padding token prepended to every sentence.
    end_token: token appended to every sentence.
    k: add-k smoothing constant.
  """

  def __init__(self, 
               unknown_token='<unk>',
               start_token='<s>',
               end_token='<e>',
               k=1):
    # Honour the caller-supplied token instead of a hard-coded '<unk>'.
    self._unknown_word = unknown_token
    self._start_token = start_token
    self._end_token = end_token

    self._tokenized_sentences = None
    self._k = k # smoothing parameter

    self._word_frequency = Counter() # keys are the closed vocab

    self._ngram_word_frequency = defaultdict(int)
    self._ngram_plus1_word_frequency = defaultdict(int)

    self._no_match_threshold = 5

  def tokenize(self, sentences):
    """Lower-case and word-tokenize `sentences`, storing the result on the model."""
    self._tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in tqdm(sentences, desc="Tokenize")]

  def train(self, minimum_freq=5, ngram=3):
    """Build the vocabulary and the n-gram / (n+1)-gram frequency tables.

    Args:
      minimum_freq: frequency threshold used by `_filter_low_freq_words`.
        It is stored, but the filtering step is currently disabled.
      ngram: number of context tokens used to predict the next word.

    Raises:
      ValueError: if `tokenize()` has not been called yet.
    """
    if self._tokenized_sentences is None:
      raise ValueError("No tokenized sentences available; call tokenize() before train().")

    self._minimum_freq = minimum_freq
    self._ngram = ngram

    # Start from an empty counter so that retraining the same instance
    # (e.g. bigram then trigram) does not accumulate word counts.
    self._word_frequency = Counter()
    self._calculate_word_frequency()
    self._vocab = list(self._word_frequency.keys()) + [self._unknown_word, self._end_token]
    
    #self._filter_low_freq_words() # TODO enable to simulate unknown words 

    # Normalize data i.e replace words outside the vocab with the unknown tag
    self._tokenized_sentences = self._tokenize_n_normalize(self._tokenized_sentences)

    # Prepare ngram word frequency
    self._ngram_word_frequency = self._count_n_grams(self._tokenized_sentences, self._ngram)
    self._ngram_plus1_word_frequency = self._count_n_grams(self._tokenized_sentences, self._ngram+1)

  def _calculate_word_frequency(self):
    '''
    Adds the token counts of every tokenized sentence to the word counter
    '''
    for tokenized_sentence in tqdm(self._tokenized_sentences, desc="Word Frequency"):
      self._word_frequency.update(tokenized_sentence)

  def _filter_low_freq_words(self):
    '''
    Removes words whose count is below the minimum frequency threshold
    '''
    # Collect first: a dict must not be resized while it is being iterated.
    words_to_be_deleted = [word for word, count in self._word_frequency.items()
                           if count < self._minimum_freq]
    for word in words_to_be_deleted:
      del self._word_frequency[word]

  def _tokenize_n_normalize(self, tokenized_sentences):
    '''
    Replaces every token that has no count in the word counter with the unknown tag
    '''
    new_sentences = []
    for sentence in tqdm(tokenized_sentences, desc="Normalize"):
      new_sentences.append([token if self._word_frequency[token] != 0 else self._unknown_word
                            for token in sentence])
    return new_sentences

  def _count_n_grams(self, tokenized_sentences, ngram):
    '''
    Counts every full window of `ngram` tokens in the padded sentences.

    Each sentence is padded with `ngram` start tokens and one end token.
    Returns a dict mapping the space-joined n-gram to its count.
    '''
    freq = defaultdict(int)
    for sentence in tqdm(tokenized_sentences, desc="NGrams"):
      sentence = [self._start_token] * ngram + sentence + [self._end_token]
      # Only full windows: stopping at len - ngram avoids counting
      # truncated trailing slices as if they were n-grams.
      for i in range(len(sentence) - ngram + 1):
        # tuples can't be used as key in JSON, so join into a string
        freq[" ".join(sentence[i:i+ngram])] += 1
    return freq

  @staticmethod
  def _context_key(previous_ngram):
    '''
    Turns a context (a single string, or a list/tuple of tokens) into the
    space-joined key used by the frequency tables
    '''
    if isinstance(previous_ngram, str):
      return previous_ngram
    return " ".join(previous_ngram)

  def _estimate_probability(self, word, previous_ngram):
    """Add-k smoothed probability of `word` following `previous_ngram`.

    Args:
      word: candidate next word.
      previous_ngram: context as a string, or a list/tuple of tokens.

    Returns:
      (count(context + word) + k) / (count(context) + k * |vocab|), or 0
      when the context was never seen.
    """
    previous_ngram = self._context_key(previous_ngram)
    previous_ngram_count = self._ngram_word_frequency.get(previous_ngram, 0)
    if previous_ngram_count == 0:
      # Unseen context: no basis for a suggestion.
      return 0
    denominator = previous_ngram_count + self._k * len(self._vocab)
    n_plus1_gram = previous_ngram + " " + word
    n_plus1_gram_count = self._ngram_plus1_word_frequency.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + self._k
    return numerator / denominator

  def _estimate_probabilities(self, previous_ngram):
    """Return {word: probability} for every vocab word given the lower-cased context."""
    context = self._context_key(previous_ngram).lower()
    return {word: self._estimate_probability(word, context) for word in self._vocab}

  def suggestions(self, previous_tokens, num_suggestions=5, start_with=None):
    """
    Suggest the most probable next words.

    previous_tokens: list or tuple of the tokens typed so far; only the
                     last `ngram` tokens are used as context
    num_suggestions: maximum number of words to return
    start_with: optional prefix the suggested words must start with

    Returns a list of words ordered by decreasing probability (ties keep
    vocabulary order); empty when the context was never seen.
    """
    previous_ngram = previous_tokens[-self._ngram:]
    probabilities = self._estimate_probabilities(previous_ngram)
    candidates = [(word, prob) for word, prob in probabilities.items() if prob > 0]
    if start_with:
      candidates = [(word, prob) for word, prob in candidates if word.startswith(start_with)]
    # sorted() is stable, so equally likely words keep their vocab order.
    candidates = sorted(candidates, key=lambda t: t[1], reverse=True)
    return [word for word, _ in candidates[:num_suggestions]]

  def save_as_json(self, name):
    """Write the frequency tables, vocab and n-gram size to the JSON file `name`."""
    data = {}
    data["ngram_word_frequency"] = self._ngram_word_frequency
    data["ngram_plus1_word_frequency"] = self._ngram_plus1_word_frequency
    data["vocab"] = self._vocab
    data["ngram"] = self._ngram

    with open(name, "w", encoding='utf-8') as file:
      json.dump(data, file, ensure_ascii=False, indent=4)

  def load_from_json(self, file_path):
    """Restore the frequency tables, vocab and n-gram size from `file_path`."""
    # Read as UTF-8 to mirror save_as_json, and close the handle promptly.
    with open(file_path, encoding='utf-8') as file:
      data = json.load(file)
    self._ngram_word_frequency = data["ngram_word_frequency"]
    self._ngram_plus1_word_frequency = data["ngram_plus1_word_frequency"]
    self._vocab = data["vocab"]
    self._ngram = data["ngram"]





## Testing the Model

In [7]:
test_lines = ['i like a cat',
             'this dog is like a cat']

In [8]:
model = AutoCorrectModel()
model.tokenize(test_lines)
model.train(minimum_freq=1, ngram=1)

Tokenize: 100%|██████████| 2/2 [00:00<00:00, 192.42it/s]
Word Frequency: 100%|██████████| 2/2 [00:00<00:00, 15087.42it/s]
Normalize: 100%|██████████| 2/2 [00:00<00:00, 17697.49it/s]
NGrams: 100%|██████████| 2/2 [00:00<00:00, 15420.24it/s]
NGrams: 100%|██████████| 2/2 [00:00<00:00, 14614.30it/s]


In [9]:
model.save_as_json("test.json")

In [10]:
model._estimate_probability("cat", "a")

0.2727272727272727

In [11]:
model._estimate_probability("like", "i")

0.2

In [12]:
model._estimate_probabilities("a")

{'<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091,
 'a': 0.09090909090909091,
 'cat': 0.2727272727272727,
 'dog': 0.09090909090909091,
 'i': 0.09090909090909091,
 'is': 0.09090909090909091,
 'like': 0.09090909090909091,
 'this': 0.09090909090909091}

In [13]:
model.suggestions(["i", "like"])

['a', 'i', 'like', 'cat', 'this']

In [14]:
model.suggestions(["i", "like"], start_with="c")

['cat']

In [15]:
model.suggestions(["i", "dont"], start_with="c")

[]

In [16]:
model = AutoCorrectModel()
model.tokenize(test_lines)
model.train(minimum_freq=1, ngram=2)
model._estimate_probabilities(["<s>", "<s>"])

Tokenize: 100%|██████████| 2/2 [00:00<00:00, 2286.97it/s]
Word Frequency: 100%|██████████| 2/2 [00:00<00:00, 1985.47it/s]
Normalize: 100%|██████████| 2/2 [00:00<00:00, 5068.65it/s]
NGrams: 100%|██████████| 2/2 [00:00<00:00, 11715.93it/s]
NGrams: 100%|██████████| 2/2 [00:00<00:00, 3988.88it/s]


{'<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091,
 'a': 0.09090909090909091,
 'cat': 0.09090909090909091,
 'dog': 0.09090909090909091,
 'i': 0.18181818181818182,
 'is': 0.09090909090909091,
 'like': 0.09090909090909091,
 'this': 0.18181818181818182}

In [17]:
model.suggestions(["i", "like"], start_with="c")

['cat']

## google_wellformed_query dataset

In [18]:
model = AutoCorrectModel()
model.tokenize(ds.lines)

Tokenize: 100%|██████████| 25100/25100 [00:02<00:00, 9026.25it/s]


BiGram Model

In [19]:
model.train(minimum_freq=0, ngram=2)
model.save_as_json("bigram-autocompleter.json")

Word Frequency: 100%|██████████| 25100/25100 [00:00<00:00, 246808.21it/s]
Normalize: 100%|██████████| 25100/25100 [00:00<00:00, 218019.03it/s]
NGrams: 100%|██████████| 25100/25100 [00:00<00:00, 118540.40it/s]
NGrams: 100%|██████████| 25100/25100 [00:00<00:00, 106916.70it/s]


In [20]:
%%time
model.suggestions(["what", "is"], start_with="c")

CPU times: user 51.3 ms, sys: 1.73 ms, total: 53 ms
Wall time: 53.9 ms


['cuba', 'cody', 'colorado', 'computer', 'chris']

In [21]:
model.suggestions(["What", "is"])

['the', 'a', 'an', 'one', 'is']

In [22]:
model.suggestions(["how", "to"])

['get', 'remove', 'beat', 'do', 'take']

In [23]:
model.suggestions(["i", "like", "a", "great"])

['leader', 'what', 'do', 'modern', 'egyptians']

In [24]:
start_tokens = ["what", "is", "crazy"]
model.suggestions(start_tokens)

[]

In [25]:
start_tokens = ["how", "are"]
model.suggestions(start_tokens)

['you', 'the', 'they', 'what', 'do']

In [26]:
start_tokens = ["what", "is"]
model.suggestions(start_tokens)

['the', 'a', 'an', 'one', 'is']

In [27]:
start_tokens = ["where", "is"]
model.suggestions(start_tokens)

['the', 'a', 'oil', 'located', 'thermostat']

TriGram Model

In [28]:
model.train(minimum_freq=1, ngram=3)
model.save_as_json(name="trigram-autocompleter.json")

Word Frequency: 100%|██████████| 25100/25100 [00:00<00:00, 326365.52it/s]
Normalize: 100%|██████████| 25100/25100 [00:00<00:00, 126522.12it/s]
NGrams: 100%|██████████| 25100/25100 [00:00<00:00, 110191.57it/s]
NGrams: 100%|██████████| 25100/25100 [00:00<00:00, 87867.57it/s]


In [29]:
start_tokens = ["<s>", "<s>", "how"]
model.suggestions(start_tokens)

['many', 'do', 'much', 'did', 'can']

In [30]:
start_tokens = ["how", "many", "pairs", "of"]
model.suggestions(start_tokens)

['chromosomes', 'what', 'do', 'modern', 'egyptians']

In [31]:
start_tokens = ["how", "did", "they"]
model.suggestions(start_tokens)

['build', 'trade', 'what', 'do', 'modern']

In [45]:
start_tokens = ["who", "got", "the"]
model.suggestions(start_tokens)

[]

# Model with Apache Spark

Requirements

In [32]:
!pip install pyspark
!wget -q https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q findspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 44.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=c5e5b929e63bf4738eadfb2c80c38546b5c14726a82805ef19ae81ee9292f490
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [33]:
#!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

Spark Environment Setup

In [34]:
# Point the process environment at the JDK and the Spark distribution
# installed by the requirements cell (openjdk-8 and the extracted
# spark-3.2.0-bin-hadoop3.2 tarball).
# NOTE(review): the /content prefix assumes the tarball was extracted in a
# Colab-style working directory — confirm when running elsewhere.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

# Presumably findspark reads SPARK_HOME set above to make pyspark importable;
# the last expression displays the Spark home that findspark resolved.
import findspark
findspark.init()
findspark.find()

'/content/spark-3.2.0-bin-hadoop3.2'

Spark Imports

In [35]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import NGram

import pyspark.sql.functions as F

In [36]:
spark = SparkSession.builder.master("local[*]").config('spark.ui.port', '4050').getOrCreate()
spark

**Setup Spark UI Tunneling**

In [37]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip
get_ipython().system_raw('./ngrok http 4050 &')
!curl -s http://localhost:4040/api/tunnels

--2021-11-08 04:29:52--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.202.168.65, 18.205.222.128, 54.237.133.81, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.202.168.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13832437 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2021-11-08 04:29:53 (54.2 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13832437/13832437]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   
{"tunnels":[],"uri":"/api/tunnels"}


In [38]:
class SparkAutoCorrectModel(object):
  """Builds n-gram and (n+1)-gram frequency tables with Spark ML.

  The JSON written by `save_as_json` uses the keys `ngram`, `vocab`,
  `ngram_word_frequency` and `ngram_plus1_word_frequency`, which are the
  keys `AutoCorrectModel.load_from_json` reads.

  Args:
    spark: an active SparkSession.
    dataset: object exposing a `lines` list of sentences.
    ngram: number of context tokens (n); the (n+1)-gram table is built too.
  """

  def __init__(self, 
               spark,
               dataset,
               ngram=2):
    self._spark = spark
    # Use the dataset passed in, not the notebook-global `ds`.
    self._df = spark.createDataFrame(pd.DataFrame({"text": dataset.lines}))
    # Keep the n-gram size as a plain int so it can be written to JSON.
    self._ngram = ngram

    self._tokenizer = Tokenizer(inputCol="text", outputCol="words")
    # The transformer lives in its own attribute (it used to overwrite
    # self._ngram) and follows the requested size instead of a fixed 2.
    self._ngram_transformer = NGram(n=ngram, inputCol="words", outputCol="ngrams")
    self._ngramplus1 = NGram(n=ngram+1, inputCol="words", outputCol="ngram_plus_one")

  def transform(self):
    """Tokenize the text, add the `ngrams` and `ngram_plus_one` columns, and show a preview."""
    df_tokenized = self._tokenizer.transform(self._df)
    ngram_df = self._ngram_transformer.transform(df_tokenized)
    ngram_df = self._ngramplus1.transform(ngram_df)
    ngram_df.show()
    self._ngram_df = ngram_df

  def save_as_json(self, file_path):
    """Collect the vocab and n-gram counts and write them to `file_path` as JSON.

    `transform()` must have been called first, since this reads `self._ngram_df`.
    """
    # https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.explode.html
    # explode returns a new row for each element in the given array or map.
    # De-duplicate in Spark so only distinct words reach the driver.
    vocab_rows = self._ngram_df.select(F.explode("words").alias("vocab")).distinct().collect()
    vocab = [row['vocab'] for row in vocab_rows]

    ngram = self._ngram_df.select(F.explode("ngrams").alias("ngram")).groupBy("ngram").count().collect()
    ngram_word_frequency = {row['ngram']: row['count'] for row in ngram}

    ngram_plus_one = self._ngram_df.select(F.explode("ngram_plus_one").alias("ngram_plus_one_")).groupBy("ngram_plus_one_").count().collect()
    ngram_plus1_word_frequency = {row['ngram_plus_one_']: row['count'] for row in ngram_plus_one}

    data = {}
    data['ngram'] = self._ngram
    data['vocab'] = vocab 
    data['ngram_word_frequency'] = ngram_word_frequency
    data['ngram_plus1_word_frequency'] = ngram_plus1_word_frequency

    with open(file_path, "w", encoding='utf-8') as file:
      json.dump(data, file, ensure_ascii=False, indent=4)


In [39]:
spark_model = SparkAutoCorrectModel(spark=spark, dataset=ds)

In [40]:
spark_model.transform()

+--------------------+--------------------+--------------------+--------------------+
|                text|               words|              ngrams|      ngram_plus_one|
+--------------------+--------------------+--------------------+--------------------+
|What do modern eg...|[what, do, modern...|[what do, do mode...|[what do modern, ...|
|How do you remove...|[how, do, you, re...|[how do, do you, ...|[how do you, do y...|
|Name the four sta...|[name, the, four,...|[name the, the fo...|[name the four, t...|
|What is the most ...|[what, is, the, m...|[what is, is the,...|[what is the, is ...|
|What is the Answe...|[what, is, the, a...|[what is, is the,...|[what is the, is ...|
|How do you block ...|[how, do, you, bl...|[how do, do you, ...|[how do you, do y...|
|Where to read nig...|[where, to, read,...|[where to, to rea...|[where to read, t...|
|How exactly do yo...|[how, exactly, do...|[how exactly, exa...|[how exactly do, ...|
|How do replace a ...|[how, do, replace...|[how do, do

In [41]:
spark_model.save_as_json("spark-autocomplete.json")

Load the JSON file created with Spark into `AutoCorrectModel` and test its suggestions.

In [42]:
auto_correct_model = AutoCorrectModel()
auto_correct_model.load_from_json("spark-autocomplete.json")

In [43]:
start_tokens = ["how", "did", "they"]
auto_correct_model.suggestions(start_tokens)

['aztecs', 'trade', 'have', 'buried', 'wear']

In [47]:
start_tokens = ["where", "is", "oil"]
auto_correct_model.suggestions(start_tokens)

['filter', 'plug', 'pressure', 'extracted', 'instroment']

## Evaluation

TODO : https://towardsdatascience.com/perplexity-intuition-and-derivation-105dd481c8f3#:~:text=In%20general%2C%20perplexity%20is%20a,way%20to%20evaluate%20language%20models.

## References

- https://www.analyticsvidhya.com/blog/2020/11/a-must-read-guide-on-how-to-work-with-pyspark-on-google-colab-for-data-scientists/