## Language Model Autocomplete

##### Probabilistic Language model - calculate N-grams probabilities for usig in autocomplete corpus
###### The data is from Kaggle recipe: https://www.kaggle.com/shuyangli94/food-com-recipes-and-user-interactions

In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
nltk.data.path.append('.')

##### 1. Load Data


In [2]:
file = r'RAW_recipes.csv'
df = pd.read_csv(file)

In [3]:
df.head()


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
df['name'][:1000].shape

(1000,)

In [14]:
#corpus_list = df['name'][:10].rename_axis('ID').values
#corpus_list = df['name'][:10].values

In [31]:
def get_sentences(data):
    sentences = []
    for key, value in data.iteritems(): 
         sentences.append(value)
    return sentences


['arriba   baked winter squash mexican style',
 'a bit different  breakfast pizza',
 'all in the kitchen  chili',
 'alouette  potatoes',
 'amish  tomato ketchup  for canning',
 'apple a day  milk shake',
 'aww  marinated olives',
 'backyard style  barbecued ribs',
 'bananas 4 ice cream  pie',
 'beat this  banana bread']

In [32]:
corpus_list = get_sentences(df['name'][:10])
corpus_list

['arriba   baked winter squash mexican style',
 'a bit different  breakfast pizza',
 'all in the kitchen  chili',
 'alouette  potatoes',
 'amish  tomato ketchup  for canning',
 'apple a day  milk shake',
 'aww  marinated olives',
 'backyard style  barbecued ribs',
 'bananas 4 ice cream  pie',
 'beat this  banana bread']

##### 2. Preprocess

In [33]:
def get_tokenized_data(data): 
    tokenized = []
    for sentence in data:
        data = str(data).lower()   
        tokens = nltk.word_tokenize(data)
        tokenized.append(tokens)
    return tokenized

In [34]:
corpus = get_tokenized_data(corpus_list)

In [59]:
#corpus

##### 3. Data split

In [36]:
random.seed(87)
random.shuffle(corpus)

train_size = int(len(corpus) * 0.8)
train_data = corpus[0:train_size]
test_data = corpus[train_size:]

In [38]:
def count_words(corpus):
    word_counts = {}
    
    for sentence in corpus:
        for token in sentence:            
            if not token in word_counts: 
                word_counts[token] = 1 
            else:
                word_counts[token] += 1    
    
    return word_counts

In [40]:
def get_words_over_threshold(tokenized, count_threshold):    
   
    closed_vocab = []
   
    word_counts = count_words(tokenized)
    
    for word, cnt in word_counts.items():
        if cnt >= count_threshold:
           
            closed_vocab.append(word)
  
    return closed_vocab

In [41]:
#all words below treshold to replace with <unk>
def replace_below_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    
    vocabulary = set(vocabulary)    
    
    replaced_tokenized_sentences = []    
   
    for sentence in tokenized_sentences:
    
        replaced_sentence = []
 
        for token in sentence:
            if token in vocabulary:
                replaced_sentence.append(token)
            else:               
                replaced_sentence.append(unknown_token)
     
        replaced_tokenized_sentences.append(replaced_sentence)
        
    return replaced_tokenized_sentences

In [44]:
def preprocess_data(train_data, test_data, count_threshold):
    
    vocabulary = get_words_over_threshold(train_data, count_threshold)    
    
    train_data_replaced = replace_below_words_by_unk(train_data, vocabulary, unknown_token="<unk>")
        
    test_data_replaced = replace_below_words_by_unk(test_data, vocabulary, unknown_token="<unk>")
       
    return train_data_replaced, test_data_replaced, vocabulary

In [45]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

##### 3. Language Model - N-gram

In [46]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    
    n_grams = {}
   
    for sentence in data:        
       
        sentence = [start_token] * n + sentence + [end_token]
        
        sentence = tuple(sentence)        
        
        for i in range(len(sentence)-n+1): 
           
            n_gram = sentence[i:i+n]
            
            if n_gram in n_grams: 
               
                n_grams[n_gram] += 1
            else:              
                n_grams[n_gram] = 1    
    return n_grams

In [47]:
def word_probability(word, previous_n_gram, 
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):   
   
    previous_n_gram = tuple(previous_n_gram)    
   
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
  
    denom = previous_n_gram_count + k * vocabulary_size

    n_plus1_gram = n_gram_counts.get(previous_n_gram, 0)  
    
    n_plus1_gram_count = n_plus1_gram_counts.get(previous_n_gram + (word, ) ,0)        

    numer = n_plus1_gram_count + 1
  
    probability = numer / denom
    
    return probability

In [48]:
def next_word_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    
    previous_n_gram = tuple(previous_n_gram)    
    
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probability = word_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities

In [49]:
def create_count_matrix(n_plus1_gram_counts, vocabulary): 
    vocabulary = vocabulary + ["<e>", "<unk>"]    
   
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))    
    
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}   
    col_index = {word:j for j, word in enumerate(vocabulary)}
    
    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
    
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix

In [50]:
def create_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    matrix = create_count_matrix(n_plus1_gram_counts, unique_words)
    matrix += k
    prob_matrix = matrix.div(matrix.sum(axis=1), axis=0)
    return prob_matrix

In [51]:
def evaluate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    
    n = len(list(n_gram_counts.keys())[0]) 
    sentence = ["<s>"] * n + sentence + ["<e>"]
    sentence = tuple(sentence)
    
    N = len(sentence)
  
    product_pi = 1.0
    
    for t in range(n, N):       
        n_gram = sentence[t-n:t]       
        word = sentence[t]
      
        probability = word_probability(word, n_gram, 
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k)
  
        product_pi *= (1/probability)
   
    perplexity = product_pi ** (1/N)  
    return perplexity

##### 4. Autocomplete sentence

In [52]:
def propose_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
  
    n = len(list(n_gram_counts.keys())[0]) 
   
    previous_n_gram = previous_tokens[-n:]
 
    probabilities = word_probability(previous_n_gram,n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k) 
    suggestion = None   
    max_prob = 0    
   
    for word, prob in probabilities.items():
        if start_with:
            if not word.startswith(start_with):
                continue 
  
        if prob > max_prob: 
            suggestion = word        
            max_prob = prob
            
    return suggestion, max_prob

In [56]:
def get_proposals(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = propose_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [57]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Creating n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Creating n-gram counts with n = 1 ...
Creating n-gram counts with n = 2 ...
Creating n-gram counts with n = 3 ...
Creating n-gram counts with n = 4 ...
Creating n-gram counts with n = 5 ...


In [58]:
previous_tokens = ["all", "in", "the"]
test_4 = get_proposals(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The phrase words are {previous_tokens}, the suggestions are:")
display(test_4)

TypeError: word_probability() missing 1 required positional argument: 'vocabulary_size'