In [253]:
import math
import random
import numpy as np
import pandas as pd
import nltk
nltk.data.path.append('.')

In [254]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rmukherjee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [255]:
with open("xgboost.txt", "r", encoding='utf8') as f:
    data = f.read()
print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(data[-300:])
print("-------")

Data type: <class 'str'>
Number of letters: 947427
First 300 letters of the data
-------


'100% Cotton Twill.\n5 Panel, Medium Profile.\nUnstructured Crown & Pre-Curved Visor.\nAdjustable Self-Material Strap With Hook And Loop Closure.\nProtective Plastic Travel Case.\nWorks With Most Audio Devices.\n48" Cord.\nProtective Plastic Travel Case.\nInterchangeable Earbud Covers.\nWorks With Most Audio '

-------
Last 300 letters of the data
-------


'Bulk\n5-14 days\nLarge\nIncludes TruColor or one standard color only on one location on both gloves\nBulk\nInventory shown for this item may be in multiple warehouse locations. Please allow for 5-14 business days for production or call our customer service team for ship date information.\nLarge\n5-14 days\n'

-------


In [256]:
data = data.replace('includes', '')

In [257]:
def split_to_sentences(data):
 
    sentences = data.split('\n')
   
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    
    return sentences 

In [258]:
def tokenize_sentences(sentences):
  
    tokenized_sentences = []
    
    for sentence in sentences:
        
        sentence = sentence.lower()
        sentence = sentence.replace('includes', '')
        sentence = sentence.replace('laser', '')
        sentence = sentence.replace('engraving', '')
        
        
        tokenized = nltk.word_tokenize(sentence)
        
        words=[word.lower() for word in tokenized if word.isalpha() and len(word) > 2]
        
        tokenized_sentences.append(words)
    
    
    return tokenized_sentences

In [259]:
def get_tokenized_data(data):
   
    sentences = split_to_sentences(data)
    
    tokenized_sentences = tokenize_sentences(sentences)
    
    
    return tokenized_sentences

In [260]:
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [261]:
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

print("First training sample:")
print(train_data[0])
      
print("First test sample")
print(test_data[0])

22671 data are split into 18136 train and 4535 test set
First training sample:
['one', 'location']
First test sample
['retractable', 'steel', 'tape']


In [262]:
def count_words(tokenized_sentences):
    word_counts = {}
    for sentence in tokenized_sentences: 
        for token in sentence: 
            if token not in word_counts.keys():
                word_counts[token] = 1
            else:
                word_counts[token] += 1
    return word_counts

In [263]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    for word, cnt in word_counts.items(): 
        if cnt >= count_threshold:
            closed_vocab.append(word)
    return closed_vocab

In [264]:
def replace_oov_words_by_unk(tokenized_sentences, vocabulary, unknown_token="<unk>"):
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []
    for sentence in tokenized_sentences:
        replaced_sentence = []
        for token in sentence: 
            if token in vocabulary: 
                replaced_sentence.append(token)
            else:
                replaced_sentence.append(unknown_token)
        replaced_tokenized_sentences.append(replaced_sentence)
    return replaced_tokenized_sentences

In [265]:

def preprocess_data(train_data, test_data, count_threshold):      
    vocabulary = get_words_with_nplus_frequency(train_data,count_threshold)    
    train_data_replaced = replace_oov_words_by_unk(train_data,vocabulary)    
    test_data_replaced = replace_oov_words_by_unk(test_data,vocabulary)
    return train_data_replaced, test_data_replaced, vocabulary

In [266]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

In [267]:

def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    n_grams = {}
    for sentence in data: 
        sentence = [start_token] * n+ sentence + [end_token]
        sentence = tuple(sentence)
        m = len(sentence) if n==1 else len(sentence)-1
        for i in range(m): 
            n_gram = sentence[i:i+n]
            if n_gram in n_grams.keys(): 
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    return n_grams

In [268]:

def estimate_probability(word, previous_n_gram, 
                         n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts  else 0
    denominator = previous_n_gram_count + k * vocabulary_size
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts  else 0
    numerator = n_plus1_gram_count + k
    probability = numerator / denominator
    return probability

In [287]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability
    return probabilities

In [288]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    vocabulary = vocabulary + ["<e>", "<unk>"]
    
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_index = {word:j for j, word in enumerate(vocabulary)}
    
    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
    
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix

In [289]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

In [290]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
   
    n = len(list(n_gram_counts.keys())[0]) 
    
    sentence = ["<s>"] * n + sentence + ["<e>"]
    
    sentence = tuple(sentence)
    
    N = len(sentence)
    
   
    product_pi = 1.0
    
    
    for t in range(n, N): # complete this line

        n_gram = sentence[t-n:t]
        
        word = sentence[t]
       
        probability = estimate_probability(word,n_gram, n_gram_counts, n_plus1_gram_counts, len(unique_words), k=1)
        
        product_pi *= 1 / probability

    perplexity = product_pi**(1/float(N))
    
    return perplexity

In [291]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    n = len(list(n_gram_counts.keys())[0]) 
    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    suggestion = None
    max_prob = 0
    for word, prob in probabilities.items(): # complete this line
        if start_with != None: 
            if not word.startswith(start_with): 
                continue  
        if prob > max_prob: 
            suggestion = word
            max_prob = prob
    return suggestion, max_prob

In [292]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]
        
        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [293]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['one', 'location']

First preprocessed test sample:
['retractable', 'steel', 'tape']

First 10 vocabulary:
['one', 'location', 'supply', 'power', 'multiple', 'devices', 'once', 'color', 'max', 'imprint']

Size of vocabulary: 2031


In [294]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [295]:
previous_tokens = ["sun"]
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['sun'], the suggestions are:


[('spf', 0.0024473813020068525),
 ('<e>', 0.0014756517461878996),
 ('<e>', 0.0014756517461878996),
 ('<e>', 0.0014756517461878996)]

In [296]:
previous_tokens = ["resis"]
#tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with="s")
tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)

The previous words are ['resis'], the suggestions are:


[('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665)]

In [297]:
minimum_freq = 3
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

In [298]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [299]:
previous_tokens = ["meets"]
#tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with="s")
tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)

The previous words are ['meets'], the suggestions are:


[('fda', 0.20459859703819175),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665)]

In [300]:
previous_tokens = ["sun", "bum", "product", "hit"]
#tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with="s")
tmp_suggest9 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

In [301]:
display(tmp_suggest9)

[('promotional', 0.03687943262411347),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665)]

In [302]:
previous_tokens = ["balloon"]
tmp_suggest9 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)
#tmp_suggest9 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)
display(tmp_suggest9)

[('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665),
 ('one', 0.0004918839153959665)]