In [89]:
#Credit: Most of these codes are from the NLP course by Deeplearn.AI on Coursera
#I created the text generating part myself

In [1]:
import nltk
nltk.download('punkt')
import numpy as np
import os
from PyPDF2 import PdfReader
import random

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
file_names = os.listdir(r"C:\Users\User\Desktop\python\prospectus\files\\")
data = ""
for file in file_names:
    file_name = r"C:\Users\User\Desktop\python\prospectus\files\\" + file
    reader = PdfReader(file_name)    
    for i in range(len(reader.pages)):
        page = reader.pages[i]
        data += page.extract_text()

In [19]:
def split_to_sentences(data):
    sentences = data.replace("\n", " ").split(". ")
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    return sentences

In [20]:
def tokenize_sentences(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokenized = nltk.word_tokenize(sentence)
        tokenized_processed = [t.lower() for t in tokenized if not t.isupper()]
        if len(tokenized_processed) > 5:
            tokenized_sentences.append(tokenized_processed)
    return tokenized_sentences

In [21]:
def get_tokenized_data(data):
    return tokenize_sentences(split_to_sentences(data))

In [22]:
def count_words(tokenized_sentences):
    word_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence:
            word_counts[token] = word_counts.get(token, 0) + 1
    return word_counts

In [23]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    for word, cnt in word_counts.items():
        if cnt >= count_threshold:
            closed_vocab.append(word)
    return closed_vocab

In [24]:
def replace_oov_words_by_unk(tokenized_sentences, 
                             vocabulary, 
                             unknown_token="<unk>"):
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []
    for sentence in tokenized_sentences:
        replaced_sentence = []
        for token in sentence:
            if token in vocabulary:
                replaced_sentence.append(token)
            else:
                replaced_sentence.append(unknown_token)
        replaced_tokenized_sentences.append(replaced_sentence)
    return replaced_tokenized_sentences

In [32]:
def preprocess_data(data, 
                    count_threshold, 
                    unknown_token="<unk>", 
                    get_words_with_nplus_frequency=get_words_with_nplus_frequency, 
                    replace_oov_words_by_unk=replace_oov_words_by_unk):
    vocabulary = get_words_with_nplus_frequency(data, count_threshold)
    data_replaced = replace_oov_words_by_unk(data, vocabulary, unknown_token)
    return data_replaced, vocabulary

In [26]:
def count_n_grams(data, n, start_token="<s>", end_token="<e>"):
    n_grams = {}
    for sentence in data:
        sentence = [start_token] * n + sentence + [end_token]
        sentence = tuple(sentence)
        for i in range(len(sentence) - (n - 1)):
            n_gram = sentence[i:i+n]
            n_grams[n_gram] = n_grams.get(n_gram, 0) + 1
    return n_grams

In [27]:
def estimate_probability(word,
                         previous_n_gram,
                        n_gram_counts,
                        n_plus1_gram_counts,
                        vocabulary_size, 
                        k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + k * vocabulary_size
    n_plus1_gram = previous_n_gram + (word, )
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k
    return numerator / denominator

In [28]:
def estimate_probabilities(previous_n_gram,
                          n_gram_counts,
                          n_plus1_gram_counts,
                          vocabulary,
                          end_token="<e>",
                           unknown_token="<unk>",
                           k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    vocabulary = vocabulary + [end_token, unknown_token]
    vocabulary_size = len(vocabulary)
    
    probabilities = {}
    for word in vocabulary:
        probabilities[word] = estimate_probability(word,
                                                   previous_n_gram,
                                                   n_gram_counts, 
                                                   n_plus1_gram_counts, 
                                                   vocabulary_size, 
                                                   k=k)
    return probabilities

In [62]:
def suggest_a_word(previous_tokens,
                  n_gram_counts,
                  n_plus1_gram_counts,
                  vocabulary,
                  end_token="<e>",
                  unknown_token="<unk>",
                   k=1.0,
                   start_with=None):
    
    n = len(list(n_gram_counts.keys())[0])
    previous_tokens = ["<s>"] * n + previous_tokens
    previous_n_gram = previous_tokens[-n:]
    
    if tuple(previous_n_gram) not in n_gram_counts:
        return None
    
    probabilities = estimate_probabilities(previous_n_gram,
                                          n_gram_counts,
                                          n_plus1_gram_counts,
                                          vocabulary,
                                          k=k)
    
    del probabilities["<unk>"]
    
    probabilities_sorted = {k: v for k, v in sorted(probabilities.items(), key=lambda item: item[1], reverse=True)}
        
    prob_sorted_top = []
    max_prob = list(probabilities_sorted.items())[0][1]
    for item in list(probabilities_sorted.items()):
        if item[1] == max_prob:
            prob_sorted_top.append(item)
        
    words = [t[0] for t in prob_sorted_top]
    probs = [t[1] for t in prob_sorted_top]
    
    return random.choices(words, weights=probs)[0]

In [30]:
def get_suggestions(previous_tokens,
                   n_gram_counts_list,
                   vocabulary,
                    k=1.0,
                    start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts - 1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i + 1]
        
        suggestion = suggest_a_word(previous_tokens, 
                                    n_gram_counts,
                                    n_plus1_gram_counts, 
                                    vocabulary,
                                    k=k, 
                                    start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [33]:
tokenized_data = get_tokenized_data(data)
print(len(tokenized_data))

minimum_freq = 2
data_processed, vocabulary = preprocess_data(tokenized_data, minimum_freq)

17793


In [34]:
n_gram_counts_list = []
for n in range(1, 6):
    n_model_counts = count_n_grams(data_processed, n)
    n_gram_counts_list.append(n_model_counts)

In [82]:
text = "business"
previous_tokens = text.split(" ")
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['business'], the suggestions are:


['and', 'flow', 'flow', 'model']

In [87]:
N = 4
previous_tokens = []
output = ""
for i in range(100):
    
    candidates = get_suggestions(previous_tokens,
                                 n_gram_counts_list,
                                 vocabulary,
                                 k=1.0,
                                 start_with=None)
    
    for j in range(N):
        if candidates[N - j - 1] != None:
            next_word = candidates[N - j - 1]
            break
    
    previous_tokens.append(next_word)
    output += next_word + " "
    if (next_word == "<e>") or (i==99):
        print(output.replace("<e>", "."), end=" ")
        previous_tokens = []
        output = ""

we have also implemented a number of internal rules and policies to extend the lifecycle of our games ’ ’ .  we have also entered into collaboration and license agreements with 21,123 ) ( 6.4 ) ( 34,354 ) ( 3.5 ) ( ofthis document .  we have also implemented stringent control on our inventory level and the requirements of our customers .  we have also adopted treasury policies to manage our daily expenses and cash withdrawals , so as to ensure our working capital sufficiency by taking advantage of modern technologies to develop servicecategory .  we have also established  