In [13]:
from collections import Counter
from nltk import bigrams, trigrams
from tabulate import tabulate
import re

In [14]:
# Corpus source:
# https://git.wmi.amu.edu.pl/ryssta/moj-2024-ns-cw/src/branch/main/challenging_america_50k_texts.zip
file_name = './data/challenging_america_50k_texts.txt'
# Explicit encoding so the read does not depend on the platform's locale default.
# NOTE(review): assumes the corpus file is UTF-8 -- confirm against the dataset.
with open(file_name, 'r', encoding='utf-8') as file:
    file_contents = file.read()

# Keep only ASCII letters and whitespace. The upper-case range must be `A-Z`:
# `A-z` would also span the characters [ \ ] ^ _ and the backtick, letting them
# leak into the tokens.
text = re.sub(r'[^a-zA-Z\s]+', '', file_contents)
text = text.strip('\t\r\n')
text = text.lower()
# Whitespace tokenisation: every run of spaces/newlines separates two words.
word_list = text.split()

In [15]:
# Tally every token of the corpus, then keep the 15000 most frequent entries
# (as (word, count) pairs) together with a plain list of just the words.
unigram_counter = Counter()
unigram_counter.update(word_list)
most_common_15000 = unigram_counter.most_common(15000)
most_common_15000_words = [word for word, _count in most_common_15000]
print(most_common_15000[:10])

[('the', 1045827), ('of', 630648), ('and', 496188), ('to', 410609), ('a', 309302), ('in', 309085), ('that', 168335), ('is', 146755), ('it', 133838), ('for', 131568)]


In [16]:
# Placeholder substituted for every token that is not part of the vocabulary.
unknown_word_token = '[UNK]'
# Set form of the vocabulary so membership tests are constant-time.
most_common_15000_words_set = set(most_common_15000_words)


def parseWord(word: str):
    """Return `word` if it belongs to the vocabulary, otherwise the unknown-word token."""
    if word in most_common_15000_words_set:
        return word
    return unknown_word_token


# Corpus tokens with every out-of-vocabulary word replaced by the placeholder.
parsed_word_list = list(map(parseWord, word_list))

In [17]:
# Count adjacent token pairs. Counter consumes the bigram iterator directly,
# which avoids building a temporary list holding one tuple per corpus token.
bigram_counter = Counter(bigrams(parsed_word_list))
# (bigram, count) pairs ordered from most to least frequent.
bigrams_most_common = bigram_counter.most_common()
print(bigrams_most_common[:10])

[(('[UNK]', '[UNK]'), 213550), (('of', 'the'), 173395), (('the', '[UNK]'), 140355), (('[UNK]', 'and'), 84323), (('[UNK]', 'of'), 81018), (('in', 'the'), 79826), (('to', 'the'), 61877), (('of', '[UNK]'), 59336), (('and', '[UNK]'), 58576), (('[UNK]', 'the'), 57570)]


In [18]:
# Count consecutive token triples. Counter consumes the trigram iterator
# directly, which avoids building a temporary list with one tuple per token.
trigram_counter = Counter(trigrams(parsed_word_list))
# (trigram, count) pairs ordered from most to least frequent; the lookup
# helpers below rely on this ordering to return the likeliest match first.
trigrams_most_common = trigram_counter.most_common()
print(trigrams_most_common[:10])

[(('[UNK]', '[UNK]', '[UNK]'), 43910), (('of', 'the', '[UNK]'), 22662), (('the', '[UNK]', 'of'), 20887), (('[UNK]', 'of', 'the'), 20796), (('[UNK]', 'and', '[UNK]'), 17426), (('the', '[UNK]', '[UNK]'), 16789), (('[UNK]', 'of', '[UNK]'), 11473), (('[UNK]', 'the', '[UNK]'), 11435), (('[UNK]', '[UNK]', 'and'), 11348), (('[UNK]', 'in', 'the'), 9253)]


In [19]:
def find_matching_trigrams(word1, word2):
    """Collect the (trigram, count) entries that continue the pair (word1, word2).

    Entries are taken from `trigrams_most_common` in its existing order, and
    those whose third token is the unknown-word placeholder are left out.
    """
    matches = []
    for entry in trigrams_most_common:
        (first, second, third), _count = entry
        if first == word1 and second == word2:
            if third != unknown_word_token:
                matches.append(entry)
    return matches

In [20]:
def predict_next_word(word1: str, word2: str, top_n: int = 5):
    """Rank the most likely words to follow the two-word context.

    Both context words are first mapped onto the vocabulary with `parseWord`.

    Args:
        word1: First word of the context.
        word2: Second word of the context.
        top_n: Maximum number of predictions to return (defaults to 5).

    Returns:
        A list of `(rank, word, probability)` tuples, rank starting at 1, where
        probability is the trigram count divided by the count of the context
        bigram. The list is empty when the context bigram was never counted.
    """
    parsed_word_1 = parseWord(word1)
    parsed_word_2 = parseWord(word2)
    words_bigram_count = bigram_counter.get((parsed_word_1, parsed_word_2))
    if not words_bigram_count:
        # Bigrams and trigrams are counted over the same token list, so an
        # unseen context has no trigram either; skip the full scan.
        return []

    predictions = find_matching_trigrams(parsed_word_1, parsed_word_2)[:top_n]
    return [
        (rank, trigram[2], count / words_bigram_count)
        for rank, (trigram, count) in enumerate(predictions, start=1)
    ]


# Two-word contexts used to try out the next-word prediction.
test_list = ["this year", "the mr", "they have", "the best", "a few"]
prediction_headers = ["Index", "Predicted word", "Probability"]

for phrase in test_list:
    words = phrase.split()
    prediction_list = predict_next_word(words[0], words[1])
    print(f'Phrase: {phrase}')
    prediction_table = tabulate(prediction_list, headers=prediction_headers, tablefmt="grid")
    print(prediction_table, '\n')


Phrase: this year
+---------+------------------+---------------+
|   Index | Predicted word   |   Probability |
|       1 | the              |     0.0914286 |
+---------+------------------+---------------+
|       2 | and              |     0.0742857 |
+---------+------------------+---------------+
|       3 | in               |     0.04      |
+---------+------------------+---------------+
|       4 | will             |     0.0352381 |
+---------+------------------+---------------+
|       5 | is               |     0.0295238 |
+---------+------------------+---------------+ 

Phrase: the mr
+---------+------------------+---------------+
|   Index | Predicted word   |   Probability |
|       1 | i                |     0.0833333 |
+---------+------------------+---------------+
|       2 | wil              |     0.0833333 |
+---------+------------------+---------------+
|       3 | wilsons          |     0.0833333 |
+---------+------------------+---------------+
|       4 | huntington   

In [21]:
def calculate_sentence_probability(sentence: str):
    """Estimate the probability of a sentence with the unsmoothed n-gram counts.

    The first word is scored with its unigram frequency, the second with a
    bigram estimate and every later word with a trigram estimate. Each word is
    mapped onto the vocabulary with `parseWord` before the lookups, because the
    bigram and trigram counters are keyed on the vocabulary-mapped tokens; an
    out-of-vocabulary word is therefore scored as the unknown-word token
    instead of forcing the whole sentence to probability zero.

    Args:
        sentence: Whitespace-separated words.

    Returns:
        A `(probability, steps)` tuple. `steps` holds one
        `[index, sentence prefix, word probability, running probability]` row
        per processed word. Scoring stops at the first word that drives the
        running probability to zero, in which case the probability is `0`.
        An empty sentence yields `(1.0, [])`.
    """
    words = sentence.split()
    tokens = [parseWord(word) for word in words]
    steps = []
    probability = 1.0

    def unigram_count(token):
        # `unigram_counter` is built from the raw words, so the unknown-word
        # token has no entry of its own: its count is every corpus token that
        # is not covered by the kept vocabulary.
        if token == unknown_word_token:
            return len(word_list) - sum(count for _, count in most_common_15000)
        return unigram_counter[token]

    for index in range(len(tokens)):
        if index == 0:
            numerator = unigram_count(tokens[0])
            denominator = len(word_list)
        elif index == 1:
            numerator = bigram_counter[tuple(tokens[:2])]
            denominator = unigram_count(tokens[0])
        else:
            # Counter indexing yields 0 for unseen n-grams without inserting them.
            numerator = trigram_counter[tuple(tokens[index-2:index+1])]
            denominator = bigram_counter[tuple(tokens[index-2:index])]

        # A zero denominator means the conditioning context was never seen.
        x = numerator / denominator if denominator else 0.0
        probability *= x

        # The reported prefix keeps the words exactly as the caller wrote them.
        steps.append([index, ' '.join(words[:index+1]), x, probability])
        if probability == 0:
            return 0, steps

    return probability, steps

# The same four words in two different orders.
test_list = ['it has been a', 'it been has a']
step_headers = ["Step", "Sentence", "New word probability", "Total probability"]

for sentence in test_list:
    probability, steps = calculate_sentence_probability(sentence)
    print(f'Probability of sentence "{sentence}": {probability}')
    steps_table = tabulate(steps, headers=step_headers, tablefmt="grid")
    print(steps_table, '\n')

Probability of sentence "it has been a": 3.035875468077675e-06
+--------+---------------+------------------------+---------------------+
|   Step | Sentence      |   New word probability |   Total probability |
|      0 | it            |             0.00845288 |         0.00845288  |
+--------+---------------+------------------------+---------------------+
|      1 | it has        |             0.0254113  |         0.000214799 |
+--------+---------------+------------------------+---------------------+
|      2 | it has been   |             0.342546   |         7.35786e-05 |
+--------+---------------+------------------------+---------------------+
|      3 | it has been a |             0.0412603  |         3.03588e-06 |
+--------+---------------+------------------------+---------------------+ 

Probability of sentence "it been has a": 0
+--------+-------------+------------------------+---------------------+
|   Step | Sentence    |   New word probability |   Total probability |
|      0

In [22]:
def complete_sentence(sentence: str, length: int = 15):
    """Greedily extend a sentence until it contains `length` words.

    At every step the last two words (mapped onto the vocabulary with
    `parseWord`) are used as context and the first entry returned by
    `find_matching_trigrams` supplies the next word.

    Args:
        sentence: Starting text; must contain at least two words.
        length: Target number of words in the result (defaults to 15).

    Returns:
        The original sentence followed by the generated words. The sentence is
        returned unchanged when it already has `length` or more words, and
        generation stops early when the current context has no known
        continuation.

    Raises:
        ValueError: If `sentence` contains fewer than two words.
    """
    words = sentence.split()
    if len(words) < 2:
        raise ValueError("sentence string argument must contain at least two words")

    completed = sentence
    # `<` rather than an equality test: an input that is already long enough
    # must end the generation instead of extending the sentence forever.
    while len(words) < length:
        candidates = find_matching_trigrams(parseWord(words[-2]), parseWord(words[-1]))
        if not candidates:
            # No trigram continues this context; return what has been built.
            break
        generated_word: str = candidates[0][0][2]
        words.append(generated_word)
        completed += ' ' + generated_word

    return completed
    

In [23]:
# Two-word openings to be extended to the default sentence length.
test_list = ["it took", "because there", "actually it", "in my", "after a"]

for sentence in test_list:
    completion = complete_sentence(sentence)
    print(completion)


it took a long time and place of beginning containing acres more or less to
because there is no doubt that the said mortgage and the other hand the following
actually it exercises its extreme authority to make a good deal of money and the
in my opinion is that the said mortgage and the other hand the following described
after a few days ago and the other hand the following described real estate and
