In [1]:
from collections import defaultdict
import pandas as pd

In [2]:
# header=None: the first row of each file is data, not column labels;
# names=["text"] supplies the column label instead.
train = pd.read_csv(
    '../data/train.csv',
    header=None,
    names=["text"],
)
test = pd.read_csv(
    '../data/answers.csv',
    header=None,
    names=["text"],
)

In [3]:
def update_ngram_freq_table(ngram_freq_table, text, n):
    """Add the n-gram counts of ``text`` to ``ngram_freq_table`` in place.

    Every contiguous slice of ``text`` of length ``n`` is counted once per
    occurrence, including overlapping ones. A ``text`` shorter than ``n``
    contributes nothing.

    Args:
        ngram_freq_table: Mutable mapping from n-gram to count. Missing keys
            must default to a number (e.g. ``defaultdict(int)``), because
            counts are bumped with ``+= 1``.
        text: Sliceable sequence to scan.
        n: Length of each n-gram.

    Returns:
        None. The table is updated as a side effect.
    """
    window_count = len(text) - n + 1
    windows = (text[start:start + n] for start in range(window_count))
    for window in windows:
        ngram_freq_table[window] += 1

def build_frequency_tables(texts):
    """Count 2-grams and 3-grams across every item of ``texts``.

    Args:
        texts: Iterable of sequences; each one is passed to
            ``update_ngram_freq_table`` for n = 2 and then n = 3.

    Returns:
        A ``(bigram_freq_table, trigram_freq_table)`` tuple of
        ``defaultdict(int)`` objects mapping each n-gram to its total count.
    """
    tables_by_size = {2: defaultdict(int), 3: defaultdict(int)}

    for text in texts:
        # Dict order is insertion order, so bigrams are counted before trigrams.
        for size, table in tables_by_size.items():
            update_ngram_freq_table(table, text, size)

    return tables_by_size[2], tables_by_size[3]



In [14]:
# Build the 2-gram and 3-gram count tables from the "text" column of `train`.
bigram_freq_table, trigram_freq_table = build_frequency_tables(train["text"])
# Bare last expression: the notebook renders the trigram table as the cell output.
trigram_freq_table

defaultdict(int,
            {'SR2': 116,
             'R2S': 129,
             '2SR': 145,
             '2SG': 44,
             'SGR': 96,
             'SRG': 74,
             'RGS': 29,
             'GSR': 94,
             'SRS': 595,
             'RSR': 365,
             'KFK': 26,
             'FKF': 24,
             'X2L': 8,
             '2LS': 8,
             'LS2': 7,
             'S2F': 16,
             '2FR': 19,
             'FRS': 13,
             'RS2': 114,
             'MMM': 478,
             'MMG': 49,
             'MGG': 49,
             'GGG': 282,
             '222': 573,
             'LLL': 84,
             'LLK': 17,
             'LK2': 11,
             'K2Z': 43,
             '2ZT': 155,
             'ZT2': 214,
             'SG1': 19,
             'G1S': 26,
             '1SR': 104,
             'RSG': 147,
             '2M2': 107,
             'M2M': 83,
             '2MM': 94,
             'MMS': 133,
             'MSG': 89,
             'T22': 11,
           

In [24]:
def predict_next_char(bigram_freq_table, trigram_freq_table, previous_chars):
    """Predict the character most likely to follow ``previous_chars``.

    An n-gram consists of n - 1 context characters plus the character that
    follows them, so the trigram table is queried with the last two
    characters and the bigram table with the last one. The lookup backs off
    from the more specific context to the less specific one: the bigram
    table is consulted only when no trigram continues the context.

    Raw counts are deliberately not compared across the two tables. When both
    are built from the same texts, every trigram occurrence also counts
    toward the bigram formed by its last two characters, so a
    "highest count wins" rule would never select the trigram prediction.

    Args:
        bigram_freq_table: Mapping of 2-character strings to counts.
        trigram_freq_table: Mapping of 3-character strings to counts.
        previous_chars: The text seen so far.

    Returns:
        The last character of the most frequent matching n-gram, or ``None``
        when ``previous_chars`` is empty or neither table contains an n-gram
        that continues it. Ties go to the entry encountered first while
        iterating the table.
    """
    # (context length, table) pairs, most specific context first.
    lookups = ((2, trigram_freq_table), (1, bigram_freq_table))

    for context_len, freq_table in lookups:
        if len(previous_chars) < context_len:
            # Not enough history for this table; fall through to a shorter context.
            continue

        context = previous_chars[-context_len:]
        best_char = None
        best_freq = 0
        for ngram, freq in freq_table.items():
            # The length check guards against keys of an unexpected size.
            if (
                len(ngram) == context_len + 1
                and ngram.startswith(context)
                and freq > best_freq
            ):
                best_freq = freq
                best_char = ngram[-1]

        if best_char is not None:
            return best_char

    return None


In [31]:
# Try the predictor on a hand-picked context string using the tables built above.
previous_chars = "11BC"
predicted_char = predict_next_char(bigram_freq_table, trigram_freq_table, previous_chars)
# predicted_char may be None, in which case the message prints "None".
print(f"Predicted next character: {predicted_char}")


Predicted next character: None
