In [1]:
from nltk.corpus import words
# Load English dictionary
english_dictionary = set(words.words())

In [2]:
# import required modules
from tkinter import *
import tkinter.font as font
from nltk import ngrams
import string
import numpy as np
import re
from nltk.tokenize import sent_tokenize
import wordninja
import pickle
from nltk import edit_distance
# import enchant
import eng_to_ipa as eng_to_ipa
from nltk.util import pad_sequence

In [3]:
# Load the saved unigram dictionary from the file
with open("unique_unigram_list.pickle", "rb") as file:
    unigram_list = pickle.load(file)
    
with open("bigram_prob_dict_final.pickle", "rb") as file:
    bigram_probabilities = pickle.load(file)
bigram_probabilities[('</s>', '<s>')] = 1
    
with open("unigram_ipa.pickle", "rb") as file:
    unigrams_ipa = pickle.load(file)

#Load the Counter object from the file
with open('unigrams_counter_final.pickle', 'rb') as file:
    words_counter = pickle.load(file)


unigrams_ipa_dict = {entry['token']: entry.get('ipa') for entry in unigrams_ipa}

In [7]:
clicked_word = ""

# re patterns
#remove url
url_pattern_1 =r'(http[s]?://)?[a-zA-Z0-9]+([-.][a-zA-Z0-9]+)\.[a-zA-Z]{2,3}(/\S)?'
url_pattern_2 = r'http\S+|www.\S+'

#remove '-\n' patterns
hyphen_nl_pattern = r'(-\n)'

#remove hyphens that are found in between a word
hyphen_pattern = r'(?<=[a-z])-(?=[a-z])'

#remove digits
digits = '\d+'

#remove \n
next_line = r'\n'

#remove any symbols
symbols = r'[^\w\s]'

#remove double spaces
double_space = r'\s{2}' 


def sent_preprocess(text):
    sentences = sent_tokenize(text)
    sentences_clean = []
    for sentence in sentences:
        cleaned_sentence = re.sub(url_pattern_1, '', sentence)
        cleaned_sentence = re.sub(url_pattern_2, '', cleaned_sentence)
        cleaned_sentence = re.sub(hyphen_nl_pattern, '', cleaned_sentence)
        cleaned_sentence = re.sub(hyphen_pattern, ' ', cleaned_sentence)
        cleaned_sentence = re.sub(next_line,' ', cleaned_sentence)
        cleaned_sentence = re.sub(digits,'', cleaned_sentence)
        cleaned_sentence = re.sub(symbols,'', cleaned_sentence)
        cleaned_sentence = re.sub(double_space,' ', cleaned_sentence)
        cleaned_sentence = cleaned_sentence.lower()
        sentences_clean.append(cleaned_sentence)
    return sentences_clean

def unnest_list(nested_list):
    """Unnest a list to convert nestted token in list form into unnested one
    
    nested_list = [1, [2, 3], [4, [5, 6]], 7, [8, [9, 10]]]
    
    Example:

    flat_list = unnest_list(nested_list)
    print(flat_list)

    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"""
    
    unnested = []
    for item in nested_list:
        if isinstance(item, list):
            unnested.extend(unnest_list(item))
        else:
            unnested.append(item)
    return unnested

def break_down(texts):
    """Start from preprocess the texts by removing noises such as punctuation and lower case it. Then tokenize the texts into
    individual words and return it in unnested list"""
    sentences = sent_preprocess(texts)
    
    token = []
    for sentence in sentences:
        words = sentence.split()
        padded = list(pad_sequence(words, pad_left=True, left_pad_symbol="<s>", pad_right=True, right_pad_symbol="</s>", n=2))
        token.append(padded)
    
    flat_token = unnest_list(token)
    
    return(flat_token)

# function to obtain ipa with a word
def get_ipa(word):
    return unigrams_ipa_dict.get(word)

#generate probability of word occuring in the corpus
def P(word, N=sum(words_counter.values())): 
    # Returns probability of a word in vocabulary.
    return words_counter[word] / N

def initialize_dictionary_list():
    # Clear the previous content in the dictionary_list widget
    dictionary_list.delete(0, "end")

    # Display the dictionary list in the dictionary_list widget
    for word in unigram_list:
        word_entry = f"{word}\n"
        dictionary_list.insert(END, word_entry)
    
def combine_funcs(*funcs):
    """to combine multiple function inside one command when click on the button
    
    Example:
    
    combined_func(print(), list(), max())"""
    
    def combined_func(*args, **kwargs):
        for f in funcs:
            f(*args, **kwargs)
    return combined_func
    
def detect_non_word_errors():
    """To detect non-word error based on dictionary look up and highlight the text"""
    # Get the content of the text widget
    input_text = editor.get("1.0", "end")
    
    # set the dictionary of known words
    known_words = unigram_list
    
    # split the input text and preprocess it
    words = break_down(input_text)
    
    # check if the word is a non-word error
    for word in words:
        if word in known_words:
            continue
        elif word in english_dictionary:
            continue
        else:
            start_index = editor.search(word, "1.0", "end", exact=True, nocase=True)
            if start_index:
                end_index = f"{start_index}+{len(word)}c"
                editor.tag_add("non-word error", start_index, end_index)
            
def calculate_bigram_probability(bigram):
    return bigram_probabilities.get(bigram, 0.0)

def detect_real_word_errors():

    # Get the content of the text widget
    input_text = editor.get("1.0", "end")
    # set the dictionary of known words
    # known_words = unigram_list
    # split the input text and preprocess it
    window_size = 2
    min_bigram_probability = 0.00001
    words = break_down(input_text)
    #print(words)

    for i in range(1, len(words)):
        current_word = words[i]
        
        # Check bigram probability
        if i > 0:
            if current_word not in unigram_list:
                continue
            previous_word = words[i-1]
            if i < len(words)-1:
                next_word = words[i+1]
            else: pass
            bigram = list(ngrams([previous_word, current_word,next_word], window_size))
            bigram_probabilities = []
            for bg in bigram:
                bigram_probabilities.append(calculate_bigram_probability(bg))
                print(bigram,'-',bigram_probabilities)
            avg_probability = np.mean(bigram_probabilities)
            #print(avg_probability)
            if avg_probability >= min_bigram_probability:
                #print('skipped')
                continue
            else: print('WRONG')
        #Find the start and end index of the word
        start_index = editor.search(current_word, "1.0", "end", exact = True, nocase = True)
        #curr_word_pattern = r"\b" + re.escape(current_word) + r"\b"
        #start_index = editor.search(curr_word_pattern, "1.0", "end", regexp=True, exact = False)
        #print(start_index)
        while start_index:
            end_index = f"{start_index}+{len(current_word)}c"
        #print(end_index)

        # Add a tag to highlight the real word error
            editor.tag_add("real word error", start_index, end_index)
            start_index = editor.search(current_word, end_index, "end", exact=True, nocase=True)
            
def generate_word_candidates(event):
    global clicked_word
    
    # Get the index of the clicked word
    index = editor.index("@%s,%s wordstart" % (event.x, event.y))
    
    # Get the word at the clicked index
    clicked_word = editor.get(index + " wordstart", index + " wordend").lower()
    
    # Define corpus and bigram_probabilities
    corpus = unigram_list
    global bigram_probabilities
    
    # Find the index of the mispelled word in the sentence 
    sentence = editor.get("1.0", "end-1c")
    words = break_down(sentence)
    misspelled_index = words.index(clicked_word)
    
    candidates = []
    # Generate word candidates using edit distance and bigram probability 
    for word in corpus:
        if len(word) <= 7:
            if edit_distance(word, clicked_word, transpositions=True) <= 1:
                candidates.append(word)
        else:
            if edit_distance(word, clicked_word, transpositions=True) <= 2:
                candidates.append(word)
    
    probabilities = {}
    for candidate in candidates:
        # Get previous word for bigram probabilities computation
        previous_word = words[misspelled_index - 1]
        bigram_prob = calculate_bigram_probability((previous_word,candidate))
        probabilities[candidate] = bigram_prob
    
    sorted_dict = {k: v for k, v in sorted(probabilities.items(), key=lambda x: x[1],reverse=True)}
    probabilities_list = list(sorted_dict.keys())
    
    
    # Clear the existing text in the correction label widget
    suggested_candidates.delete(0, "end")

    # Generate the word candidates in the correction label widget
    for candidate in probabilities_list:
        suggested_candidates.insert("end", candidate + '\n')

def generate_real_word_candidates(event):
    global clicked_word
    
    # Get the index of the clicked word
    index = editor.index("@%s,%s wordstart" % (event.x, event.y))
    
    # Get the word at the clicked index
    clicked_word = editor.get(index + " wordstart", index + " wordend").lower()
    clicked_word_lower = clicked_word.lower()
    
    # Define corpus and bigram_probabilities
    corpus = unigram_list
    global bigram_probabilities
    
    # Find the index of the mispelled word in the sentence 
    sentence = editor.get("1.0", "end-1c")
    words = break_down(sentence)
    misspelled_index = words.index(clicked_word_lower)
    
    candidates = []
    word_ipa = eng_to_ipa.convert(eng_to_ipa.convert(clicked_word_lower))
    for word in corpus:
        ed = edit_distance(word, clicked_word_lower, transpositions=True)
        if len(word) <= 7:
            if ed <= 1:
                candidates.append(word)
            if ed <=4:
                w_ipa = get_ipa(word)
                if w_ipa is not None:
                    ed_ipa = edit_distance(w_ipa, word_ipa)
                    if ed_ipa ==0:
                        candidates.append(word)    
        else:
            if ed <= 2:
                candidates.append(word)
                            
    candidates = list(set(candidates))
    
    probabilities = {}
    for candidate in candidates:
        # Get previous word for bigram probabilities computation
        previous_word = words[misspelled_index - 1]
        next_word = words[misspelled_index + 1]
        prob = P(candidate)*calculate_bigram_probability((previous_word,candidate))*calculate_bigram_probability((candidate,next_word))
        probabilities[candidate] = prob
                
    sorted_dict = {k: v for k, v in sorted(probabilities.items(), key=lambda x: x[1],reverse=True)}
    probabilities_list = list(sorted_dict.keys())
      
    # Clear the existing text in the correction label widget
    suggested_candidates.delete(0, "end")
    
    for candidate in probabilities_list:
        suggested_candidates.insert("end", candidate + "\n")
        
def replace_word_with_candidate(event):
    global clicked_word
    
    candidate_index = suggested_candidates.curselection()
    
    if candidate_index:
        clicked_candidate = suggested_candidates.get(candidate_index)
        clicked_candidate = clicked_candidate.strip()
        
        sentence = editor.get("1.0", "end")
        
        updated_text = sentence.replace(clicked_word, clicked_candidate)
    
        # Clear the editor and insert the updated text
        editor.delete("1.0", "end")
        editor.insert("end", updated_text)
        
def dictionary_search():
    search_word = search_text.get(1.0, "end").strip().lower()
    
    if search_word:
        found = False
        dictionary_list.selection_clear(0, 'end')
        for index in range(dictionary_list.size()):
            word = dictionary_list.get(index)
            if search_word == word.lower().strip():
                dictionary_list.selection_set(index)
                dictionary_list.see(index)
                dictionary_list.activate(index)
                dictionary_list.itemconfig(index, bg = 'yellow')
                found = True
            else:
                dictionary_list.itemconfig(index, bg = 'white')
                                  
wn = Tk()
wn.geometry("1000x1000")
wn.configure(bg='azure2')
wn.title("Spell Checker")

searchWord = StringVar()

headingFrame1 = Frame(wn, bg="gray91", bd=5)
headingFrame1.place(relx=0.05, rely=0.01, relwidth=0.9, relheight=0.1)

headingLabel = Label(headingFrame1, text="Automatic Spelling Correction System", fg='grey19', font=('Courier', 20, 'bold'))
headingLabel.place(relx=0, rely=0, relwidth=1, relheight=1)

# Legend
Label(wn, text='Legend: Red colour = Non-word error', bg='azure2', font=('Italian', 12)).place(x=20, y=130)
Label(wn, text='Yellow colour = Real-word error', bg='azure2', font=('Italian', 12)).place(x=82, y=170)

# Editor section
Label(wn, text='Please input sentence', bg='azure2', font=('Courier', 12)).place(x=20, y=220)

editor = Text(wn, height=12, width=106, font=('calibre', 12, 'normal'))
editor.place(x=20, y=250)

# Configure a tag for non-word and real word errors
editor.tag_configure("non-word error", background="red")
editor.tag_configure("real word error", background = "yellow")

# Bind the function to the wrong word click event
editor.tag_bind("non-word error", "<Button-1>", generate_word_candidates)
editor.tag_bind("real word error", "<Button-1>", generate_real_word_candidates)

# Dictionary List section
dictionary_label = Label(wn, text='Dictionary List:', bg='azure2', font=('Courier', 12))
dictionary_label.place(x=20, y=480)

dictionary_list = Listbox(wn, height=7, width=50, font=('calibre', 12, 'normal'))
dictionary_list.place(x=20, y=500)

# Initialize the dictionary list when run GUI 
initialize_dictionary_list()

# Search Section
search_label = Label(wn, text = 'Search:', bg = 'azure2', font = ('Courier', 12))
search_label.place(x = 20, y = 650)

search_text = Text(wn, height = 2, width = 20, font = ('calibre', 12, 'normal'))
search_text.place(x = 20, y = 675)

# Search Button
search_button = Button(wn, text = 'Search', bg='honeydew2', fg='black', width=15, height=1, command = dictionary_search)
search_button['font'] = font.Font(size = 12)
search_button.place(x = 20, y = 725)

# Check button
check_button = Button(wn, text='Check', bg='honeydew2', fg='black', width=15, height=1, 
                      command=combine_funcs(detect_non_word_errors, detect_real_word_errors))
check_button['font'] = font.Font(size=14)
check_button.place(x=780, y=525)

# Section to display corrected words
suggested_candidates_label = Label(wn, text='Candidate Words:', bg='azure2', font=('Courier', 12))
suggested_candidates_label.place(x=500, y=480)

suggested_candidates = Listbox(wn, height=7, width=25, font=('calibre', 12, 'normal'))
suggested_candidates.place(x=500, y=500)
suggested_candidates.bind("<<ListboxSelect>>", replace_word_with_candidate)

wn.mainloop()

[('<s>', 'gemini'), ('gemini', 'astrolgical')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrolgical')] - [0.00629, 0.0]
[('astrolgical', 'symbol'), ('symbol', 'should')] - [0.0]
[('astrolgical', 'symbol'), ('symbol', 'should')] - [0.0, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>')] - [0.11111]
[('question', 'mark'), ('mark', '</s>')] - [0.11111, 0.05263]
[('youre', 'a'), ('a', 'perpetual')] - [0.112]
[('youre', 'a'), ('a', 'pe

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>

[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629]
[('<s>', 'gemini'), ('gemini', 'astrological')] - [0.00629, 0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0]
[('gemini', 'astrological'), ('astrological', 'symbol')] - [0.0, 0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463]
[('astrological', 'symbol'), ('symbol', 'should')] - [0.07463, 0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639]
[('symbol', 'should'), ('should', 'really')] - [0.01639, 0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685]
[('should', 'really'), ('really', 'be')] - [0.00685, 0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333]
[('really', 'be'), ('be', 'the')] - [0.03333, 0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419]
[('be', 'the'), ('the', 'question')] - [0.0419, 0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068]
[('the', 'question'), ('question', 'mark')] - [0.00068, 0.11111]
[('question', 'mark'), ('mark', '</s>