# Wordle Bot

A natural-language-processing bot that helps solve Wordle puzzles.

**References**:
1.  https://www.pythonprogramming.in/find-frequency-of-each-word-from-a-text-file-using-nltk.html
2. https://mathspp.com/blog/solving-wordle-with-python


# Import NLTK Corpus

We use the Webtext and ABC corpora from the Natural Language Toolkit (NLTK).

In [1]:
import nltk
from nltk.corpus import webtext, abc
from nltk.probability import FreqDist

nltk.download('webtext')
nltk.download('abc')

[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package abc to /root/nltk_data...
[nltk_data]   Package abc is already up-to-date!


True

# Utility Functions
1. SameLetterInWord() - Check whether any letter appears more than once in a word
2. FindHighestFreqWord() - Find the word with the highest corpus frequency in a list


In [2]:
# Check whether there is same letter in the word
def SameLetterInWord(word):
    """Return True when at least one letter occurs more than once in ``word``.

    A set keeps a single copy of every distinct letter, so it ends up smaller
    than the word exactly when some letter is repeated. An empty word has no
    repeats and yields False.
    """
    distinct_letters = set(word)
    return len(distinct_letters) != len(word)

In [3]:
# Word frequencies of the Webtext + ABC corpora, built lazily on first use.
# Building the distribution scans both corpora, so it is kept for later calls
# instead of being recomputed for every guess.
_CORPUS_WORD_FREQ = None


def _corpus_word_frequencies():
    """Return the frequency distribution of the corpora, building it once."""
    global _CORPUS_WORD_FREQ
    if _CORPUS_WORD_FREQ is None:
        _CORPUS_WORD_FREQ = nltk.FreqDist(webtext.words() + abc.words())
    return _CORPUS_WORD_FREQ


def FindHighestFreqWord(words_list):
    """Find the word in ``words_list`` that occurs most often in the corpora.

    Matching is exact (case-sensitive) against the corpus tokens. When several
    words share the highest frequency, the one that comes first in
    ``words_list`` wins.

    Args:
        words_list: iterable of candidate words.

    Returns:
        A ``(word, frequency)`` tuple, or ``('', 0)`` when none of the
        candidates occurs in the corpora.
    """
    frequencies = _corpus_word_frequencies()

    highestFreqWord = ''
    highestFreq = 0
    for word in words_list:
        # Direct lookup; a word missing from the corpora counts as 0.
        freq = frequencies.get(word, 0)
        # Strict comparison keeps the earliest candidate on ties.
        if freq > highestFreq:
            highestFreqWord = word
            highestFreq = freq

    return highestFreqWord, highestFreq

# Bot Main Body
Taken from a blog 'Solving Wordle puzzles with Python' (https://mathspp.com/blog/solving-wordle-with-python)

In [4]:
"""
Solving Wordle puzzles with Python.
See https://mathspp.com/blog/solving-wordle-with-python for an article on this.
"""

import collections
import enum
import random


@enum.unique
class Tip(enum.Enum):
    """Feedback for a single letter of a guess, as produced by ``score``."""

    ABSENT = 0  # Letter has no unused occurrence in the secret word.
    PRESENT = 1  # Letter occurs in the secret word, but at another position.
    CORRECT = 2  # Letter matches the secret word at this position.


def score(secret, guess):
    """Score ``guess`` against ``secret``, one ``Tip`` per compared position.

    Letters are never over-counted: with the secret "chess" and the guess
    "swiss", the two final "s" are CORRECT, which leaves no spare "s" in the
    secret, so the leading "s" of the guess is ABSENT rather than PRESENT.

    This is done by collecting the secret letters that were not matched
    exactly and consuming one of them each time a PRESENT tip is handed out.
    """

    pairs = list(zip(secret, guess))

    # Secret letters that are not matched in place; only these can justify
    # a PRESENT tip.
    unmatched = collections.Counter(
        secret_char for secret_char, guess_char in pairs if secret_char != guess_char
    )

    tips = []
    for secret_char, guess_char in pairs:
        if secret_char == guess_char:
            tip = Tip.CORRECT
        elif unmatched[guess_char] > 0:
            # A positive count already implies the letter is in the secret.
            tip = Tip.PRESENT
            unmatched[guess_char] -= 1
        else:
            tip = Tip.ABSENT
        tips.append(tip)

    return tips


def filter_words(words, guess, score):
    """Return the words that could still be the secret after ``guess`` got ``score``.

    A word is kept only when it agrees with every tip of the score; the
    relative order of ``words`` is preserved.
    """

    def is_consistent(candidate):
        # Letters of the candidate outside the CORRECT positions are the only
        # ones that may account for PRESENT tips.
        spare = collections.Counter(
            letter for letter, tip in zip(candidate, score) if tip != Tip.CORRECT
        )

        for cand_letter, guess_letter, tip in zip(candidate, guess, score):
            same_letter = cand_letter == guess_letter
            if tip == Tip.CORRECT:
                if not same_letter:
                    return False  # Candidate lacks the CORRECT letter here.
            elif same_letter:
                return False  # Equal letters would have been scored CORRECT.
            elif tip == Tip.PRESENT:
                if not spare[guess_letter]:
                    return False  # Candidate has no spare copy of the PRESENT letter.
                spare[guess_letter] -= 1
            elif tip == Tip.ABSENT and spare[guess_letter]:
                return False  # Candidate still holds a letter flagged ABSENT.
        return True

    return [word for word in words if is_consistent(word)]


def get_random_word(words):
    """Announce the candidate pool, then pick, announce and return a random word.

    Up to eight candidates are shown as a preview before the choice is made.
    """
    preview_size = 8

    print(f"I'll guess randomly from my pool of {len(words)} words...")

    preview = ", ".join(words[:preview_size])
    if len(words) > preview_size:
        suffix = ", among others..."
    else:
        suffix = "."
    print(f"I'm considering {preview}{suffix}")

    chosen = random.choice(words)
    print(f"Hmmm, I'll guess {chosen!r}...")
    return chosen


def get_user_choice_word(words, first_guess):
    """Show the candidate pool and return the next guess.

    On a follow-up guess the most frequent candidate (according to
    ``FindHighestFreqWord``) is looked up. With more than 10 candidates left
    that word is chosen automatically; with 10 or fewer it is only suggested.
    In every other case the user is prompted until they type a word that is
    in ``words``.

    Args:
        words: list of candidate words still in play.
        first_guess: True for the opening guess, where no frequency lookup
            is done and the user always chooses.

    Returns:
        The chosen word.
    """
    print(f"I'll guess randomly from my pool of {len(words)} words...")
    sample = ", ".join(words[:10])
    end = ", among others..." if len(words) > 10 else "."
    print(f"You have {sample}{end}")

    if not first_guess:
        highestFreqWord, highestFreq = FindHighestFreqWord(words)
        if highestFreqWord:
            if len(words) > 10:
                # Large pool: take the most frequent word without asking.
                print(f"Hmmm, I'll guess '{highestFreqWord}' with frequency of {highestFreq!r}...")
                return highestFreqWord
            print(f"Hmmm, the highest frequency word is '{highestFreqWord}' with frequency of {highestFreq!r}...")

    while True:
        # Strip as well as lower-case so stray whitespace does not cause a
        # valid word to be rejected.
        guess = input('Which word will you choose? (Please type out the word): ').strip().lower()
        if guess in words:
            return guess
        print("You have not key in the word in the recommended list. Please choose one word from the recommended list.")

def get_user_choice_word2(words, first_guess):
    """Show the candidate pool and let the user pick the next guess.

    On a follow-up guess the most frequent candidate (according to
    ``FindHighestFreqWord``) is printed as a suggestion, if there is one.
    The user is then prompted until they type a word that is in ``words``.

    Args:
        words: list of candidate words still in play.
        first_guess: True for the opening guess, where no suggestion is made.

    Returns:
        The word typed by the user.
    """
    print(f"I'll guess randomly from my pool of {len(words)} words...")
    sample = ", ".join(words[:10])
    end = ", among others..." if len(words) > 10 else "."
    print(f"You have {sample}{end}")

    if not first_guess:
        highestFreqWord, highestFreq = FindHighestFreqWord(words)
        if highestFreqWord:
            print(f"Hmmm, the highest frequency word is '{highestFreqWord}' with frequency of {highestFreq!r}...")

    while True:
        # Strip as well as lower-case so stray whitespace does not cause a
        # valid word to be rejected.
        guess = input('Which word will you choose? (Please type out the word): ').strip().lower()
        if guess in words:
            return guess
        print("You have not key in the word in the recommended list. Please choose one word from the recommended list.")


def play_against_computer(words):
    """Let the computer guess a secret word typed by the user.

    The computer repeatedly picks a random candidate, scores it against the
    secret and discards every candidate that contradicts the score, until at
    most one candidate is left.

    Args:
        words: list of all known words.

    Returns:
        The remaining candidates: a single word, or an empty list when no
        known word matches the secret.
    """
    print("Write your secret word:")
    # Normalise the input: stray whitespace would skew the length filter and
    # capitals would never match.
    # NOTE(review): assumes the word list is lower-case, as the other input
    # prompts in this file do - confirm against WORD.LST.
    secret = input(">>> ").strip().lower()

    words = [word for word in words if len(word) == len(secret)]
    while len(words) > 1:
        guess = get_random_word(words)
        sc = score(secret, guess)
        print(f"\tMy guess scored {sc}...")
        words = filter_words(words, guess, sc)
        print()

    return words


def play_with_computer(words):
    """Help the user solve a 5-letter Wordle puzzle interactively.

    Each round the user picks a guess from the remaining candidates and types
    the score the game returned for it, one digit per letter. Candidates that
    contradict the score are discarded, until at most one is left.

    Args:
        words: list of all known words; only those of length 5 are used.

    Returns:
        The remaining candidates: a single word, or an empty list when the
        entered guesses and scores rule out every known word.
    """
    length = 5
    words = [word for word in words if len(word) == length]

    mapping = {"0": Tip.ABSENT, "1": Tip.PRESENT, "2": Tip.CORRECT}
    print(f"\nNOTE: when typing scores, use {mapping}.\n")

    # Derived from bottom cells, see section "Find the highest frequencies letters and form possible start words"
    print("Ideal start words: aloes, arise, arles, arose, aster, earls, lares, laser, lears, raise, rales, rates, reals, serai, seral, stare, stoae, tares, tears, toeas\n")

    first_guess = True
    while len(words) > 1:
        guess = get_user_choice_word2(words, first_guess)
        first_guess = False
        print("How did this guess score?")

        while True:
            user_input = input(">>> ").strip()
            # A valid score has exactly one digit per letter and every digit
            # must be a known tip. Anything shorter, or containing other
            # digits, would otherwise give filter_words a truncated score.
            if len(user_input) == length and all(char in mapping for char in user_input):
                break
            print(f"\nPlease key in the scores, use {mapping}.\n")
            print("How did this guess score?")

        sc = [mapping[char] for char in user_input]
        words = filter_words(words, guess, sc)
        print()

    return words

# Mount and Link to Google Drive

Ensure "WORD.LST" is in the "My Drive/Wordle" folder.  This "WORD.LST" is taken from the blog author's GitHub repository (https://github.com/rodrigogiraoserrao/projects/blob/master/misc/WORD.LST)

In [5]:
from google.colab import drive
drive.mount('/content/GDrive')

Drive already mounted at /content/GDrive; to attempt to forcibly remount, call drive.mount("/content/GDrive", force_remount=True).


Copy the "WORD.LST" to current Colab working drive for easy access.

In [6]:
!cp /content/GDrive/My\ Drive/Wordle/WORD.LST .

# Running Wordle Bot

In [7]:
if __name__ == "__main__":

    WORD_LST = "WORD.LST"  # Point to a file with a word per line.

    # The word list does not change between retries, so it is loaded once
    # rather than re-read from disk on every restart of the game.
    with open(WORD_LST, "r") as f:
        all_words = [line.strip() for line in f]

    while True:
        # Only the "play with the computer" (helper) mode is offered here;
        # play_against_computer(all_words) is the alternative mode.
        words = play_with_computer(all_words)

        if words:
            break

        # No candidate survived: a guess or score was probably mistyped, so
        # start over with the full word list.
        print("I don't know any words that could solve the puzzle...")
        print("You may have key in the score or word wronly...\nRestarting the game")

    print(f"The secret word is {words[0]!r}!")
    print()
    # Kept at module level so the later "meaning of the word" cell can use it.
    wordle_word = words[0]


NOTE: when typing scores, use {'0': <Tip.ABSENT: 0>, '1': <Tip.PRESENT: 1>, '2': <Tip.CORRECT: 2>}.

Ideal start words: aloes, arise, arles, arose, aster, earls, lares, laser, lears, raise, rales, rates, reals, serai, seral, stare, stoae, tares, tears, toeas

I'll guess randomly from my pool of 8672 words...
You have aahed, aalii, aargh, abaca, abaci, aback, abaft, abaka, abamp, abase, among others...
Which word will you choose? (Please type out the word): arise
How did this guess score?
>>> 02100

I'll guess randomly from my pool of 16 words...
You have broil, bruin, bruit, croci, droid, droit, druid, fruit, groin, iroko, among others...
Hmmm, the highest frequency word is 'fruit' with frequency of 615...
Which word will you choose? (Please type out the word): fruit
How did this guess score?
>>> 02022

I'll guess randomly from my pool of 2 words...
You have droit, orbit.
Hmmm, the highest frequency word is 'orbit' with frequency of 77...
Which word will you choose? (Please type out t

# The meaning of Wordle Word

If you have solved the Wordle puzzle, please run this cell to find out the meaning of this word from www.vocabulary.com

In [8]:
from IPython.display import IFrame

# Build the vocabulary.com dictionary URL for the solved word, print it and
# embed the page in the notebook output.
# Embedding technique taken from https://stackoverflow.com/questions/51576756/display-render-an-html-file-inside-jupyter-notebook-on-google-colab-platform
url = f"https://www.vocabulary.com/dictionary/{wordle_word}"
print(url)
IFrame(src=url, width=900, height=600)

https://www.vocabulary.com/dictionary/orbit


# Find the highest frequencies letters and form possible start words

Below are some functions that help find the best start words for Wordle by finding:
1. The letters that have the highest frequency among the 5-letter words in 'WORD.LST':

* s = 4526 count
* e = 4475 count
* a = 3864 count
* o = 2891 count
* r = 2844 count
* i = 2552 count
* l = 2378 count
* t = 2271 count


2. The 5-letter words made up of the above letters.  Hence, possible start words:
* arose
* arise
* stare
* store
* lears
* slate
* riots






In [9]:
WORD_LST = "WORD.LST"  # Point to a file with a word per line.

# One zeroed counter per lowercase letter, in alphabetical order.
alphabet_count = dict.fromkeys("abcdefghijklmnopqrstuvwxyz", 0)

# Load the word list, one word per line, without surrounding whitespace.
with open(WORD_LST, "r") as f:
    words = [line.strip() for line in f]

# Tally every letter occurrence, looking at five-letter words only.
for each_word in words:
    if len(each_word) != 5:
        continue
    for letter in each_word:
        alphabet_count[letter] += 1

# Rank the letters from most to least frequent.
my_sort = sorted(alphabet_count.items(), key=lambda entry: entry[1], reverse=True)
print(my_sort)

[('s', 4526), ('e', 4475), ('a', 3864), ('o', 2891), ('r', 2844), ('i', 2552), ('l', 2378), ('t', 2271), ('n', 1968), ('d', 1685), ('u', 1637), ('c', 1449), ('p', 1361), ('y', 1352), ('m', 1299), ('h', 1178), ('g', 1067), ('b', 1039), ('k', 921), ('f', 770), ('w', 682), ('v', 466), ('z', 233), ('x', 203), ('j', 174), ('q', 75)]


In [10]:
# The eight most frequent letters, most frequent first.
top_letters = [my_sort[rank][0] for rank in range(8)]
required = set(top_letters[:3])   # a start word must contain all three of these
anchors = set(top_letters[3:5])   # ...and at least one of the 4th / 5th letters
optional = set(top_letters[3:8])  # its two remaining letters come from ranks 4-8

start_words = []
for each_word in words:
    letters = set(each_word)
    # Keep five-letter words whose letters are all different.
    if len(each_word) != 5 or len(letters) != 5:
        continue
    if not required <= letters:
        continue
    # With the three required letters present, exactly two letters remain.
    extras = letters - required
    if extras <= optional and extras & anchors:
        start_words.append(each_word)


print(start_words)


['aloes', 'arise', 'arles', 'arose', 'aster', 'earls', 'lares', 'laser', 'lears', 'raise', 'rales', 'rates', 'reals', 'serai', 'seral', 'stare', 'stoae', 'tares', 'tears', 'toeas']
