In [70]:
import json
import string
import math
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict
from collections import Counter, defaultdict


def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale every column of ``df``.

    Each column is shifted by its own minimum and divided by its own range
    (maximum minus minimum), so the smallest value in a column maps to 0 and
    the largest to 1. The input frame is not modified.
    """
    column_min = df.min()
    column_range = df.max() - column_min
    return (df - column_min) / column_range

def sort(df: pd.DataFrame, column: str, reverse=True) -> pd.DataFrame:
    """Return ``df`` ordered by ``column``.

    With the default ``reverse=True`` the largest values come first; pass
    ``reverse=False`` for ascending order. A new frame is returned.
    """
    ascending = not reverse
    return df.sort_values(by=[column], ascending=ascending)


class WordProcessor:
    """Letter statistics and guess-feedback helpers for a word-guessing game.

    The ``score_letter_*`` methods build lookup tables from word lists and
    store them on the instance; ``frequency``, ``position`` and ``entropy``
    then score individual words against those tables.
    """

    def __init__(self, alphabet=string.ascii_lowercase):
        """Store the alphabet and start with empty score tables.

        Args:
            alphabet: iterable of letters that get an explicit zero entry in
                every position table.
        """
        self.alphabet = alphabet
        # Defined up front so frequency()/position() score 0 instead of raising
        # AttributeError when called before the score_letter_* methods.
        self.frequencies = Counter()
        self.position_scores = {}

    def score_letter_frequencies(self, answer_list: List[str], word_list: List[str]) -> Counter:
        """Count how often each letter occurs across ``word_list``.

        Args:
            answer_list: not used by this method; kept so existing callers
                that pass it keep working.
            word_list: words whose letters are counted (repeats included).

        Returns:
            The new ``Counter``, also stored as ``self.frequencies``.
        """
        # One Counter updated in place instead of summing a Counter per word.
        # A fresh object is bound on every call, so references taken to an
        # earlier table are never mutated.
        totals = Counter()
        for word in word_list:
            totals.update(word)
        self.frequencies = totals
        return self.frequencies

    def score_letter_positions(self, answer_list: List[str]) -> Dict[int, Dict[str, int]]:
        """Count how often each letter occurs at each position of the answers.

        Args:
            answer_list: candidate answers; the first word sets how many
                position tables are pre-filled with zeros for the alphabet.

        Returns:
            ``{position: {letter: count}}``, also stored as
            ``self.position_scores``. An empty ``answer_list`` yields ``{}``.
        """
        word_length = len(answer_list[0]) if answer_list else 0
        scores = {i: dict.fromkeys(self.alphabet, 0) for i in range(word_length)}
        for word in answer_list:
            for i, letter in enumerate(word):
                # setdefault/get tolerate letters outside the alphabet and
                # words longer than the first one instead of raising KeyError.
                column = scores.setdefault(i, {})
                column[letter] = column.get(letter, 0) + 1
        self.position_scores = scores
        return self.position_scores

    def frequency(self, word: str, reduce_function=sum) -> int:
        """Combine the stored frequencies of the distinct letters in ``word``.

        Repeated letters count once (the word is reduced to a set first).
        ``reduce_function`` receives the per-letter counts; the default sums them.
        """
        return reduce_function(self.frequencies[letter] for letter in set(word))

    def position(self, word: str, reduce_function=sum) -> int:
        """Combine the stored per-position counts for each letter of ``word``.

        A position or letter with no stored count contributes 0, matching how
        ``frequency`` treats unseen letters.
        """
        return reduce_function(
            self.position_scores.get(i, {}).get(letter, 0)
            for i, letter in enumerate(word)
        )

    def entropy(self, word: str, word_list: List[str]) -> float:
        """Score how finely guessing ``word`` would split ``word_list``.

        Every candidate in ``word_list`` is bucketed by the feedback pattern
        ``compare_words(word, candidate)`` produces. With bucket sizes ``n``
        the result is ``sum(n * log2(n)) / sum(n)``: the average log2 size of
        the bucket a candidate lands in. Smaller values therefore mean the
        guess separates the candidates better.

        Returns:
            The score as a float; 0.0 when ``word_list`` is empty (there is
            nothing left to distinguish, and this avoids a 0/0 division).
        """
        if len(word_list) == 0:
            return 0.0
        # One cell per possible feedback pattern: 3 outcomes per letter.
        arr = np.zeros((3,) * len(word))
        for other in word_list:
            arr[tuple(self.compare_words(word, other))] += 1
        arr = arr[arr != 0]  # drop patterns that never occur (log2(0) is undefined)
        return float(np.sum(arr * np.log2(arr)) / np.sum(arr))

    def score_words(self, words: List[str], scoring_function, *args) -> Dict[str, int]:
        """Map each word to ``scoring_function(word, *args)``."""
        return {word: scoring_function(word, *args) for word in words}

    def compare_words(self, word1: str, word2: str) -> List[int]:
        """Return the per-letter feedback for guessing ``word1`` against ``word2``.

        Each entry is 2 when the letter matches ``word2`` at the same position,
        1 when the letter occurs elsewhere in ``word2`` and that occurrence has
        not already been claimed by another letter of the guess, and 0
        otherwise.

        ``word2`` is indexed by the positions of ``word1``, so it must be at
        least as long; the words are presumably always the same length.
        """
        counts = Counter(word2)
        indexes = [0 for _ in range(len(word1))]
        # Pass 1: exact matches claim their letter first, so a later misplaced
        # copy of the same letter cannot take it.
        for i, letter in enumerate(word1):
            if word2[i] == letter:
                indexes[i] = 2
                counts[letter] -= 1
        # Pass 2: remaining letters are "present elsewhere" only while unclaimed
        # copies of that letter are left in word2.
        for i, letter in enumerate(word1):
            if counts[letter] > 0 and not indexes[i]:
                indexes[i] = 1
                counts[letter] -= 1

        return indexes


class Wordle:
    """Tracks the remaining candidate words of a game and their scores.

    ``self.data`` is a DataFrame indexed by guess word with the columns
    ``"answer?"`` (1 if the word is also a possible answer, else 0) and the
    ``"entropy"``, ``"position"`` and ``"frequency"`` scores produced by the
    word processor.
    """

    def __init__(self, answer_list: List[str], word_list: List[str], word_processor=None):
        """Score every word in ``word_list`` and snapshot the result.

        Args:
            answer_list: words that may be the hidden answer.
            word_list: every word that may be guessed.
            word_processor: object providing the scoring and comparison
                methods; a new ``WordProcessor`` is created when omitted.
        """
        self.answer_list = answer_list
        self.word_list = word_list
        # Create the default per instance: a default evaluated in the signature
        # would be one object shared (with its score tables) by every game.
        self.processor = WordProcessor() if word_processor is None else word_processor

        self.data = pd.DataFrame(index=self.word_list, columns=["answer?", "entropy", "position", "frequency"])

        self.calculate_scores(self.word_list)
        self.cache_data()

    def calculate_scores(self, words: List[str]):
        """Rebuild the processor's letter tables and refill every column of ``self.data``.

        Columns are assigned from Series keyed by word, so index rows that are
        not among the scored words end up as NaN.
        """
        answer_column = {word: 0 for word in self.word_list}
        for word in self.answer_list:
            answer_column[word] = 1

        self.processor.score_letter_frequencies(self.answer_list, self.word_list)
        self.processor.score_letter_positions(self.answer_list)

        self.data["answer?"] = pd.Series(answer_column)
        self.data["frequency"] = pd.Series(self.processor.score_words(words, self.processor.frequency))
        self.data["position"] = pd.Series(self.processor.score_words(words, self.processor.position))
        self.data["entropy"] = pd.Series(self.processor.score_words(words, self.processor.entropy, self.answer_list))

    def filter_list(self, words: List[str], guess: str, results: list) -> List[str]:
        """Keep the words that would have produced ``results`` for ``guess``."""
        # Normalise once so a tuple of results compares equal to the list that
        # compare_words returns instead of silently matching nothing.
        expected = list(results)
        return [word for word in words if self.processor.compare_words(guess, word) == expected]

    def filter_words(self, guess: str, results: list):
        """Apply ``filter_list`` to both the answer list and the guess list."""
        self.answer_list = self.filter_list(self.answer_list, guess, results)
        self.word_list = self.filter_list(self.word_list, guess, results)

    def next_round(self, guess: str, results: list):
        """Record one guess and its feedback, then re-score the surviving words."""
        self.filter_words(guess, results)
        self.calculate_scores(self.word_list)
        # Rows for eliminated words are NaN after re-scoring; keep only rows
        # that still have at least three filled columns.
        self.data = self.data.dropna(thresh=3)

    def cache_data(self):
        """Snapshot the current scores and word lists for ``reset``."""
        self.data_cache = self.data.copy()
        # The processor binds new table objects on every scoring pass, so
        # holding references to the current ones is enough here.
        self.frequency_cache = self.processor.frequencies
        self.position_cache = self.processor.position_scores
        self.word_list_cache = list(self.word_list)
        self.answer_list_cache = list(self.answer_list)

    def reset(self):
        """Restore the state captured by the last ``cache_data`` call."""
        # Hand out a copy: calculate_scores assigns columns of self.data in
        # place, which would otherwise write NaNs into the cached frame and
        # corrupt every later reset.
        self.data = self.data_cache.copy()
        self.processor.frequencies = self.frequency_cache
        self.processor.position_scores = self.position_cache
        self.word_list = list(self.word_list_cache)
        self.answer_list = list(self.answer_list_cache)
    


In [71]:
# Load the word lists; the "wordList1" and "wordList2" keys of the JSON are read below.
with open("wordList.json") as file:
    words = json.load(file)

# "wordList1" is used as the answer list; the guess list is wordList1 followed by wordList2.
# This takes a long time: the constructor scores every guess word, and the
# entropy score compares each guess against every answer.
wordle = Wordle(answer_list=words["wordList1"], word_list=[*words["wordList1"], *words["wordList2"]])

# Snapshot the scores so wordle.reset() can restore them without recalculating entropy.
# NOTE(review): Wordle.__init__ already ends with cache_data(), so this call
# re-snapshots the same state and looks redundant.
wordle.cache_data()

       answer? entropy position frequency
cigar        1     NaN      NaN       NaN
rebut        1     NaN      NaN       NaN
sissy        1     NaN      NaN       NaN
humph        1     NaN      NaN       NaN
awake        1     NaN      NaN       NaN
...        ...     ...      ...       ...
zuzim        0     NaN      NaN       NaN
zygal        0     NaN      NaN       NaN
zygon        0     NaN      NaN       NaN
zymes        0     NaN      NaN       NaN
zymic        0     NaN      NaN       NaN

[12972 rows x 4 columns]
       answer? entropy position  frequency
cigar        1     NaN      NaN      17579
rebut        1     NaN      NaN      18253
sissy        1     NaN      NaN      12498
humph        1     NaN      NaN       8266
awake        1     NaN      NaN      15196
...        ...     ...      ...        ...
zuzim        0     NaN      NaN       8680
zygal        0     NaN      NaN      13513
zygon        0     NaN      NaN      11542
zymes        0     NaN      NaN      178

In [80]:
def normalize_data():
    """Return ``wordle.data`` min-max scaled, with the entropy column flipped.

    After scaling, the entropy column is replaced by its absolute distance
    from 1, so the word with the lowest raw entropy score ends up at 1.0 and
    the word with the highest at 0.0.
    """
    scaled = normalize(wordle.data)
    scaled["entropy"] = (scaled["entropy"] - 1).abs()
    return scaled


print(sort(normalize_data(), "entropy"))

       answer?   entropy  position  frequency
soare      0.0  1.000000  0.966040   1.000000
roate      0.0  0.999204  0.768064   0.848287
raise      1.0  0.997984  0.779624   0.969432
raile      0.0  0.994930  0.773121   0.821141
reast      0.0  0.994867  0.640896   0.948544
...        ...       ...       ...        ...
yukky      0.0  0.078473  0.312139   0.017557
xylyl      0.0  0.075152  0.074422   0.001486
immix      0.0  0.040250  0.078035   0.014541
jujus      0.0  0.036672  0.098266   0.169585
qajaq      0.0  0.000000  0.218208   0.031198

[12972 rows x 4 columns]


In [85]:
# Record the feedback for the guess "soare": 0 in every position, the value
# compare_words leaves for a letter that matched nothing in the answer.
# The remaining words are then filtered and re-scored.
wordle.next_round("soare", [0, 0, 0, 0, 0])


       answer?   entropy  position  frequency
cigar      NaN  6.195001       842      17579
rebut      NaN  6.224752       739      18253
sissy      NaN  8.004482      1183      12498
humph      1.0  8.075660       505       8266
awake      NaN  7.202027       971      15196
...        ...       ...       ...        ...
zuzim      0.0  8.465882       400       8680
zygal      NaN  7.430702       412      13513
zygon      NaN  7.677150       355      11542
zymes      NaN  7.440347       441      17811
zymic      0.0  7.911750       276      10271

[12972 rows x 4 columns]
       answer?   entropy  position  frequency
cigar      NaN  6.195001       842        NaN
rebut      NaN  6.224752       739        NaN
sissy      NaN  8.004482      1183        NaN
humph      1.0  8.075660       505      706.0
awake      NaN  7.202027       971        NaN
...        ...       ...       ...        ...
zuzim      0.0  8.465882       400      894.0
zygal      NaN  7.430702       412        NaN
zygon   

In [None]:
# NOTE(review): this cell repeats the next_round call from the cell above.
# Filtering the already-filtered lists by the same guess and result keeps the
# same words, so it only re-runs the scoring — confirm whether it is still needed.
wordle.next_round("soare", [0, 0, 0, 0, 0])

In [87]:
# NOTE(review): print_data is not defined in any cell shown here (an earlier
# cell defines normalize_data instead) — confirm it exists on a fresh kernel.
print_data()

       answer?   entropy  position  frequency
clint      0.0  1.000000  0.612121   0.801289
clipt      0.0  0.954221  0.539394   0.726101
linch      0.0  0.937642  0.745455   0.742213
mulct      0.0  0.933068  0.618182   0.544576
pilch      0.0  0.924987  0.745455   0.667025
...        ...       ...       ...        ...
vivid      1.0  0.106355  0.448485   0.079484
kudzu      0.0  0.092460  0.230303   0.021482
whizz      0.0  0.091586  0.248485   0.177229
xylyl      0.0  0.005379  0.030303   0.020408
immix      0.0  0.000000  0.109091   0.078410

[577 rows x 4 columns]
