# Brute Force with Numpy

In [1]:
import json
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Load Data

In [2]:
with open('data/wordle-candidates.json', 'r') as file:
    wordle_candidates = json.load(file)
    
with open('data/wordle-answers.json', 'r') as file:
    wordle_answers = json.load(file)

wordle_candidates = pd.DataFrame(wordle_candidates['words'], columns=['word'])
wordle_answers = pd.DataFrame(wordle_answers['words'], columns=['word'])
wordle_candidates['is_answer'] = 0
wordle_answers['is_answer'] = 1
wordle = wordle_candidates.append(wordle_answers).reset_index(drop=True)

In [3]:
words_all = pd.read_table('data/archive/en_words_1_5-5.txt', delimiter=' ', header=None, index_col=None,
                         names=['word_len', 'word_freq', 'n_articles']).reset_index()
words_all = words_all.rename(columns={'index': 'word'})

# Filter by english
alphabet = list('abcdefghijklmnopqrstuvwxyz')
words_all = words_all.loc[words_all.word.apply(lambda x: all([l in alphabet for l in x]))].reset_index(drop=True)

## Prepare Artifacts

In [4]:
alpha_dict = {l: i for i, l in enumerate(list('abcdefghijklmnopqrstuvwxyz'))}

In [5]:
# Initialise solutions vector
solutions = np.zeros((wordle.shape[0], 26, 5))
for i, word in enumerate(wordle.word):
    for j, l in enumerate(word):
        solutions[i, alpha_dict[l], j] = 1

## Game Logic

In [6]:
def get_feedback(input_word, solution):
    output = ''
    for i in range(5):
        if input_word[i] == solution[i]:
            output += 'G'
        elif input_word[i] in solution:
            output += 'Y'
        else:
            output += 'X'
    return output

In [7]:
def filter_wordset(input_word, feedback, wordset):
    newset = wordset.copy()
    for i in range(5):
        if feedback[i] == 'G':
            newset = newset.loc[newset.word.str[i] == input_word[i]]
        elif feedback[i] == 'Y':
            # newset = newset.loc[newset.word.str.contains(input_word[i])]
            newset = newset.loc[newset.word.str.contains(input_word[i]) & newset.word.apply(lambda x: x[i] != input_word[i])]
        else:
            newset = newset.loc[~newset.word.str.contains(input_word[i])]
    return newset

## Vector Ops

In [8]:
def init_vec(word):
    mat = np.zeros((26, 5))
    for i, l in enumerate(word):
        mat[alpha_dict[l], i] = 1
    return mat

In [9]:
def get_scores(word, mask):
    scores = []
    word_vec = init_vec(word)
    for i in np.array(range(solutions.shape[0]))[mask]:
        solution = solutions[i]
        greens = solution * word_vec
        yellows = word_vec * ((solution.sum(axis=1) >= word_vec.sum(axis=1)) & (word_vec.sum(axis=1) > 0)).reshape(26, 1) - greens
        greys = word_vec - greens - yellows
        scores.append((np.sum(greens), np.sum(yellows), np.sum(greys)))
        
    return scores

## Global Scores

In [10]:
def compute_letter_frequencies(wordset):
    w = wordset.copy()
    for letter in list('abcdefghijklmnopqrstuvwxyz'):
        w[letter] = w.word.str.contains(letter).astype(int)
    return w.iloc[:, 1:]

def compute_score(x, freqs):
    letters = set(x)
    output = 0
    for letter in letters:
        output += freqs[letter]
    return output

In [11]:
global_freqs = compute_letter_frequencies(wordle).sum().to_dict()
global_scores = wordle.word.apply(compute_score, freqs=global_freqs)
global_scores = pd.DataFrame({'word': wordle.word, 'score': global_scores})
global_scores = global_scores.merge(words_all[['word', 'word_freq', 'n_articles']], how='left', left_on='word', right_on='word')
global_scores = global_scores.fillna(0).sort_values(['score', 'word_freq'], ascending=False)

## App

In [12]:
def get_final_scores(word, mask):
    scores = get_scores(word, mask)
    df_scores = pd.DataFrame(scores, columns=['g', 'y', 'x'])
    df_scores['score'] = df_scores.g * 2 + df_scores.y
    
    return df_scores.score.mean()

In [13]:
def update_mask(input_word, feedback, wordset, mask):
    newmask = mask.copy()
    for i in range(5):
        if feedback[i] == 'G':
            newmask[~wordset.word.str[i].eq(input_word[i])] = False
        elif feedback[i] == 'Y':
            newmask[~(wordset.word.str.contains(input_word[i]) & wordset.word.apply(lambda x: x[i] != input_word[i]))] = False
        elif feedback[i] == 'X':
            newmask[wordset.word.str.contains(input_word[i])] = False
            
    return newmask

In [22]:
def run_app(pre_load=None):
    mask = np.array([True] * wordle.shape[0])
    step = 1
    w = wordle.copy()
    res = pd.DataFrame([{'a': 1}, {'a': 1}])
    tested_words = []
    all_chars = []
    
    while res.shape[0] > 1:
        print(f'[ ---- STEP {step} ----]')
        if not (step == 1 and pre_load):
            guess = input('Input a guess:')
        else:
            guess = pre_load
        if guess.lower() in ['quit', 'q']:
            return w, mask, res
        tested_words.append(guess)
        all_chars = all_chars + list(set(guess))
        all_chars = list(set(all_chars))
        
        fb = input('Input feedback:')
        if fb.lower() in ['quit', 'q']:
            return w, mask, res
        
        mask = update_mask(guess, fb.upper(), wordle, mask)
        w = filter_wordset(guess, fb.upper(), w)
        
        # Candidates
        print(f'Found {w.shape[0]} candidates. Running analysis...')
        new_scores = Parallel(n_jobs=4, verbose=0)(delayed(get_final_scores)(word, mask) for word in tqdm(w.word))
        
        print(f'Suggestions for step {step + 1}:')
        res = pd.DataFrame({'word': w.word, 'score': new_scores}) \
            .merge(words_all[['word', 'word_freq']], on='word', how='left') \
            .fillna(0)
        display(res.sort_values(['score', 'word_freq'], ascending=False).head(10))
        
        # Filters
        # if w.shape[0] > 10:
        #     print(f'\nLarge number of candidates found ({w.shape[0]}). We recommend filtering the candidates more:')
        #     display(
        #         global_scores.loc[global_scores.word.apply(lambda x: len(set(all_chars).intersection(set(list(x)))) < 1)] \
        #             .head(5)
        #     )
        if w.shape[0] <= 10:
            print(f'Small number of candidates remaining ({w.shape[0]}). We recommend choosing the most popular option:')
            display(res.sort_values(['word_freq', 'score'], ascending=False).head(5))
        
        # Check for repeats
        if w.shape[0] <= 8 and w.shape[0] >= 3:
            w_copy = res.sort_values(['word_freq', 'score'], ascending=False).copy()
            # Extract letters
            for i in range(5):
                w_copy[f'p{i}'] = w_copy.word.str[i]
            
            # Count the number of unique columns
            unique_mask = w_copy.iloc[:, -5:].nunique() > 1
            
            # If only 1, then recommend another word
            if unique_mask.sum() == 1:
                wc = wordle.copy()
                total_letters = w_copy.shape[0]
                wc['scores'] = 0
                wc['counts'] = 0
                for i, letter in enumerate(np.squeeze(w_copy[unique_mask.index[unique_mask]].values)):
                    wc['scores'] = wc['scores'] + (total_letters - i) * wc.word.str.contains(letter).astype(int)
                    wc['counts'] = wc['counts'] + wc.word.str.contains(letter).astype(int)
                    
                print(f'\nWords with only one letter differential detected. Consider filtering:')
                display(wc.loc[wc.counts.le(total_letters // 2 * 3)].sort_values('scores', ascending=False).head(5))

                
        step += 1
        print()
        
    return w, mask, res

In [None]:
curr_wordset, curr_mask, curr_res = run_app('tales')

[ ---- STEP 1 ----]


Input feedback: xxyxx


Found 263 candidates. Running analysis...


  0%|          | 0/263 [00:00<?, ?it/s]

Suggestions for step 2:


Unnamed: 0,word,score,word_freq
125,noily,3.444867,0.0
152,roily,3.444867,0.0
33,doily,3.414449,13.0
104,loury,3.30038,21.0
190,glory,3.247148,6237.0
42,flory,3.190114,137.0
144,poilu,3.190114,13.0
56,globy,3.152091,2.0
258,cloud,3.106464,10068.0
4,bliny,3.106464,4.0



[ ---- STEP 2 ----]


Input a guess: noily
Input feedback: xyxyx


Found 32 candidates. Running analysis...


  0%|          | 0/32 [00:00<?, ?it/s]

Suggestions for step 3:


Unnamed: 0,word,score,word_freq
7,clour,5.15625,0.0
28,flour,5.03125,3776.0
23,floor,4.96875,23077.0
6,cloop,4.96875,5.0
22,flock,4.90625,1081.0
25,block,4.875,31744.0
31,cloud,4.875,10068.0
15,plouk,4.875,0.0
0,blook,4.84375,11.0
1,bloop,4.8125,24.0



[ ---- STEP 3 ----]


Input a guess: clour
Input feedback: gggxx


Found 5 candidates. Running analysis...


  0%|          | 0/5 [00:00<?, ?it/s]

Suggestions for step 4:


Unnamed: 0,word,score,word_freq
2,clomp,7.6,0.0
1,clomb,7.2,0.0
0,cloff,6.8,0.0
3,cloop,6.4,5.0
4,clock,6.0,7889.0


Small number of candidates remaining (5). We recommend choosing the most popular option:


Unnamed: 0,word,score,word_freq
4,clock,6.0,7889.0
3,cloop,6.4,5.0
2,clomp,7.6,0.0
1,clomb,7.2,0.0
0,cloff,6.8,0.0



[ ---- STEP 4 ----]
