# Brute Force with Numpy

In [1]:
import json
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Load Data

In [2]:
# Load the two Wordle word lists; each JSON file stores its list under the key 'words'.
with open('data/wordle-candidates.json', 'r') as file:
    wordle_candidates = json.load(file)

with open('data/wordle-answers.json', 'r') as file:
    wordle_answers = json.load(file)

wordle_candidates = pd.DataFrame(wordle_candidates['words'], columns=['word'])
wordle_answers = pd.DataFrame(wordle_answers['words'], columns=['word'])

# Flag which list each word came from before stacking them.
wordle_candidates['is_answer'] = 0
wordle_answers['is_answer'] = 1

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
wordle = pd.concat([wordle_candidates, wordle_answers]).reset_index(drop=True)

In [3]:
# Word statistics table. Only three column names are supplied; reset_index() then exposes
# the index as a column that is renamed to 'word' (presumably the file's first field
# holds the word itself -- confirm against the data file).
words_all = (
    pd.read_table('data/archive/en_words_1_5-5.txt', delimiter=' ', header=None, index_col=None,
                  names=['word_len', 'word_freq', 'n_articles'])
    .reset_index()
    .rename(columns={'index': 'word'})
)

# Keep only words spelled entirely with the lowercase letters a-z
alphabet = list('abcdefghijklmnopqrstuvwxyz')
words_all = words_all.loc[
    words_all.word.apply(lambda x: all(l in alphabet for l in x))
].reset_index(drop=True)

## Prepare Artifacts

In [4]:
# Letter -> row index lookup ('a' -> 0, ..., 'z' -> 25) used by the one-hot encodings.
alpha_dict = dict(zip('abcdefghijklmnopqrstuvwxyz', range(26)))

In [805]:
# Initialise solutions vector
# One-hot encoding of every word in `wordle`, one 26 x 5 grid per word:
# solutions[i, row, pos] is 1 when word i has the letter with alpha_dict index `row`
# at position `pos`, and 0 everywhere else.
solutions = np.zeros((wordle.shape[0], 26, 5), dtype='int8')
for i, word in enumerate(wordle.word):
    for j, l in enumerate(word):
        solutions[i, alpha_dict[l], j] = 1

## Game Logic

In [806]:
def get_feedback(input_word, solution):
    """Return the five-character colour string for a guess against a solution.

    Each of the first five positions is marked:
      'G' - the guess letter equals the solution letter at that position,
      'Y' - otherwise, the guess letter occurs anywhere in the solution,
      'X' - the guess letter does not occur in the solution.

    Note: occurrences are not counted, so a repeated guess letter is marked 'Y'
    at every non-matching position as long as the solution contains it at all.
    """
    def colour(pos):
        letter = input_word[pos]
        if letter == solution[pos]:
            return 'G'
        return 'Y' if letter in solution else 'X'

    return ''.join(colour(pos) for pos in range(5))

In [807]:
def filter_wordset(input_word, feedback, wordset):
    """Return the rows of `wordset` that are still consistent with a guess's feedback.

    Parameters
    ----------
    input_word : str
        The guessed word (first five characters are used).
    feedback : str
        Colour string, one character per position: 'G' green, 'Y' yellow,
        anything else is treated as grey.
    wordset : pandas.DataFrame
        Frame with a `word` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of `wordset` (original index preserved).

    Notes
    -----
    A grey mark normally removes every word containing the letter. When the same
    letter is also green/yellow elsewhere in the guess (real Wordle feedback for
    repeated letters), the grey mark instead means "not at this position, and no
    more copies than were coloured" -- previously such feedback wiped out every
    word containing the letter, including the true solution.
    Several yellow/green marks on one letter do not enforce a minimum count.
    """
    newset = wordset.copy()
    for i in range(5):
        letter = input_word[i]
        if feedback[i] == 'G':
            newset = newset.loc[newset.word.str[i] == letter]
        elif feedback[i] == 'Y':
            # Letter is present, but not at this position
            newset = newset.loc[
                newset.word.str.contains(letter, regex=False) & (newset.word.str[i] != letter)
            ]
        else:
            # How many times this letter was coloured green/yellow in the guess
            n_marked = sum(
                1 for j in range(5)
                if input_word[j] == letter and feedback[j] in ('G', 'Y')
            )
            if n_marked == 0:
                newset = newset.loc[~newset.word.str.contains(letter, regex=False)]
            else:
                newset = newset.loc[
                    (newset.word.str[i] != letter) & (newset.word.str.count(letter) <= n_marked)
                ]
    return newset

## Vector Ops

In [808]:
def init_vec(word):
    """One-hot encode `word` as a 26 x 5 int8 grid.

    Cell [row, pos] is 1 when the letter with alpha_dict index `row` sits at
    position `pos` of the word; every other cell is 0.
    """
    encoded = np.zeros((26, 5), dtype='int8')
    rows = np.array([alpha_dict[letter] for letter in word], dtype=np.intp)
    encoded[rows, np.arange(rows.size)] = 1
    return encoded

In [809]:
def get_scores(word, mask):
    """Count green / yellow / grey cells of `word` against every masked solution.

    `mask` selects rows of the global `solutions` tensor (np.sum(mask) must equal
    the number of selected rows, i.e. a boolean mask).

    Returns an array with one row per selected solution and three columns:
    number of greens, yellows and greys.
    """
    guess = init_vec(word)
    pool = solutions[mask]
    guess_counts = guess.sum(axis=1)  # copies of each letter in the guess

    # Green: guess and solution have the same letter in the same position
    greens = pool * guess

    # A guessed letter counts as matched when the solution holds at least as many
    # copies of it as the guess does; yellows are those cells minus the greens.
    letter_matched = (pool.sum(axis=2) >= guess_counts) & (guess_counts > 0)
    yellows = guess * letter_matched.reshape(np.sum(mask), 26, 1) - greens

    # Grey: whatever guess cells are neither green nor yellow
    greys = guess - greens - yellows

    per_colour = [np.sum(cells, axis=(1, 2)) for cells in (greens, yellows, greys)]
    return np.array(per_colour).T

In [810]:
def get_final_scores(word, mask):
    """Average score of `word` over the masked solutions.

    Each solution contributes 2 points per green and 1 point per yellow
    (greys contribute nothing); the mean over all masked solutions is returned.
    """
    colour_counts = pd.DataFrame(get_scores(word, mask), columns=['g', 'y', 'x'])
    weighted = colour_counts['g'] * 2 + colour_counts['y']
    return weighted.mean()

In [811]:
def get_n_cands(word, filter_mask, candidate_mask):
    """Count the masked solutions that are left unchanged by `filter_mask`.

    A solution survives when multiplying its grid by `filter_mask` changes none of
    its 130 (26 x 5) cells. `word` is accepted but not used.
    """
    pool = solutions[candidate_mask]
    cell_unchanged = (filter_mask * pool) == pool
    n_unchanged = np.sum(cell_unchanged, axis=(-2, -1))
    return np.sum(n_unchanged == 130)

### Filter Mask

In [812]:
def init_candidate_mask():
    """Return a boolean mask with one True entry per row of `wordle`.

    Built with np.ones(..., dtype=bool) so the result is a boolean array even when
    `wordle` is empty (np.array([]) would give float64) and no intermediate Python
    list is created.
    """
    return np.ones(wordle.shape[0], dtype=bool)

In [813]:
def init_filter_mask():
    """Return a fresh 26 x 5 float grid of ones (every letter allowed at every position)."""
    return np.full((26, 5), 1.0)

In [814]:
def update_filter_mask(input_word, feedback, mask):
    """Return a copy of the letter x position `mask` narrowed by one guess's feedback.

    Parameters
    ----------
    input_word : str
        The guessed word; each letter must be a key of `alpha_dict`.
    feedback : str
        Colour string ('G', 'Y', 'X' per position). Case-insensitive; characters
        other than G/Y/X leave the mask untouched at that position.
    mask : numpy.ndarray
        2-D grid indexed [alpha_dict row, position]; it is not modified.

    Returns
    -------
    numpy.ndarray
        Updated copy of `mask`.

    Notes
    -----
    A grey mark clears the letter's whole row only when that letter is not also
    green/yellow elsewhere in the guess. Otherwise just the single cell is cleared,
    so a green cell of the same letter is no longer wiped by a later grey mark.
    """
    row_idx = [alpha_dict[l] for l in input_word]
    feedback = feedback.upper()
    # Letters confirmed to be in the solution by at least one green/yellow mark
    present = {l for l, fb in zip(input_word, feedback) if fb in ('G', 'Y')}
    output = mask.copy()
    for i, (letter, fb, r) in enumerate(zip(input_word, feedback, row_idx)):
        # Green: only this letter is allowed in this position
        if fb == 'G':
            output[:, i] = 0
            output[r, i] = 1
        # Yellow: this letter is not allowed in this position
        elif fb == 'Y':
            output[r, i] = 0
        # Grey: letter is absent, unless it is coloured elsewhere in the guess
        elif fb == 'X':
            if letter in present:
                output[r, i] = 0
            else:
                output[r, :] = 0
    return output

In [815]:
def update_candidate_mask(input_word, feedback, wordset, mask):
    """Return a copy of boolean `mask` with words inconsistent with the feedback cleared.

    Parameters
    ----------
    input_word : str
        The guessed word (first five characters are used).
    feedback : str
        Colour string: 'G' green, 'Y' yellow, 'X' grey; other characters are ignored.
    wordset : pandas.DataFrame
        Frame with a `word` column, row-aligned with `mask`.
    mask : numpy.ndarray
        Boolean array, one entry per row of `wordset`; it is not modified.

    Returns
    -------
    numpy.ndarray
        Updated copy of `mask`.

    Notes
    -----
    A grey mark on a letter that is also green/yellow elsewhere in the guess only
    rules out words with that letter at this position or with more copies than
    were coloured. Previously it ruled out every word containing the letter,
    which discarded the true solution for real Wordle feedback on repeated letters.
    """
    newmask = mask.copy()
    words = wordset.word
    for i in range(5):
        letter = input_word[i]
        if feedback[i] not in ('G', 'Y', 'X'):
            continue
        at_pos = words.str[i].eq(letter).to_numpy()
        if feedback[i] == 'G':
            newmask[~at_pos] = False
            continue
        has_letter = words.str.contains(letter, regex=False).to_numpy()
        if feedback[i] == 'Y':
            # Must contain the letter, but somewhere other than position i
            newmask[~(has_letter & ~at_pos)] = False
        else:
            # How many times this letter was coloured green/yellow in the guess
            n_marked = sum(
                1 for j in range(5)
                if input_word[j] == letter and feedback[j] in ('G', 'Y')
            )
            if n_marked == 0:
                newmask[has_letter] = False
            else:
                too_many = words.str.count(letter).gt(n_marked).to_numpy()
                newmask[at_pos | too_many] = False

    return newmask

In [407]:
# Worked example: guess 'soare' against the solution 'humph'. The two words share no
# letters, so get_feedback returns all 'X' and update_filter_mask zeroes the rows of
# all five guessed letters in a fresh filter mask.
input_word = 'soare'
solution = 'humph'
fb = get_feedback(input_word, solution)
filter_mask = init_filter_mask()
filter_mask = update_filter_mask(input_word, fb, filter_mask)

In [408]:
# Start from the full candidate set and flatten each selected 26 x 5 solution grid
# into a single row of 130 values (one row per selected word).
candidate_mask = init_candidate_mask()
eval_mat = solutions[candidate_mask].copy()
eval_mat = eval_mat.reshape(np.sum(candidate_mask), 130)

In [409]:
get_n_cands(input_word, filter_mask, candidate_mask)

577

In [413]:
fm = filter_mask.ravel()

In [425]:
np.sum(eval_mat == fm, axis=1)

30

In [254]:
np.sum((np.sum((filter_mask * solutions[candidate_mask]) == solutions[candidate_mask], axis=(-2,-1)) == 130))

577

## Global Scores

In [128]:
def compute_letter_frequencies(wordset):
    """Add one 0/1 column per letter a-z telling whether each word contains it.

    Works on a copy of `wordset` (which must have a `word` column) and returns
    every column except the first one, i.e. any remaining original columns
    followed by the 26 letter-indicator columns.
    """
    indicators = {
        letter: wordset.word.str.contains(letter).astype(int)
        for letter in 'abcdefghijklmnopqrstuvwxyz'
    }
    return wordset.assign(**indicators).iloc[:, 1:]

def compute_score(x, freqs):
    """Sum `freqs[letter]` over the distinct letters of word `x`.

    Repeated letters are counted once; a letter missing from `freqs` raises KeyError.
    """
    return sum(freqs[letter] for letter in set(x))

In [129]:
# Number of words containing each letter (column sums of the 0/1 indicator table)
global_freqs = compute_letter_frequencies(wordle).sum().to_dict()

# Score every word by the frequencies of its distinct letters, attach the usage
# statistics from words_all (0 where a word is missing), and rank by score, then word_freq.
global_scores = (
    pd.DataFrame({
        'word': wordle.word,
        'score': wordle.word.apply(compute_score, freqs=global_freqs),
    })
    .merge(words_all[['word', 'word_freq', 'n_articles']], how='left', on='word')
    .fillna(0)
    .sort_values(['score', 'word_freq'], ascending=False)
)

## App

In [428]:
def run_app(pre_load=None):
    """Interactive Wordle helper: read guesses and feedback, print ranked suggestions.

    Each step reads a guess and its colour feedback from stdin, narrows the
    candidate set, scores every remaining candidate with get_final_scores and
    displays the best ones. The loop ends when at most one candidate is left or
    the user types 'quit' / 'q' at either prompt.

    Guess and feedback are normalised once (stripped; guess lowercased, feedback
    uppercased) and the same normalised feedback is handed to all three update
    functions. Previously update_filter_mask received the raw feedback, so
    lowercase feedback left the filter mask unchanged, and an uppercase guess
    raised KeyError in the alpha_dict lookup.

    Parameters
    ----------
    pre_load : str, optional
        Guess to use for step 1 instead of prompting for one.

    Returns
    -------
    tuple
        (w, candidate_mask, res): the remaining candidate rows, the boolean mask
        over `wordle`, and the last suggestion table.
    """
    candidate_mask = init_candidate_mask()
    filter_mask = init_filter_mask()
    step = 1
    w = wordle.copy()
    # Two-row placeholder so the loop condition holds on the first pass
    res = pd.DataFrame([{'a': 1}, {'a': 1}])

    while res.shape[0] > 1:
        print(f'[ ---- STEP {step} ----]')
        if not (step == 1 and pre_load):
            guess = input('Input a guess:')
        else:
            guess = pre_load
        guess = guess.strip().lower()
        if guess in ['quit', 'q']:
            return w, candidate_mask, res

        fb = input('Input feedback:').strip()
        if fb.lower() in ['quit', 'q']:
            return w, candidate_mask, res
        fb = fb.upper()

        # Update masks and candidate set
        candidate_mask = update_candidate_mask(guess, fb, wordle, candidate_mask)
        filter_mask = update_filter_mask(guess, fb, filter_mask)
        w = filter_wordset(guess, fb, w)

        # Candidates
        print(f'Found {w.shape[0]} candidates. Running analysis...')
        new_scores = []
        for word in tqdm(w.word):
            # Mean G/Y score of this word against every remaining candidate
            new_scores.append(get_final_scores(word, candidate_mask))

        print(f'Suggestions for step {step + 1} ({w.shape[0]}):')
        res = pd.DataFrame({'word': w.word, 'score': new_scores}) \
            .merge(words_all[['word', 'word_freq']], on='word', how='left') \
            .fillna(0)
        display(res.sort_values(['score', 'word_freq'], ascending=False).head(10))

        if w.shape[0] <= 10:
            print(f'Small number of candidates remaining ({w.shape[0]}). We recommend choosing the most popular option:')
            display(res.sort_values(['word_freq', 'score'], ascending=False).head(5))

        # Check for repeats: candidates that differ in exactly one position
        if w.shape[0] <= 8 and w.shape[0] >= 3:
            w_copy = res.sort_values(['word_freq', 'score'], ascending=False).copy()
            # Extract letters, one column per position
            for i in range(5):
                w_copy[f'p{i}'] = w_copy.word.str[i]

            # Positions whose letter is not the same across all candidates
            unique_mask = w_copy.iloc[:, -5:].nunique() > 1

            # If only 1, then recommend another word
            if unique_mask.sum() == 1:
                wc = wordle.copy()
                total_letters = w_copy.shape[0]
                wc['scores'] = 0
                wc['counts'] = 0
                # Letters found at the differing position, in candidate order
                # (most frequent first); earlier letters get a larger weight.
                for i, letter in enumerate(np.squeeze(w_copy[unique_mask.index[unique_mask]].values)):
                    wc['scores'] = wc['scores'] + (total_letters - i) * wc.word.str.contains(letter).astype(int)
                    wc['counts'] = wc['counts'] + wc.word.str.contains(letter).astype(int)

                print('\nWords with only one letter differential detected. Consider filtering:')
                # NOTE(review): the cap on `counts` looks like a heuristic -- confirm intent
                display(wc.loc[wc.counts.le(total_letters // 2 * 3)].sort_values('scores', ascending=False).head(5))

        step += 1
        print()

    return w, candidate_mask, res

## Run

In [None]:
curr_wordset, curr_mask, curr_res = run_app('abaka')

[ ---- STEP 1 ----]


Input feedback: xxxxx


Found 5942 candidates. Running analysis...


  0%|          | 0/5942 [00:00<?, ?it/s]

Suggestions for step 2 (5942):


Unnamed: 0,word,score,word_freq
4156,tores,2.950522,8.0
1950,lores,2.897173,89.0
3266,rotes,2.891114,63.0
3222,roles,2.881521,52091.0
3230,rones,2.874958,2.0
574,dores,2.87378,81.0
4121,toles,2.863514,50.0
3273,roues,2.859475,9.0
4134,tones,2.856951,2122.0
2868,pores,2.853921,681.0



[ ---- STEP 2 ----]


In [649]:
# All-True selector with one entry per encoded word (was hard-coded to 12972)
m = [True] * solutions.shape[0]

In [773]:
# Scratch: build, in one shot, the filter mask that the guess 'roate' would produce
# against every word in `solutions` (same green/yellow/grey arithmetic as get_scores,
# but without masking the solutions first).
word_vec = init_vec('roate')
solutions_masked = solutions
greens = solutions_masked * word_vec
yellows = word_vec * (
    (solutions_masked.sum(axis=2) >= word_vec.sum(axis=1)) & 
    (word_vec.sum(axis=1) > 0)) \
    .reshape(np.sum(m), 26, 1) - greens
greys = word_vec - greens - yellows

# Start from an all-ones 26 x 5 mask; the np.where calls below broadcast it against
# the per-solution arrays, giving one 26 x 5 mask per solution.
filter_mask = np.ones((26, 5), dtype='int8')

# Update greens: in any position that has a green, take the green column as the mask
filter_mask = np.where(np.expand_dims(greens.sum(axis=-2) == 1, axis=1), greens, filter_mask)

# Update yellows: for letters with exactly one yellow cell, clear that cell
filter_mask = np.where(np.expand_dims(yellows.sum(axis=-1)==1, -1), filter_mask - yellows, filter_mask)

# Update greys: for letters with exactly one grey cell, clear the whole letter row
filter_mask = np.where(np.expand_dims(greys.sum(axis=-1)==1, -1), 0, filter_mask)

In [784]:
def compute_cands(mask):
    """Count the words in `solutions` left unchanged by a flattened 130-value mask.

    The mask is reshaped to 26 x 5; a word survives when multiplying its grid by
    the mask changes none of its 130 cells.
    """
    grid = mask.reshape(26, 5)
    survives = np.sum((grid * solutions) == solutions, axis=(-2, -1)) == 130
    return np.sum(survives)

In [779]:
# Flatten every 26 x 5 grid to a row of 130 values. The row count comes from
# `solutions` instead of the previously hard-coded 12972, so the cell keeps working
# if the word list changes.
solutions_reshaped = solutions.reshape(solutions.shape[0], 130)
filter_mask_reshaped = filter_mask.reshape(solutions.shape[0], 130)
# For each per-solution mask, count how many words it leaves as candidates
ncands = np.apply_along_axis(compute_cands, 1, filter_mask_reshaped)

In [786]:
get_feedback('roate', 'rebut')

'GXXYY'

In [799]:
m1 = filter_mask_reshaped[10658]

In [800]:
m1 = m1.reshape(26,5)

In [798]:
wordle.iloc[10658]

word         rebut
is_answer        1
Name: 10658, dtype: object

In [801]:
np.sum(np.sum(m1 * solutions == solutions, axis=(-2,-1)) == 130)

196

In [None]:
# NOTE(review): `t0` is not defined in any visible cell -- presumably the candidate
# frame selected by the 'roate' vs 'rebut' filter mask; this cell fails on a fresh kernel.

# Yellow check: 'e' and 't' should be present, and 'e' should not be the last letter
print('[Y] No e:', t0.word.apply(lambda x: x.count('e') == 0).sum())
print('[Y] No t:', t0.word.apply(lambda x: x.count('t') == 0).sum())
print('[Y] Last letter is e:', t0.word.apply(lambda x: x[4] == 'e').sum())

# Green check: every word should start with 'r'
print('[G] Starts with r:', t0.word.apply(lambda x: x[0] == 'r').sum())

# Grey check: no word should contain 'o' or 'a'
print('[X] Has o:', t0.word.str.contains('o').sum())
print('[X] Has a:', t0.word.str.contains('a').sum())

[Y] No e: 54
[Y] No t: 171
[Y] Last letter is e: 0
[G] Starts with r: 196
[X] Has o: 0
[X] Has a: 0


In [734]:
t1

Unnamed: 0,word,is_answer
7431,rebit,0
7438,recit,0
7469,reest,0
7494,reist,0
7501,relet,0
7503,relit,0
7508,remet,0
7539,resit,0
7548,retem,0
7552,revet,0


In [772]:
t1 = filter_wordset('roate', get_feedback('roate', 'rebut'), wordle)

Letter: r - Result: Green
Rebec: 1
Letter: o - Result: Grey
Rebec: 1
Letter: a - Result: Grey
Rebec: 1
Letter: t - Result: Yellow
       word  is_answer     y1    y2
7429  rebec          0  False  True
Rebec: 0
Letter: e - Result: Yellow
Empty DataFrame
Columns: [word, is_answer, y1, y2]
Index: []
Rebec: 0


In [767]:
t0.loc[~t0.word.isin(t1.word) & t0.word.apply(lambda x: x.count('e') > 0)].sort_values('word')

Unnamed: 0,word,is_answer
7429,rebec,0
11933,rebel,1
7430,rebid,0
10853,rebus,1
7433,rebuy,0
...,...,...
7787,runes,0
7797,ruses,0
7808,ryked,0
7809,rykes,0


In [695]:
t1

Unnamed: 0,word,is_answer
7431,rebit,0
7438,recit,0
7469,reest,0
7494,reist,0
7501,relet,0
7503,relit,0
7508,remet,0
7539,resit,0
7548,retem,0
7552,revet,0


In [771]:
def filter_wordset(input_word, feedback, wordset):
    """Debugging variant of filter_wordset that traces the word 'rebec'.

    Applies the same per-position green / yellow / grey filtering to a copy of
    `wordset`, and after every position prints the letter, its colour and how many
    remaining words contain 'rebec'. For yellow positions it also prints the two
    yellow conditions evaluated for 'rebec'.

    NOTE: this shares its name with the plain filter_wordset defined in another
    cell; whichever cell ran last is the active definition.
    """
    newset = wordset.copy()
    for pos in range(5):
        letter = input_word[pos]
        print(f'Letter: {letter} - Result: ', end='', flush=True)
        colour = feedback[pos]
        if colour == 'G':
            print('Green')
            newset = newset.loc[newset.word.str[pos] == letter]
        elif colour == 'Y':
            print('Yellow')
            # Show both yellow conditions for the traced word before filtering
            probe = newset.copy()
            probe['y1'] = probe.word.str.contains(letter)
            probe['y2'] = probe.word.apply(lambda x: x[pos] != letter)
            print(probe.loc[probe.word.eq('rebec')])
            keep = newset.word.str.contains(letter) & newset.word.apply(lambda x: x[pos] != letter)
            newset = newset.loc[keep]
        else:
            print('Grey')
            newset = newset.loc[~newset.word.str.contains(letter)]
        print(f'Rebec: {newset.word.str.contains("rebec").sum()}')
    return newset

In [699]:
wordle.loc[10658]

word         rebut
is_answer        1
Name: 10658, dtype: object

In [616]:
pd.Series(t1).describe()

count    12972.000000
mean      1530.202128
std       1221.969436
min          1.000000
25%        544.000000
50%       1218.000000
75%       2055.000000
max       7666.000000
dtype: float64

In [647]:
wordle.loc[wordle.word.eq('rebut')]

Unnamed: 0,word,is_answer
10658,rebut,1


In [509]:
wordle.head()

Unnamed: 0,word,is_answer
0,aahed,0
1,aalii,0
2,aargh,0
3,aarti,0
4,abaca,0


In [518]:
(filter_mask[7654] * solutions)[7654] == solutions[7654]

array([[ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True],
       [ True,  True,  True,  Tr