# Brute Force with Numpy

In [1]:
import json
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Load Data

In [3]:
# Raw JSON payloads; each file keeps its word list under the 'words' key.
with open('data/wordle-candidates.json', 'r') as file:
    candidates_raw = json.load(file)

with open('data/wordle-answers.json', 'r') as file:
    answers_raw = json.load(file)

wordle_candidates = pd.DataFrame(candidates_raw['words'], columns=['word'])
wordle_answers = pd.DataFrame(answers_raw['words'], columns=['word'])
wordle_candidates['is_answer'] = 0
wordle_answers['is_answer'] = 1

# Guess pool: candidates without a repeated letter, followed by every answer.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
has_unique_letters = wordle_candidates.word.apply(lambda x: len(x) == len(set(x)))
wordle = pd.concat([wordle_candidates.loc[has_unique_letters], wordle_answers]).reset_index(drop=True)

In [4]:
# Word-frequency table. Only three column names are supplied; reset_index()
# moves the index into a column called 'index', which is then renamed to 'word'.
words_all = (
    pd.read_table('data/archive/en_words_1_5-5.txt', delimiter=' ', header=None, index_col=None,
                  names=['word_len', 'word_freq', 'n_articles'])
    .reset_index()
    .rename(columns={'index': 'word'})
)

# Keep only words made up entirely of lowercase a-z
alphabet = list('abcdefghijklmnopqrstuvwxyz')
words_all = words_all.loc[
    lambda frame: frame.word.apply(lambda w: all(ch in alphabet for ch in w))
].reset_index(drop=True)

In [129]:
# Preview the filtered frequency table (columns: word, word_len, word_freq, n_articles)
words_all

Unnamed: 0,word,word_len,word_freq,n_articles
0,which,5,1220752,890394
1,first,5,1033698,751444
2,known,5,742591,654233
3,after,5,694687,537462
4,their,5,655785,443953
...,...,...,...,...
154840,showi,5,2,2
154841,gceap,5,2,1
154842,neroc,5,2,1
154843,hipep,5,2,1


## Prepare Artifacts

In [5]:
# Letter -> 0-based alphabet position (a=0 ... z=25)
alpha_dict = dict(zip('abcdefghijklmnopqrstuvwxyz', range(26)))

In [6]:
# One-hot encode each word as a (26, 5) int8 grid: cell [letter, position]
# is 1 when that letter sits at that position of the word.

# Answers only
solutions = np.zeros((len(wordle_answers), 26, 5), dtype='int8')
for row, answer in enumerate(wordle_answers.word):
    for pos, letter in enumerate(answer):
        solutions[row, alpha_dict[letter], pos] = 1

# Full guess pool
candidates = np.zeros((len(wordle), 26, 5), dtype='int8')
for row, candidate in enumerate(wordle.word):
    for pos, letter in enumerate(candidate):
        candidates[row, alpha_dict[letter], pos] = 1

## Game Logic

In [7]:
def get_feedback(input_word, solution):
    """Return the 5-character feedback string for a guess against a solution.

    For each of the first five positions:
      'G' - the guessed letter equals the solution letter at that position
      'Y' - otherwise, the guessed letter occurs anywhere in the solution
      'X' - the guessed letter does not occur in the solution

    Letter multiplicity is not considered: a repeated guess letter is marked
    'Y' whenever the solution contains that letter at all.
    """
    marks = []
    for pos in range(5):
        letter = input_word[pos]
        if letter == solution[pos]:
            marks.append('G')
        elif letter in solution:
            marks.append('Y')
        else:
            marks.append('X')
    return ''.join(marks)

In [8]:
def filter_wordset(input_word, feedback, wordset):
    """Return the rows of `wordset` that are consistent with one round of feedback.

    Parameters
    ----------
    input_word : str
        The guess; only its first five characters are read.
    feedback : str
        Five marks, one per position. 'G' keeps words with the same letter at
        that position; 'Y' keeps words that contain the letter but not at that
        position; any other mark keeps words that do not contain the letter.
    wordset : pandas.DataFrame
        Frame with a string column named `word`. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, original index labels preserved.
    """
    newset = wordset.copy()
    for i in range(5):
        letter = input_word[i]
        letter_here = newset.word.str[i]
        if feedback[i] == 'G':
            keep = letter_here == letter
        else:
            # regex=False: the guess letter is matched literally, so characters
            # such as '*' or '.' cannot be misread as a pattern.
            has_letter = newset.word.str.contains(letter, regex=False)
            if feedback[i] == 'Y':
                keep = has_letter & (letter_here != letter)
            else:
                keep = ~has_letter
        newset = newset.loc[keep]
    return newset

## Vector Ops

In [9]:
def init_vec(word):
    """One-hot encode `word` as a (26, 5) int8 array.

    Entry [alpha_dict[letter], position] is set to 1 for every letter of the
    word; all other entries stay 0.
    """
    grid = np.zeros((26, 5), dtype='int8')
    for pos, letter in enumerate(word):
        grid[alpha_dict[letter], pos] = 1
    return grid

In [10]:
def get_scores(word, solutions_masked):
    """Count green / yellow / grey cells of `word` against every solution.

    Parameters
    ----------
    word : str
        Guess to evaluate; encoded with init_vec.
    solutions_masked : numpy.ndarray
        One-hot solutions, indexed as (solution, letter, position) with 26
        letters on the second axis.

    Returns
    -------
    numpy.ndarray
        One row per solution, columns ordered (greens, yellows, greys).
    """
    word_vec = init_vec(word)
    n_solutions = solutions_masked.shape[0]

    # Copies of each letter in the guess, and in each solution
    guess_counts = word_vec.sum(axis=1)
    solution_counts = solutions_masked.sum(axis=2)

    # Green: guess and solution share the letter at the same position
    greens = solutions_masked * word_vec

    # A guessed letter qualifies when the solution holds at least as many
    # copies of it as the guess does; greens are then subtracted out.
    # NOTE(review): when the guess repeats a letter more often than the
    # solution has it, the qualifier is False and a green cell leaves -1
    # here (and +1 extra in greys) - confirm this is intended.
    qualifies = (solution_counts >= guess_counts) & (guess_counts > 0)
    yellows = word_vec * qualifies.reshape(n_solutions, 26, 1) - greens

    # Grey: whatever is left of the guess
    greys = word_vec - greens - yellows

    per_colour = [cells.sum(axis=(1, 2)) for cells in (greens, yellows, greys)]
    return np.array(per_colour).T

In [11]:
def get_final_scores(word, solutions_masked):
    """Average score of `word` over the given solutions.

    Each solution contributes 2 * greens + yellows (counts from get_scores);
    the mean of those values is returned.
    """
    counts = pd.DataFrame(get_scores(word, solutions_masked), columns=['g', 'y', 'x'])
    weighted = counts['g'] * 2 + counts['y']
    return weighted.mean()

### Filter Mask

In [13]:
def init_solution_mask():
    """Return an all-True boolean mask with one entry per row of wordle_answers.

    np.ones(..., dtype=bool) is used so the result is boolean even when the
    answer list is empty (np.array([]) would default to float64).
    """
    return np.ones(wordle_answers.shape[0], dtype=bool)

In [14]:
def update_solution_mask(input_word, feedback, wordset, mask):
    """Return a copy of `mask` with entries cleared for words that contradict the feedback.

    Parameters
    ----------
    input_word : str
        The guess; only its first five characters are read.
    feedback : str
        Five marks. 'G' rejects words lacking the letter at that position;
        'Y' rejects words that lack the letter or have it at that position;
        'X' rejects words containing the letter. Any other mark is ignored.
    wordset : pandas.DataFrame
        Frame with a string column `word`, row-aligned with `mask`.
    mask : numpy.ndarray
        Boolean array; not modified.

    Returns
    -------
    numpy.ndarray
        New boolean mask; entries only ever change from True to False.
    """
    newmask = mask.copy()
    words = wordset.word
    for i in range(5):
        letter = input_word[i]
        mark = feedback[i]
        if mark == 'G':
            rejected = ~words.str[i].eq(letter)
        elif mark == 'Y':
            # regex=False: match the guess letter literally, never as a pattern
            rejected = ~(words.str.contains(letter, regex=False) & words.str[i].ne(letter))
        elif mark == 'X':
            rejected = words.str.contains(letter, regex=False)
        else:
            continue
        newmask[rejected.to_numpy()] = False

    return newmask

## Global Scores

In [18]:
def compute_letter_frequencies(wordset):
    """Add one 0/1 indicator column per letter a-z and drop the first column.

    Column `<letter>` is 1 where `wordset.word` contains that letter, else 0.
    The input frame is left untouched; the result holds every column after the
    first (presumably `word` is the first column - verify against callers).
    """
    indicators = {
        letter: wordset.word.str.contains(letter).astype(int)
        for letter in 'abcdefghijklmnopqrstuvwxyz'
    }
    return wordset.assign(**indicators).iloc[:, 1:]

def compute_score(x, freqs):
    """Sum `freqs[letter]` over the distinct letters of `x` (0 for an empty word)."""
    return sum(freqs[letter] for letter in set(x))

In [19]:
# Number of pool words containing each letter (the dict also carries the
# summed is_answer column, which compute_score never looks up for a-z words)
global_freqs = compute_letter_frequencies(wordle).sum().to_dict()

# Score every pool word by its distinct letters, attach corpus frequencies
# (0 where the word is missing from words_all) and rank best-first.
global_scores = (
    pd.DataFrame({
        'word': wordle.word,
        'score': wordle.word.apply(compute_score, freqs=global_freqs),
    })
    .merge(words_all[['word', 'word_freq', 'n_articles']], how='left', on='word')
    .fillna(0)
    .sort_values(['score', 'word_freq'], ascending=False)
)

## App

In [91]:
def _prompt_five(prompt, allowed):
    """Ask until the reply is 'q'/'quit' or exactly five characters from `allowed`.

    Returns the stripped, lower-cased reply, or None when the user asked to quit.
    """
    while True:
        reply = input(prompt).strip().lower()
        if reply in ('quit', 'q'):
            return None
        if len(reply) == 5 and all(ch in allowed for ch in reply):
            return reply
        print(f'Expected 5 characters from "{allowed}" (or q to quit), got: {reply!r}')


def run_app(pre_load=None):
    """Interactive solver loop: read a guess and its feedback, then suggest next guesses.

    Parameters
    ----------
    pre_load : str, optional
        If given (and truthy), used as the guess for step 1 instead of
        prompting. It is stripped and lower-cased but not otherwise validated.

    Returns
    -------
    pandas.DataFrame
        The most recent results table. If the user quits before any scoring
        took place this is the two-row placeholder frame.

    Notes
    -----
    Typed guesses must be five letters a-z and typed feedback five characters
    from g/y/x (any case); anything else is re-prompted. Entering 'q' or
    'quit' at either prompt ends the session.
    """
    # Initialisation
    candidate_list = wordle.copy()
    solution_list = wordle_answers.copy()
    solution_mask = init_solution_mask()
    # Two-row placeholder so the loop body runs at least once
    res = pd.DataFrame([{'a': 1}, {'a': 1}])

    step = 1

    while res.shape[0] > 1:
        print(f'[ ---- STEP {step} ----]')

        # Get guess
        if step == 1 and pre_load:
            guess = pre_load.strip().lower()
            if guess in ('quit', 'q'):
                return res
        else:
            guess = _prompt_five('Input a guess:', 'abcdefghijklmnopqrstuvwxyz')
            if guess is None:
                return res

        # Get feedback (normalised to upper case, as the filters expect)
        fb = _prompt_five('Input feedback:', 'gyx')
        if fb is None:
            return res
        fb = fb.upper()

        # Update solution mask and list
        solution_mask = update_solution_mask(guess, fb, wordle_answers, solution_mask)
        solution_list = filter_wordset(guess, fb, solution_list)

        # Compute scores; the masked solution array is the same for every
        # candidate, so it is sliced once rather than once per word.
        print('Evaluating all candidates. Running analysis...')
        solutions_masked = solutions[solution_mask]
        new_scores = Parallel(n_jobs=5, verbose=1)(
            delayed(get_final_scores)(word, solutions_masked)
            for word in tqdm(candidate_list.word))

        # Generate results (word_freq is 0 for words missing from words_all)
        res = pd.DataFrame({'word': candidate_list.word, 'score': new_scores}) \
            .merge(words_all[['word', 'word_freq']], on='word', how='left') \
            .fillna(0)

        if solution_list.shape[0] > 0:
            print(f'Suggestions for step {step + 1}:')
            display(res.sort_values(['score', 'word_freq'], ascending=False).head(10))

        if solution_list.shape[0] <= 10:
            print(f'Small number of candidates remaining ({solution_list.shape[0]}). We recommend choosing the most popular option:')
            # Narrowing res here is what eventually ends the while loop
            res = res.loc[res.word.isin(solution_list.word)]
            display(res.sort_values(['word_freq', 'score'], ascending=False).head(5))

        # Check for repeats: remaining solutions that differ in one position only
        if 3 <= solution_list.shape[0] <= 8:
            w_copy = res.loc[res.word.isin(solution_list.word)].sort_values(['word_freq', 'score'], ascending=False).copy()
            # Extract letters
            for i in range(5):
                w_copy[f'p{i}'] = w_copy.word.str[i]

            # Positions (last five columns) where the remaining words differ
            unique_mask = w_copy.iloc[:, -5:].nunique() > 1

            # If only 1, then recommend another word
            if unique_mask.sum() == 1:
                wc = wordle.copy()
                total_letters = w_copy.shape[0]
                wc['scores'] = 0
                wc['counts'] = 0
                # Letters at the differing position, in w_copy's order (most
                # frequent word first); earlier letters carry a larger weight.
                for i, letter in enumerate(np.squeeze(w_copy[unique_mask.index[unique_mask]].values)):
                    wc['scores'] = wc['scores'] + (total_letters - i) * wc.word.str.contains(letter).astype(int)
                    wc['counts'] = wc['counts'] + wc.word.str.contains(letter).astype(int)

                print('\nWords with only one letter differential detected. Consider filtering:')
                # Threshold kept exactly as originally written
                display(wc.loc[wc.counts.le(total_letters // 2 * 3)].sort_values('scores', ascending=False).head(5))

        step += 1
        print()

    return res

## Run

In [128]:
# Start an interactive session with 'soare' pre-loaded as the first guess
curr_res = run_app('soare')

[ ---- STEP 1 ----]


Input feedback: xxxyy


Evaluating all candidates. Running analysis...


  0%|          | 0/9071 [00:00<?, ?it/s]

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done 150 tasks      | elapsed:    0.1s


Suggestions for step 2:


[Parallel(n_jobs=5)]: Done 9062 out of 9071 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=5)]: Done 9071 out of 9071 | elapsed:    2.2s finished


Unnamed: 0,word,score,word_freq
8123,cider,4.435897,1093.0
1017,citer,4.42735,10.0
7414,diner,4.418803,1125.0
5849,tiler,4.410256,37.0
3381,liter,4.401709,431.0
1450,dicer,4.401709,62.0
4016,niter,4.376068,49.0
7661,diver,4.358974,3058.0
6909,finer,4.358974,681.0
8872,filer,4.358974,90.0



[ ---- STEP 2 ----]


Input a guess: rider
Input feedback: yxxgg


Evaluating all candidates. Running analysis...


  0%|          | 0/9071 [00:00<?, ?it/s]

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done 150 tasks      | elapsed:    0.1s


Suggestions for step 3:


[Parallel(n_jobs=5)]: Done 9062 out of 9071 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=5)]: Done 9071 out of 9071 | elapsed:    1.8s finished


Unnamed: 0,word,score,word_freq
7790,tuber,5.208333,394.0
3511,luter,5.208333,45.0
1301,cuter,5.125,20.0
6051,tuyer,5.125,0.0
7850,buyer,5.083333,1679.0
7607,bluer,5.083333,43.0
1262,cuber,5.083333,26.0
6177,upter,5.083333,0.0
3901,muter,5.041667,31.0
6993,ulcer,5.0,438.0



[ ---- STEP 3 ----]


Input a guess: q


## Sims

In [99]:
def run_sim(input_word, solution):
    """Play one automated game: open with `input_word` and guess until one row is left.

    Parameters
    ----------
    input_word : str
        The guess used on the first iteration.
    solution : str
        The hidden word; feedback is produced with get_feedback(guess, solution).

    Returns
    -------
    tuple
        (input_word, solution, step, ncands, tested_words) where
        - step starts at 1 and is incremented after every iteration whose
          feedback was not 'GGGGG';
        - ncands lists the number of remaining solutions after each guess;
        - tested_words lists the guesses in the order they were made.
    """

    # Initialisation
    candidate_list = wordle.copy()
    solution_list = wordle_answers.copy()
    solution_mask = init_solution_mask()
    # Two-row placeholder so the while loop is entered at least once
    res = pd.DataFrame([{'a': 1}, {'a': 1}])
    
    step = 1
    tested_words = []
    all_chars = []
    ncands = []
    
    # The loop ends once `res` has been narrowed to at most one row; that
    # narrowing only happens in the small-solution-set branches below.
    while res.shape[0] > 1:
        # Get guess: the opener on step 1, otherwise the word chosen at the
        # end of the previous iteration
        if step == 1:
            guess = input_word
        tested_words.append(guess)
        # Distinct letters used so far (collected but not read again in this function)
        all_chars = all_chars + list(set(guess))
        all_chars = list(set(all_chars))
        
        # Get feedback
        fb = get_feedback(guess, solution)
        
        # Update solution mask and list
        solution_mask = update_solution_mask(guess, fb.upper(), wordle_answers, solution_mask)
        solution_list = filter_wordset(guess, fb.upper(), solution_list)
        ncands.append(solution_list.shape[0])
        # print(f'Step {step}: Guess: {guess} | Feedback: {fb} | Solutions: {solution_list.shape[0]}')
        # Compute scores for every word in the guess pool against the remaining solutions
        new_scores = Parallel(n_jobs=5, verbose=0)(delayed(get_final_scores)(word, solutions[solution_mask]) \
                                                   for word in candidate_list.word)
        
        # Generate results (word_freq is 0 for words missing from words_all)
        res = pd.DataFrame({'word': candidate_list.word, 'score': new_scores}) \
            .merge(words_all[['word', 'word_freq']], on='word', how='left') \
            .fillna(0)
        
        # Many solutions left: take the best-scoring word from the whole pool
        if solution_list.shape[0] > 10:     
            # res = res.loc[res.word.isin(solution_list.word)]
            guess = res.sort_values(['score', 'word_freq'], ascending=False).word.iloc[0]
        
        # For smaller solution sets
        elif solution_list.shape[0] <= 10:
            if solution_list.shape[0] <= 8 and solution_list.shape[0] >= 3:
                w_copy = res.loc[res.word.isin(solution_list.word)].sort_values(['word_freq', 'score'], ascending=False).copy()
                # Extract letters
                for i in range(5):
                    w_copy[f'p{i}'] = w_copy.word.str[i]

                # Positions (last five columns) where the remaining words differ
                unique_mask = w_copy.iloc[:, -5:].nunique() > 1

                # If only 1, then recommend another word
                if unique_mask.sum() == 1:
                    wc = wordle.copy()
                    total_letters = w_copy.shape[0]
                    wc['scores'] = 0
                    wc['counts'] = 0
                    # Letters at the differing position, in w_copy's order;
                    # earlier letters carry a larger weight
                    for i, letter in enumerate(np.squeeze(w_copy[unique_mask.index[unique_mask]].values)):
                        wc['scores'] = wc['scores'] + (total_letters - i) * wc.word.str.contains(letter).astype(int)
                        wc['counts'] = wc['counts'] + wc.word.str.contains(letter).astype(int)

                    # NOTE(review): `res` is not narrowed in this branch, and nothing
                    # excludes already-tested words or an empty special_res - confirm
                    # this cannot repeat a guess or raise on .iloc[0].
                    special_res = wc.loc[wc.counts.le(total_letters // 2 * 3)].sort_values('scores', ascending=False)
                    guess = special_res.word.iloc[0]
                else:
                    # Restrict to remaining solutions and pick the most frequent one
                    res = res.loc[res.word.isin(solution_list.word)]
                    guess = res.sort_values(['word_freq', 'score'], ascending=False).word.iloc[0]
            else:
                # Restrict to remaining solutions and pick the most frequent one
                res = res.loc[res.word.isin(solution_list.word)]
                guess = res.sort_values(['word_freq', 'score'], ascending=False).word.iloc[0]

        # A correct guess does not add a step; any other guess does, even when
        # the loop is about to exit with one solution left
        if not fb.upper() == 'GGGGG':
            step += 1
        
    return input_word, solution, step, ncands, tested_words

In [116]:
main_word = 'soare'
results = []
counter = 1

# Simulate one game per answer, always opening with main_word
for word in tqdm(wordle_answers.word):
    # Every 10th game, report running statistics over the games finished so far
    if not counter % 10:
        temp_df = pd.DataFrame(results, columns=['word', 'solution', 'steps', 'ncands', 'tested_words'])
        steps_so_far = temp_df.steps
        print(f'Game {counter}: {word} | Min: {steps_so_far.min()} / Median: {steps_so_far.median()} / Mean: {steps_so_far.mean()} / Max: {steps_so_far.max()}')

    results.append(run_sim(main_word, word))
    counter += 1

  0%|          | 0/2315 [00:00<?, ?it/s]

Game 10: serve | Min: 3 / Median: 4.0 / Mean: 4.0 / Max: 5
Game 20: feign | Min: 2 / Median: 4.0 / Mean: 3.6315789473684212 / Max: 5
Game 30: batty | Min: 2 / Median: 4.0 / Mean: 3.6206896551724137 / Max: 5
Game 40: outdo | Min: 2 / Median: 4.0 / Mean: 3.717948717948718 / Max: 5
Game 50: pound | Min: 2 / Median: 4.0 / Mean: 3.7346938775510203 / Max: 5
Game 60: ivory | Min: 2 / Median: 4.0 / Mean: 3.76271186440678 / Max: 5
Game 70: offal | Min: 2 / Median: 4.0 / Mean: 3.782608695652174 / Max: 6
Game 80: front | Min: 2 / Median: 4.0 / Mean: 3.7341772151898733 / Max: 6
Game 90: loopy | Min: 2 / Median: 4.0 / Mean: 3.730337078651685 / Max: 6
Game 100: moult | Min: 2 / Median: 4.0 / Mean: 3.717171717171717 / Max: 6
Game 110: guild | Min: 2 / Median: 4.0 / Mean: 3.7155963302752295 / Max: 6
Game 120: dutch | Min: 2 / Median: 4.0 / Mean: 3.7478991596638656 / Max: 6
Game 130: dozen | Min: 2 / Median: 4.0 / Mean: 3.744186046511628 / Max: 6
Game 140: blurt | Min: 2 / Median: 4.0 / Mean: 3.7985611

In [117]:
# Collect the per-game tuples returned by run_sim into a labelled table
df = pd.DataFrame(results, columns=['word', 'solution', 'steps', 'ncands', 'tested_words'])

In [118]:
# Distribution of steps per game: raw counts first, then shares
for as_share in (False, True):
    display(df.steps.value_counts(normalize=as_share))

4    942
3    873
5    323
2    113
6     56
7      7
8      1
Name: steps, dtype: int64

4    0.406911
3    0.377106
5    0.139525
2    0.048812
6    0.024190
7    0.003024
8    0.000432
Name: steps, dtype: float64

In [119]:
# Summary statistics of the steps column
df.steps.describe()

count    2315.000000
mean        3.723974
std         0.875731
min         2.000000
25%         3.000000
50%         4.000000
75%         4.000000
max         8.000000
Name: steps, dtype: float64

In [120]:
# Games that took seven or more steps
df.loc[df.steps >= 7]

Unnamed: 0,word,solution,steps,ncands,tested_words
310,soare,foyer,7,"[22, 8, 6, 5, 3, 2, 1]","[soare, vower, roger, boxer, poker, homer, foyer]"
384,soare,gawky,7,"[138, 16, 4, 3, 2, 1]","[soare, canty, badly, happy, jazzy, mammy]"
1376,soare,ember,7,"[117, 23, 8, 3, 2, 1]","[soare, cider, luter, never, hyper, freer]"
1553,soare,riper,7,"[117, 14, 6, 4, 2, 1]","[soare, cider, fiver, tiger, liner, piper]"
1870,soare,waver,7,"[61, 15, 6, 4, 2, 1]","[soare, later, pager, baker, racer, wafer]"
2015,soare,sappy,7,"[19, 5, 4, 3, 2, 1]","[soare, salty, sandy, savvy, sassy, saucy]"
2036,soare,corer,8,"[22, 8, 6, 5, 3, 2, 1]","[soare, vower, roger, boxer, poker, homer, foyer]"
2280,soare,vaunt,7,"[138, 10, 6, 3, 2, 1]","[soare, canty, paint, dight, jetty, taunt]"


In [121]:
# Persist the simulation results without the index column
# (assumes the results/ directory already exists - TODO confirm)
df.to_csv('results/bf_soare.csv', index=False)

In [None]:
counter = 1
test1 = []
main_word = 'soare'

# Simulate one game per answer, always opening with main_word
for word in tqdm(wordle_answers.word):
    if counter % 10 == 0:
        # Running statistics must come from this run's list (test1); the
        # earlier `results` list belongs to a different, already finished run.
        temp_df = pd.DataFrame(test1, columns=['word', 'solution', 'steps', 'ncands', 'tested_words'])
        print(f'Game {counter}: {word} | Min: {temp_df.steps.min()} / Median: {temp_df.steps.median()} / Mean: {temp_df.steps.mean()} / Max: {temp_df.steps.max()}')

    test1.append(run_sim(main_word, word))
    counter += 1