# Brute Force App

In [1]:
import json
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Load Data

In [2]:
# Read the two word lists. Each JSON file is expected to keep its words under
# the top-level "words" key.
with open('data/wordle-candidates.json', 'r') as file:
    candidates_payload = json.load(file)

with open('data/wordle-answers.json', 'r') as file:
    answers_payload = json.load(file)

# `is_answer` records which file a row came from: 0 = candidates, 1 = answers.
wordle_candidates = pd.DataFrame(candidates_payload['words'], columns=['word'])
wordle_answers = pd.DataFrame(answers_payload['words'], columns=['word'])
wordle_candidates['is_answer'] = 0
wordle_answers['is_answer'] = 1

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same stacked
# frame (candidates first, then answers) on both old and new pandas versions.
wordle = pd.concat([wordle_candidates, wordle_answers]).reset_index(drop=True)

In [3]:
# Word-frequency table. Only three column names are supplied; the word itself is
# recovered from the index via reset_index() (presumably the file carries the word
# as an extra leading field, which pandas then uses as the index -- verify against
# the data file).
words_all = (
    pd.read_table(
        'data/archive/en_words_1_5-5.txt',
        delimiter=' ',
        header=None,
        index_col=None,
        names=['word_len', 'word_freq', 'n_articles'],
    )
    .reset_index()
    .rename(columns={'index': 'word'})
)

# Keep only words spelled entirely with the 26 lowercase English letters
alphabet = list('abcdefghijklmnopqrstuvwxyz')
words_all = words_all.loc[
    words_all.word.apply(lambda w: all(ch in alphabet for ch in w))
].reset_index(drop=True)

## Prepare Artifacts

In [4]:
# Letter -> row index (a=0 ... z=25) used by the one-hot letter/position matrices
alpha_dict = dict(zip('abcdefghijklmnopqrstuvwxyz', range(26)))

In [5]:
# One-hot encode every word of `wordle`:
# solutions[w, letter, position] is 1 where word w has that letter at that
# position and 0 everywhere else.
solutions = np.zeros((wordle.shape[0], 26, 5))
for word_idx, word in enumerate(wordle.word):
    letter_rows = np.fromiter((alpha_dict[ch] for ch in word), dtype=int)
    solutions[word_idx, letter_rows, np.arange(letter_rows.size)] = 1

## Game Logic

In [6]:
def get_feedback(input_word, solution):
    """Score a five-letter guess against the solution, one symbol per position.

    'G' - the guess letter equals the solution letter at that position
    'Y' - otherwise, the guess letter occurs somewhere in the solution
    'X' - the guess letter does not occur in the solution

    Letter multiplicity is not taken into account: every occurrence of a letter
    that is present in the solution is marked 'G' or 'Y', however many copies
    the solution actually contains.

    Returns the five symbols joined into a single string.
    """
    def mark(pos):
        letter = input_word[pos]
        if letter == solution[pos]:
            return 'G'
        return 'Y' if letter in solution else 'X'

    return ''.join(mark(pos) for pos in range(5))

In [7]:
def filter_wordset(input_word, feedback, wordset):
    """Return the rows of `wordset` that are consistent with a guess and its feedback.

    For each of the five positions:
      * 'G' keeps words with the guessed letter at that position;
      * 'Y' keeps words that contain the guessed letter, but not at that position;
      * any other mark keeps only words that do not contain the guessed letter.

    `wordset` must have a `word` column. The input frame is not modified; the
    returned frame keeps the original index labels of the surviving rows.
    """
    remaining = wordset.copy()
    for pos in range(5):
        mark = feedback[pos]
        letter = input_word[pos]
        if mark == 'G':
            keep = remaining.word.str[pos] == letter
        elif mark == 'Y':
            keep = remaining.word.str.contains(letter) & (remaining.word.str[pos] != letter)
        else:
            keep = ~remaining.word.str.contains(letter)
        remaining = remaining.loc[keep]
    return remaining

## Vector Ops

In [8]:
def init_vec(word):
    """One-hot encode `word` as a (26, 5) matrix.

    Entry [letter, position] is 1 where `word` has that letter (indexed through
    `alpha_dict`) at that position; every other entry is 0.
    """
    letter_rows = np.fromiter((alpha_dict[ch] for ch in word), dtype=int)
    onehot = np.zeros((26, 5))
    onehot[letter_rows, np.arange(letter_rows.size)] = 1
    return onehot

In [9]:
def get_scores(word, mask):
    """Count green / yellow / grey positions of `word` against each masked solution.

    `mask` is a boolean selector over the first axis of the global `solutions`
    tensor. For every selected solution the guess positions are split into:
      * greens  - positions where guess and solution hold the same letter;
      * yellows - remaining positions of guess letters for which the solution
                  contains at least as many copies as the guess does;
      * greys   - all other guess positions.

    Returns an array with one row per selected solution and the columns
    (greens, yellows, greys).
    """
    guess_vec = init_vec(word)
    candidates = solutions[mask]

    greens = candidates * guess_vec

    # Per-letter test, broadcast over the five positions: the letter is used by
    # the guess and the solution holds at least as many copies of it.
    guess_counts = guess_vec.sum(axis=1)
    letter_present = (
        (candidates.sum(axis=2) >= guess_counts) & (guess_counts > 0)
    ).reshape(np.sum(mask), 26, 1)

    yellows = guess_vec * letter_present - greens
    greys = guess_vec - greens - yellows

    per_colour_totals = [
        np.sum(greens, axis=(1, 2)),
        np.sum(yellows, axis=(1, 2)),
        np.sum(greys, axis=(1, 2)),
    ]
    return np.array(per_colour_totals).T

## App

In [10]:
def get_final_scores(word, mask):
    """Average weighted score of `word` over the solutions selected by `mask`.

    Each solution contributes 2 points per green position and 1 point per
    yellow position (grey positions score nothing); the mean over all selected
    solutions is returned.
    """
    tallies = pd.DataFrame(get_scores(word, mask), columns=['g', 'y', 'x'])
    weighted = tallies['g'] * 2 + tallies['y']
    return weighted.mean()

In [11]:
def update_mask(input_word, feedback, wordset, mask):
    """Return a copy of `mask` with words contradicting the feedback switched off.

    `mask` is a boolean array aligned row-for-row with `wordset` (which must
    have a `word` column). For each of the five positions:
      * 'G' clears words whose letter at that position differs from the guess;
      * 'Y' clears words that lack the letter or have it at that position;
      * 'X' clears words that contain the letter.
    Any other mark leaves the mask unchanged for that position. The mask passed
    in is not modified.
    """
    updated = mask.copy()
    words = wordset.word
    for pos in range(5):
        mark = feedback[pos]
        if mark not in ('G', 'Y', 'X'):
            continue
        letter = input_word[pos]
        if mark == 'G':
            reject = ~words.str[pos].eq(letter)
        elif mark == 'Y':
            reject = ~(words.str.contains(letter) & (words.str[pos] != letter))
        else:
            reject = words.str.contains(letter)
        updated[reject] = False

    return updated

In [19]:
def _single_position_probe(ranked):
    """Pick a probe word when the ranked candidates differ in exactly one position.

    `ranked` holds the remaining candidates (column `word`) in preference order.
    If more than one position varies (or none does) the function returns None.
    Otherwise every word of the full `wordle` list is scored by the differing
    letters it contains - letters of higher-ranked candidates weigh more - and
    the best-scoring word is returned, so a single guess can test several of
    the competing letters at once.
    """
    by_position = ranked.copy()
    for i in range(5):
        by_position[f'p{i}'] = by_position.word.str[i]

    # True for the positions at which the candidates do not all agree
    varies = by_position.iloc[:, -5:].nunique() > 1
    if varies.sum() != 1:
        return None

    differing_letters = np.squeeze(by_position[varies.index[varies]].values)
    n_ranked = by_position.shape[0]

    probes = wordle.copy()
    probes['scores'] = 0
    probes['counts'] = 0
    for rank, letter in enumerate(differing_letters):
        has_letter = probes.word.str.contains(letter).astype(int)
        probes['scores'] = probes['scores'] + (n_ranked - rank) * has_letter
        probes['counts'] = probes['counts'] + has_letter

    # Cap on how many of the differing letters a probe word may contain
    eligible = probes.loc[probes.counts.le(n_ranked // 2 * 3)]
    return eligible.sort_values('scores', ascending=False).word.iloc[0]


def run_sim(input_word, solution):
    """Simulate one game that opens with `input_word` against a known `solution`.

    Each round scores the current guess with `get_feedback`, narrows the
    candidate pool (`filter_wordset` / `update_mask`), and chooses the next guess:
      * more than 10 candidates left: the candidate with the highest mean score
        (ties broken by word frequency);
      * 3 to 8 candidates that differ in exactly one position: a probe word
        taken from the whole word list (see `_single_position_probe`);
      * otherwise: the most frequent candidate (ties broken by score).
    The loop stops once at most one ranked candidate remains.

    Returns a tuple `(input_word, solution, step, ncands, tested_words)`:
      * `step` - number of guesses needed; when the pool collapses to one word
        before an all-green guess was played, that final guess is counted here
        but is not listed in `tested_words`;
      * `ncands` - candidate-pool size after each played guess;
      * `tested_words` - the guesses that were actually scored.
    """
    mask = np.array([True] * wordle.shape[0])
    step = 1
    w = wordle.copy()
    tested_words = []
    ncands = []

    while True:
        if step == 1:
            guess = input_word
        tested_words.append(guess)

        fb = get_feedback(guess, solution).upper()

        mask = update_mask(guess, fb, wordle, mask)
        w = filter_wordset(guess, fb, w)
        n_left = w.shape[0]
        ncands.append(n_left)

        # Score every remaining candidate against the remaining solutions
        new_scores = [get_final_scores(word, mask) for word in w.word]

        res = pd.DataFrame({'word': w.word, 'score': new_scores}) \
            .merge(words_all[['word', 'word_freq']], on='word', how='left') \
            .fillna(0)

        if n_left > 10:
            guess = res.sort_values(['score', 'word_freq'], ascending=False).word.iloc[0]
        else:
            by_freq = res.sort_values(['word_freq', 'score'], ascending=False)
            guess = None
            if 3 <= n_left <= 8:
                guess = _single_position_probe(by_freq)
            if guess is None:
                guess = by_freq.word.iloc[0]

        if fb != 'GGGGG':
            step += 1

        if res.shape[0] <= 1:
            break

    return input_word, solution, step, ncands, tested_words

In [None]:
# Simulate every (opening word, answer) pair on 5 worker processes.
# Each entry of `results` is the tuple returned by run_sim.
results = Parallel(n_jobs=5, verbose=1)(
    delayed(run_sim)(input_word, solution)
    for input_word in ['soare', 'roate', 'raise']
    for solution in wordle_answers.word
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    8.3s
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed:   38.2s
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 790 tasks      | elapsed:  2.7min
[Parallel(n_jobs=5)]: Done 1240 tasks      | elapsed:  4.2min
[Parallel(n_jobs=5)]: Done 1790 tasks      | elapsed:  6.1min
[Parallel(n_jobs=5)]: Done 2440 tasks      | elapsed:  8.7min
[Parallel(n_jobs=5)]: Done 3190 tasks      | elapsed: 12.8min
[Parallel(n_jobs=5)]: Done 4040 tasks      | elapsed: 17.1min
[Parallel(n_jobs=5)]: Done 4990 tasks      | elapsed: 21.4min
[Parallel(n_jobs=5)]: Done 6040 tasks      | elapsed: 25.9min


In [None]:
# One row per simulated game; columns follow the order of run_sim's return tuple
df = pd.DataFrame(
    data=results,
    columns=['word', 'solution', 'steps', 'ncands', 'tested_words'],
)