# Brute Force with Numpy

In [1]:
import json
import numpy as np
import pandas as pd

from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Load Data

In [2]:
# Load the two word lists; each JSON file stores its words under the 'words' key.
with open('data/wordle-candidates.json', 'r') as file:
    wordle_candidates = json.load(file)

with open('data/wordle-answers.json', 'r') as file:
    wordle_answers = json.load(file)

wordle_candidates = pd.DataFrame(wordle_candidates['words'], columns=['word'])
wordle_answers = pd.DataFrame(wordle_answers['words'], columns=['word'])

# Tag each row with the list it came from before stacking them.
wordle_candidates['is_answer'] = 0
wordle_answers['is_answer'] = 1

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat produces the same stacked frame (candidates first, answers after).
wordle = pd.concat([wordle_candidates, wordle_answers]).reset_index(drop=True)

In [3]:
# Read the space-delimited frequency table. The three supplied names label the
# data columns; the leading word column comes back as the index, so it is
# reset into a regular column and renamed to 'word'.
words_all = (
    pd.read_table(
        'data/archive/en_words_1_5-5.txt',
        delimiter=' ',
        header=None,
        index_col=None,
        names=['word_len', 'word_freq', 'n_articles'],
    )
    .reset_index()
    .rename(columns={'index': 'word'})
)

# Filter by english: keep only words made up entirely of the letters a-z.
alphabet = list('abcdefghijklmnopqrstuvwxyz')
words_all = words_all.loc[
    words_all.word.apply(lambda candidate: set(candidate).issubset(alphabet))
].reset_index(drop=True)

## Prepare Artifacts

In [4]:
# Map each lowercase letter to its 0-based position in the alphabet ('a' -> 0 ... 'z' -> 25).
alpha_dict = dict(zip('abcdefghijklmnopqrstuvwxyz', range(26)))

In [5]:
# Initialise solutions vector: one (26 letters x 5 positions) one-hot grid per word,
# stacked along the first axis in the same order as `wordle`.
solutions = np.zeros((len(wordle), 26, 5))
for row, entry in enumerate(wordle.word):
    letter_rows = np.array([alpha_dict[ch] for ch in entry], dtype=np.intp)
    # Set cell (letter, position) to 1 for every character of the word.
    solutions[row, letter_rows, np.arange(letter_rows.size)] = 1

## Game Logic

In [6]:
def get_feedback(input_word, solution):
    """Return a 5-character feedback string for a guess against a solution.

    Each position is marked 'G' when the guess letter equals the solution
    letter at that position, 'Y' when the letter occurs anywhere in the
    solution, and 'X' otherwise.

    Note: repeated letters are not count-limited here -- every non-matching
    occurrence of a letter that appears in the solution is marked 'Y'.
    """
    def mark(pos):
        letter = input_word[pos]
        if letter == solution[pos]:
            return 'G'
        return 'Y' if letter in solution else 'X'

    return ''.join(mark(pos) for pos in range(5))

In [7]:
def filter_wordset(input_word, feedback, wordset):
    """Return the rows of `wordset` that are consistent with a guess's feedback.

    Parameters
    ----------
    input_word : str
        The guessed word; its first five characters are used.
    feedback : str
        Five feedback characters: 'G' (right letter, right position),
        'Y' (letter present, wrong position); any other character is
        treated as grey.
    wordset : pandas.DataFrame
        Frame with a string column named ``word``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, in their original order.

    Notes
    -----
    A grey tile on a letter that is also green/yellow elsewhere in the same
    guess (the feedback the real game gives for repeated letters) does NOT
    mean the letter is absent. It means the letter is not at this position and
    the word holds exactly as many copies as were marked green/yellow. The
    previous implementation dropped every word containing the letter in that
    situation, which could eliminate the true answer.
    """
    # Number of green/yellow tiles per letter in this guess.
    confirmed = {}
    for i in range(5):
        if feedback[i] in ('G', 'Y'):
            confirmed[input_word[i]] = confirmed.get(input_word[i], 0) + 1

    newset = wordset.copy()
    for i in range(5):
        letter = input_word[i]
        words = newset.word
        if feedback[i] == 'G':
            keep = words.str[i] == letter
        elif feedback[i] == 'Y':
            # Letter must occur somewhere, but not at this position.
            keep = words.str.contains(letter, regex=False) & (words.str[i] != letter)
        elif letter in confirmed:
            # Grey on a repeated letter: the confirmed tiles give the exact count.
            keep = (words.str[i] != letter) & (words.str.count(letter) == confirmed[letter])
        else:
            keep = ~words.str.contains(letter, regex=False)
        newset = newset.loc[keep]
    return newset

In [294]:
# ncands = []
# mod_greens = np.where((greens.sum(axis=-2) == 0).reshape((np.sum(mask), 1, 5)), np.ones((np.sum(mask),26,1)), greens)
# mod_yellows = yellows[i].sum(axis=-1)>= solutions[mask]
# mod_greys = 1 - np.where(greys.sum(axis=-1).reshape((np.sum(mask),26, 1)) >= 1, np.ones(greys.shape), greys)
# for i in tqdm(range(np.sum(mask))):
    
#     check_greens = np.all(mod_greens * solutions[mask] == solutions[mask], axis=(1,2))
#     check_yellows = np.all(mod_yellows * solutions[mask] == solutions[mask], axis=(1,2))
#     check_greys = np.all(mod_greys * solutions[mask] == solutions[mask], axis=(1,2))
#     filtered = np.all(np.column_stack((check_greens, check_yellows, check_greys)), axis=1).sum()
    
#     ncands.append(filtered)

# t1 = pd.DataFrame({'word': wordle.word, 'ncands': ncands})
# print(t1.ncands.describe())

# t2 = get_scores('pares', mask)
# df2 = pd.DataFrame(t2, columns=['g', 'y', 'x'])
# df2['score'] = df2.g * 2 + df2.y
# df2['ncands'] = t1.ncands
# t1.loc[t1.ncands.eq(t1.ncands.max())]

## Vector Ops

In [8]:
def init_vec(word):
    """Encode `word` as a 26 x 5 one-hot grid (rows: letters, columns: positions).

    Cell (alpha_dict[letter], position) is 1.0 for each character of the
    word; every other cell is 0.0.
    """
    letter_rows = np.fromiter((alpha_dict[ch] for ch in word), dtype=np.intp)
    grid = np.zeros((26, 5))
    grid[letter_rows, np.arange(letter_rows.size)] = 1
    return grid

In [34]:
def get_scores(word, mask):
    """Count green / yellow / grey tiles for `word` against every masked solution.

    Parameters
    ----------
    word : str
        The guess to evaluate (encoded via ``init_vec``).
    mask : boolean sequence
        Selects which rows of the module-level ``solutions`` array to score
        against.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_selected, 3) whose columns are the number of green,
        yellow and grey tiles the guess would receive for each selected
        solution.

    Notes
    -----
    Counts are derived per letter: the tiles that are green-or-yellow for a
    letter number ``min(count in solution, count in guess)``. The previous
    formulation only credited a letter when the solution held at least as many
    copies as the guess; otherwise it evaluated yellows as ``-greens``, giving
    negative yellow counts and too many greys for guesses with repeated letters.
    """
    word_vec = init_vec(word)                      # (26, 5)
    solutions_masked = solutions[mask]             # (n, 26, 5)

    # Per-letter tallies.
    guess_counts = word_vec.sum(axis=1)                             # (26,)
    solution_counts = solutions_masked.sum(axis=2)                  # (n, 26)
    green_counts = (solutions_masked * word_vec).sum(axis=2)        # (n, 26)

    # Tiles that are coloured (green or yellow) for each letter.
    matched = np.minimum(solution_counts, guess_counts)             # (n, 26)
    yellow_counts = matched - green_counts
    grey_counts = guess_counts - matched

    scores = np.stack(
        [green_counts.sum(axis=1), yellow_counts.sum(axis=1), grey_counts.sum(axis=1)],
        axis=1,
    )
    return scores

## Global Scores

In [10]:
def compute_letter_frequencies(wordset):
    """Add one 0/1 indicator column per letter and drop the first column.

    For each letter a-z the new column is 1 where the row's ``word`` contains
    that letter and 0 otherwise. The input frame is not modified; the result
    holds every column except the first one.
    """
    indicators = {
        letter: wordset.word.str.contains(letter).astype(int)
        for letter in 'abcdefghijklmnopqrstuvwxyz'
    }
    return wordset.assign(**indicators).iloc[:, 1:]

def compute_score(x, freqs):
    """Sum `freqs[letter]` over the distinct letters of `x`.

    Repeated letters count once. Returns 0 for an empty `x`.
    """
    return sum(freqs[letter] for letter in set(x))

In [11]:
# Column totals of the per-letter indicator frame, as a plain dict.
global_freqs = compute_letter_frequencies(wordle).sum().to_dict()

# Score every word by its distinct letters, attach the corpus frequency
# columns, and rank by score with word frequency as the tie-breaker.
global_scores = (
    pd.DataFrame({
        'word': wordle.word,
        'score': wordle.word.apply(compute_score, freqs=global_freqs),
    })
    .merge(
        words_all[['word', 'word_freq', 'n_articles']],
        how='left',
        left_on='word',
        right_on='word',
    )
    .fillna(0)
    .sort_values(['score', 'word_freq'], ascending=False)
)

## App

In [12]:
def get_final_scores(word, mask):
    """Return the mean weighted tile score of `word` over the masked solutions.

    Each solution contributes ``2 * greens + yellows`` (counts from
    ``get_scores``); the result is the average of that value.
    """
    tile_counts = pd.DataFrame(get_scores(word, mask), columns=['g', 'y', 'x'])
    weighted = tile_counts['g'] * 2 + tile_counts['y']
    return weighted.mean()

In [13]:
def update_mask(input_word, feedback, wordset, mask):
    """Return a copy of `mask` with words inconsistent with the feedback cleared.

    Parameters
    ----------
    input_word : str
        The guessed word; its first five characters are used.
    feedback : str
        Five feedback characters: 'G', 'Y' or 'X'. Any other character leaves
        the mask unchanged for that position.
    wordset : pandas.DataFrame
        Frame with a string column named ``word``, aligned row-for-row with
        `mask`.
    mask : numpy.ndarray
        Boolean array; it is copied, not modified.

    Returns
    -------
    numpy.ndarray
        New boolean mask with rejected rows set to False.

    Notes
    -----
    An 'X' on a letter that is also 'G'/'Y' elsewhere in the same guess (the
    feedback the real game gives for repeated letters) rejects only words that
    have the letter at this position or hold a different number of copies
    than were marked green/yellow. The previous implementation rejected every
    word containing the letter, which could clear the true answer.
    """
    words = wordset.word

    # Number of green/yellow tiles per letter in this guess.
    confirmed = {}
    for i in range(5):
        if feedback[i] in ('G', 'Y'):
            confirmed[input_word[i]] = confirmed.get(input_word[i], 0) + 1

    newmask = mask.copy()
    for i in range(5):
        letter = input_word[i]
        has_letter = words.str.contains(letter, regex=False)
        at_position = words.str[i].eq(letter)

        if feedback[i] == 'G':
            reject = ~at_position
        elif feedback[i] == 'Y':
            # Must contain the letter, just not here.
            reject = ~(has_letter & ~at_position)
        elif feedback[i] == 'X':
            if letter in confirmed:
                # Grey on a repeated letter: the confirmed tiles give the exact count.
                reject = at_position | words.str.count(letter).ne(confirmed[letter])
            else:
                reject = has_letter
        else:
            continue

        newmask[reject.to_numpy()] = False

    return newmask

In [14]:
def compute_n_filtered(input_word, solution):
    """Return how many `wordle` rows survive filtering on this guess/solution pair.

    The feedback is generated with ``get_feedback`` and applied to the full
    ``wordle`` frame with ``filter_wordset``.
    """
    pattern = get_feedback(input_word, solution)
    return filter_wordset(input_word, pattern, wordle).shape[0]

In [15]:
def run_app(pre_load=None):
    """Run the interactive solver loop, prompting for guesses and feedback.

    Each step reads a guess and its feedback string from ``input()``, narrows
    both the candidate frame and the boolean mask over ``wordle``, scores every
    remaining candidate with ``get_final_scores`` and displays the top
    suggestions. The loop ends when the suggestion table has at most one row,
    or when 'quit' / 'q' is entered at either prompt.

    Parameters
    ----------
    pre_load : str, optional
        If truthy, used as the guess for step 1 instead of prompting.

    Returns
    -------
    tuple
        ``(w, mask, res)``: the remaining candidate frame, the current boolean
        mask, and the most recent suggestion table.

    Notes
    -----
    ``display`` is not imported in this notebook; presumably the IPython
    builtin is relied on -- confirm when running outside a notebook kernel.
    """
    mask = np.array([True] * wordle.shape[0])
    step = 1
    w = wordle.copy()
    # Two-row placeholder so the `while` condition is true on the first pass.
    res = pd.DataFrame([{'a': 1}, {'a': 1}])
    # Guess history and letters seen so far; within this function they are only
    # referenced by the commented-out "Filters" block below.
    tested_words = []
    all_chars = []
    
    while res.shape[0] > 1:
        print(f'[ ---- STEP {step} ----]')
        # Prompt for a guess unless this is step 1 and a pre-loaded guess was given.
        if not (step == 1 and pre_load):
            guess = input('Input a guess:')
        else:
            guess = pre_load
        if guess.lower() in ['quit', 'q']:
            return w, mask, res
        tested_words.append(guess)
        all_chars = all_chars + list(set(guess))
        all_chars = list(set(all_chars))
        
        fb = input('Input feedback:')
        if fb.lower() in ['quit', 'q']:
            return w, mask, res
        
        # Feedback is upper-cased before use. The mask is always computed against
        # the full `wordle` frame, while `w` is narrowed incrementally.
        mask = update_mask(guess, fb.upper(), wordle, mask)
        w = filter_wordset(guess, fb.upper(), w)
        
        # Candidates
        print(f'Found {w.shape[0]} candidates. Running analysis...')
        new_scores = []
        # new_ncands = []
        for word in tqdm(w.word):
            new_scores.append(get_final_scores(word, mask))
            # temp_word = []
            # for s in wordle_answers.word:
            #     temp_word.append(compute_n_filtered(word, s))
        
        print(f'Suggestions for step {step + 1}:')
        # Attach corpus frequency; words missing from `words_all` get 0.
        res = pd.DataFrame({'word': w.word, 'score': new_scores}) \
            .merge(words_all[['word', 'word_freq']], on='word', how='left') \
            .fillna(0)
        display(res.sort_values(['score', 'word_freq'], ascending=False).head(10))
        
        # Filters
        # if w.shape[0] > 10:
        #     print(f'\nLarge number of candidates found ({w.shape[0]}). We recommend filtering the candidates more:')
        #     display(
        #         global_scores.loc[global_scores.word.apply(lambda x: len(set(all_chars).intersection(set(list(x)))) < 1)] \
        #             .head(5)
        #     )
        if w.shape[0] <= 10:
            print(f'Small number of candidates remaining ({w.shape[0]}). We recommend choosing the most popular option:')
            display(res.sort_values(['word_freq', 'score'], ascending=False).head(5))
        
        # Check for repeats
        if w.shape[0] <= 8 and w.shape[0] >= 3:
            w_copy = res.sort_values(['word_freq', 'score'], ascending=False).copy()
            # Extract letters
            for i in range(5):
                w_copy[f'p{i}'] = w_copy.word.str[i]
            
            # Count the number of unique columns
            # (True for each of the five position columns holding more than one distinct letter)
            unique_mask = w_copy.iloc[:, -5:].nunique() > 1
            
            # If only 1, then recommend another word
            if unique_mask.sum() == 1:
                wc = wordle.copy()
                total_letters = w_copy.shape[0]
                wc['scores'] = 0
                wc['counts'] = 0
                # Walk the letters found at the single varying position, in
                # w_copy's order (word_freq, then score, descending); earlier
                # letters get a larger weight.
                for i, letter in enumerate(np.squeeze(w_copy[unique_mask.index[unique_mask]].values)):
                    wc['scores'] = wc['scores'] + (total_letters - i) * wc.word.str.contains(letter).astype(int)
                    wc['counts'] = wc['counts'] + wc.word.str.contains(letter).astype(int)
                    
                print(f'\nWords with only one letter differential detected. Consider filtering:')
                # Keep words whose number of matched letters is at most
                # total_letters // 2 * 3, then show the five highest-scoring.
                display(wc.loc[wc.counts.le(total_letters // 2 * 3)].sort_values('scores', ascending=False).head(5))

                
        step += 1
        print()
        
    return w, mask, res

## Run

In [35]:
# Score every word as an opening guess against the full (unfiltered) solution set.
mask = [True] * wordle.shape[0]
global_list = [get_final_scores(candidate, mask) for candidate in tqdm(wordle.word)]

  0%|          | 0/12972 [00:00<?, ?it/s]

NameError: name 'step' is not defined

In [40]:
# Pair each word with its opening-guess score, attach corpus frequency
# (0 where the word is missing from `words_all`), and rank best-first.
dfg = (
    pd.DataFrame({'word': wordle.word, 'score': global_list})
    .merge(words_all[['word', 'word_freq']], how='left', on='word')
    .fillna(0)
    .sort_values(by=['score', 'word_freq'], ascending=False)
)
# Persist the ranking without the index column.
dfg.to_csv('results/all_scores.csv', index=False)

In [41]:
# Show the first ten rows of the ranking (dfg is sorted by score, then word_freq, descending).
dfg.head(10)

Unnamed: 0,word,score,word_freq
9100,tares,2.65757,12.0
4998,lares,2.645467,296.0
7336,rales,2.622418,28.0
6063,nares,2.600833,253.0
7381,rates,2.598289,9301.0
2097,dares,2.590888,226.0
1423,cares,2.580019,708.0
6602,pares,2.572464,175.0
9061,tales,2.569303,11745.0
5513,mares,2.558356,1000.0


In [None]:
# Start an interactive session with 'soare' supplied as the first guess;
# the returned candidate frame, mask and suggestion table are kept for inspection.
curr_wordset, curr_mask, curr_res = run_app('soare')

[ ---- STEP 1 ----]


Input feedback: xxxyy


Found 401 candidates. Running analysis...


  0%|          | 0/401 [00:00<?, ?it/s]

Suggestions for step 2:


Unnamed: 0,word,score,word_freq
353,cider,4.309227,1093.0
314,diner,4.301746,1125.0
33,dicer,4.279302,62.0
237,tiler,4.274314,37.0
35,dimer,4.264339,390.0
118,liter,4.259352,431.0
17,citer,4.254364,10.0
2,bider,4.254364,9.0
328,diver,4.244389,3058.0
145,niter,4.229426,49.0



[ ---- STEP 2 ----]


Input a guess: cider
Input feedback: xxxyy


Found 32 candidates. Running analysis...


  0%|          | 0/32 [00:00<?, ?it/s]

Suggestions for step 3:


Unnamed: 0,word,score,word_freq
3,ferny,4.9375,95.0
26,perky,4.90625,95.0
11,pervy,4.78125,15.0
10,nervy,4.75,23.0
5,herby,4.6875,90.0
2,ferly,4.6875,0.0
15,rebuy,4.65625,12.0
30,jerky,4.59375,236.0
29,reply,4.53125,1250.0
4,germy,4.5,0.0



[ ---- STEP 3 ----]


Input a guess: ferny
Input feedback: xgyxx


Found 2 candidates. Running analysis...


  0%|          | 0/2 [00:00<?, ?it/s]

Suggestions for step 4:


Unnamed: 0,word,score,word_freq
1,rebut,7.5,53.0
0,rewth,7.5,0.0


Small number of candidates remaining (2). We recommend choosing the most popular option:


Unnamed: 0,word,score,word_freq
1,rebut,7.5,53.0
0,rewth,7.5,0.0



[ ---- STEP 4 ----]
