# Step 2: Filter Again

## Imports

In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from joblib import Parallel, delayed

# Apply seaborn's default plot theme for any figures drawn later.
sns.set()
# Hex colour constants — presumably plot accent colours; they are not
# referenced in the part of the notebook visible here (TODO confirm usage).
P = '#7B73F0'
G = '#27DDCB'

## Load Words

In [2]:
# Read the two word lists; each JSON file stores its words under the 'words' key.
with open('data/wordle-candidates.json', 'r') as file:
    wordle_candidates = json.load(file)

with open('data/wordle-answers.json', 'r') as file:
    wordle_answers = json.load(file)

# One-column frames, flagged so answers stay distinguishable after merging.
wordle_candidates = pd.DataFrame(wordle_candidates['words'], columns=['word'])
wordle_answers = pd.DataFrame(wordle_answers['words'], columns=['word'])
wordle_candidates['is_answer'] = 0
wordle_answers['is_answer'] = 1

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat with ignore_index=True yields the same stacked frame with a
# fresh 0..n-1 index (equivalent to append(...).reset_index(drop=True)).
wordle = pd.concat([wordle_candidates, wordle_answers], ignore_index=True)

In [3]:
# Results saved by the previous step; the lookups below iterate over its
# `word` column to get the first-guess words to pair up.
search = pd.read_csv('results/step1_combined.csv')

## Game Logic

In [4]:
def get_feedback(input_word, solution):
    """Return the 5-character feedback string for a guess.

    Each of the first five positions is marked:
      'G' - the guessed letter equals the solution letter at that position,
      'Y' - the guessed letter occurs somewhere in the solution,
      'X' - the guessed letter does not occur in the solution.

    Note: 'Y' is a plain membership check, so repeated letters in the guess
    are each marked 'Y' regardless of how often the letter occurs in the
    solution.
    """
    def mark(pos):
        guessed = input_word[pos]
        if guessed == solution[pos]:
            return 'G'
        return 'Y' if guessed in solution else 'X'

    return ''.join(mark(pos) for pos in range(5))

In [5]:
def filter_wordset(input_word, feedback, wordset):
    """Keep only the words in `wordset` that are consistent with the feedback.

    Parameters
    ----------
    input_word : str
        The guessed word; its first five characters are used.
    feedback : str
        Five marks, one per position: 'G', 'Y', or anything else (treated
        as 'X', i.e. "letter absent").
    wordset : pandas.DataFrame
        Frame with a `word` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels kept).

    Rules applied per position i:
      'G' - word[i] must equal the guessed letter;
      'Y' - the word must contain the letter, but not at position i;
      else - the word must not contain the letter at all.
    """
    newset = wordset
    for i in range(5):
        letter = input_word[i]
        words = newset['word']
        if feedback[i] == 'G':
            keep = words.str[i] == letter
        elif feedback[i] == 'Y':
            # regex=False: match the letter literally rather than as a pattern.
            keep = words.str.contains(letter, regex=False) & (words.str[i] != letter)
        else:
            keep = ~words.str.contains(letter, regex=False)
        # Boolean .loc selection returns a new frame, so the caller's
        # `wordset` is never mutated and no up-front copy is required.
        newset = newset.loc[keep]
    return newset

## Metrics

In [6]:
def update_space(candidate_word, fb, space):
    """Return a copy of `space` updated with the feedback for one guess.

    `space` is indexed as [letter_row, position], where the letter row comes
    from the module-level `alpha_dict`. For each guessed letter:
      'G' - clear the whole position column, then set this letter's cell to 1;
      'X' - clear the letter's entire row;
      'Y' - clear only this letter's cell at this position.
    Marks are applied in guess order, so later letters may overwrite cells
    set by earlier ones. The input array is left untouched.
    """
    updated = space.copy()
    for pos, letter in enumerate(candidate_word):
        letter_row = alpha_dict[letter]
        mark = fb[pos]
        if mark == 'X':
            updated[letter_row, :] = 0
        elif mark == 'Y':
            updated[letter_row, pos] = 0
        elif mark == 'G':
            updated[:, pos] = 0
            updated[letter_row, pos] = 1
    return updated

In [7]:
def calc_improvement(candidate_word, solution_word):
    """Return how much of a fresh 26x5 space one guess eliminates.

    Starts from an all-ones (26, 5) array (mean 1.0), applies the feedback
    of `candidate_word` against `solution_word` via `update_space`, and
    returns 1.0 minus the mean of the updated array.
    """
    initial_space = np.ones((26, 5))
    feedback = get_feedback(candidate_word, solution_word)
    remaining = update_space(candidate_word, feedback, initial_space).mean()
    return 1.0 - remaining

In [39]:
def compute_n_filtered(first_word, solution, second_word, wordset=wordle):
    """Play two guesses against `solution` and report the surviving word counts.

    Returns a 7-tuple:
        (first_word, second_word, solution,
         words left after the first guess,
         words left after both guesses,
         feedback for the first guess, feedback for the second guess)

    `wordset` defaults to the full `wordle` frame (bound when the function
    is defined).
    """
    first_fb = get_feedback(first_word, solution)
    after_first = filter_wordset(first_word, first_fb, wordset)

    second_fb = get_feedback(second_word, solution)
    after_second = filter_wordset(second_word, second_fb, after_first)

    return (first_word, second_word, solution,
            len(after_first), len(after_second), first_fb, second_fb)

In [9]:
# Maps each lowercase letter to its 0-based alphabet position ('a' -> 0 ... 'z' -> 25);
# update_space uses it to pick the row of the 26x5 space array.
alpha_dict = {chr(ord('a') + idx): idx for idx in range(26)}

## Global Letter Frequency (LF)

In [40]:
def compute_letter_frequencies(wordset):
    """Return per-word 0/1 indicator columns for each letter a-z.

    A column named after each lowercase letter is added (1 if the word
    contains that letter, else 0), then the first column of the frame is
    dropped. Any other pre-existing columns are kept. The input frame is
    not modified.
    """
    indicators = {
        letter: wordset.word.str.contains(letter).astype(int)
        for letter in 'abcdefghijklmnopqrstuvwxyz'
    }
    # iloc[:, 1:] drops whatever the first column is (the `word` column in
    # the frames used in this notebook).
    return wordset.assign(**indicators).iloc[:, 1:]

def compute_score(x, freqs):
    """Sum `freqs[letter]` over the distinct letters of `x`.

    Repeated letters count once. Raises KeyError if a letter of `x` is
    missing from `freqs`.
    """
    return sum(freqs[letter] for letter in set(x))

def global_lf_scorer(wordset):
    """Score every word in `wordset` by the letter frequencies of that same set.

    Returns a DataFrame with columns `word` and `score`, sorted by score in
    descending order.
    """
    # Column sums of the indicator frame: how many words contain each letter.
    letter_counts = compute_letter_frequencies(wordset).sum().to_dict()

    word_scores = wordset.word.apply(compute_score, freqs=letter_counts)
    ranked = pd.DataFrame({'word': wordset.word, 'score': word_scores})
    return ranked.sort_values('score', ascending=False)

def get_next5(input_word, solution, scorer):
    """Return the five best-scoring follow-up words after guessing `input_word`.

    The full `wordle` frame is filtered by the feedback of `input_word`
    against `solution`; `scorer` then ranks the survivors and the first five
    words of its result are taken.

    Returns (input_word, solution, list_of_up_to_five_words).
    """
    fb = get_feedback(input_word, solution)
    remaining = filter_wordset(input_word, fb, wordle)
    ranked = scorer(remaining)
    top_words = ranked.iloc[:5].word.tolist()
    return input_word, solution, top_words

In [41]:
def find_second_word(word, scores, unique_letters=False, two_vowel=False):
    """Pick up to five second-guess words that share no letter with `word`.

    Parameters
    ----------
    word : str
        The first guess.
    scores : pandas.DataFrame
        Frame with `word` and `score` columns.
    unique_letters : bool
        If True, keep only words without repeated letters.
    two_vowel : bool
        If True, keep only words with at least two vowel occurrences
        (a, e, i, o, u).

    Returns
    -------
    tuple
        (word, list of up to five words). Only the first word encountered
        for each distinct score is kept, and words are ordered by score,
        highest first.
    """
    used = list(word)
    shares_no_letter = scores.word.apply(
        lambda cand: not any(ch in used for ch in cand)
    )
    pool = scores.loc[shares_no_letter]

    if unique_letters:
        pool = pool.loc[pool.word.apply(lambda cand: len(set(cand)) == len(cand))]

    if two_vowel:
        pool = pool.loc[
            pool.word.apply(lambda cand: sum(cand.count(v) for v in 'aeiou') >= 2)
        ]

    # One representative per score value, best score first.
    ranked = pool.groupby('score').first().sort_index(ascending=False)
    return word, ranked.head(5).word.tolist()

In [42]:
# Letter frequencies over the full word list, then one score per word,
# ranked from highest to lowest.
global_freqs = compute_letter_frequencies(wordle).sum().to_dict()
global_scores = (
    pd.DataFrame({
        'word': wordle.word,
        'score': wordle.word.apply(compute_score, freqs=global_freqs),
    })
    .sort_values('score', ascending=False)
)

In [43]:
# For every first-guess word, look up its best global-LF second guesses in parallel.
lookups_glf = Parallel(n_jobs=5, verbose=1)(
    delayed(find_second_word)(input_word, global_scores)
    for input_word in search.word
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  65 tasks      | elapsed:    0.5s
[Parallel(n_jobs=5)]: Done 100 out of 100 | elapsed:    0.6s finished


In [44]:
# One row per (first_word, second_word) pair: explode the list of second words.
df_lookups_glf = (
    pd.DataFrame(lookups_glf, columns=['first_word', 'second_word'])
    .explode('second_word')
)
df_lookups_glf.head()

Unnamed: 0,first_word,second_word
0,soare,unlit
0,soare,linty
0,soare,clint
0,soare,unlid
0,soare,culti


In [45]:
# Evaluate every (first_word, second_word) pair against every answer word.
raw_glf = Parallel(n_jobs=5, verbose=1)(
    delayed(compute_n_filtered)(pair.first_word, solution, pair.second_word)
    for pair in df_lookups_glf.itertuples(index=False)
    for solution in wordle_answers.word
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done 150 tasks      | elapsed:    0.6s
[Parallel(n_jobs=5)]: Done 1350 tasks      | elapsed:    4.6s
[Parallel(n_jobs=5)]: Done 3350 tasks      | elapsed:   10.8s
[Parallel(n_jobs=5)]: Done 6150 tasks      | elapsed:   20.1s
[Parallel(n_jobs=5)]: Done 9750 tasks      | elapsed:   32.2s
[Parallel(n_jobs=5)]: Done 14150 tasks      | elapsed:   47.7s
[Parallel(n_jobs=5)]: Done 19350 tasks      | elapsed:  1.1min
[Parallel(n_jobs=5)]: Done 25350 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 32150 tasks      | elapsed:  1.9min
[Parallel(n_jobs=5)]: Done 39750 tasks      | elapsed:  2.3min
[Parallel(n_jobs=5)]: Done 48150 tasks      | elapsed:  2.8min
[Parallel(n_jobs=5)]: Done 57350 tasks      | elapsed:  3.4min
[Parallel(n_jobs=5)]: Done 67350 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done 78150 tasks      | elapsed:  4.7min
[Parallel(n_jobs=5)]: Done 89750 tasks      | ela

In [46]:
# One row per evaluated (pair, solution) task; the column names follow the
# order of the tuple returned by compute_n_filtered.
df_glf = pd.DataFrame(raw_glf, columns=['first_word', 'second_word', 'solution', 'size1', 'size2', 'fb1', 'fb2'])

In [47]:
# Relative reductions of the candidate set:
#   dec1  - after the first guess, relative to the full word list
#   dec12 - from the first guess to the second guess
#   dec02 - after both guesses, relative to the full word list
# The starting size is taken from `wordle` (the default wordset that
# compute_n_filtered filtered) instead of a hard-coded 12972, so the metrics
# stay correct if the word lists change.
df_glf['dec1'] = 1 - df_glf['size1'] / len(wordle)
df_glf['dec12'] = 1 - df_glf['size2'] / df_glf['size1']
df_glf['dec02'] = 1 - df_glf['size2'] / len(wordle)
df_glf.to_csv('results/step2_pre_glf.csv', index=False)

In [48]:
# Average the metrics over all solutions for each word pair, show how the
# reduction metrics correlate, then list the ten best pairs by overall reduction.
results_glf = (
    df_glf
    .groupby(['first_word', 'second_word'])[['dec1', 'dec12', 'dec02', 'size2']]
    .mean()
    .reset_index()
)
display(results_glf[['dec1', 'dec12', 'dec02']].corr())
results_glf.sort_values('dec02', ascending=False).head(10)

Unnamed: 0,dec1,dec12,dec02
dec1,1.0,-0.30311,0.276541
dec12,-0.30311,1.0,0.63856
dec02,0.276541,0.63856,1.0


Unnamed: 0,first_word,second_word,dec1,dec12,dec02,size2
465,toles,cairn,0.972145,0.908719,0.998767,15.989201
220,nares,doilt,0.977323,0.902825,0.998753,16.17797
95,cares,doilt,0.975003,0.901339,0.998729,16.486825
94,canes,triol,0.970929,0.923215,0.998729,16.493737
471,tones,drail,0.969614,0.915592,0.998725,16.533045
176,lanes,droit,0.974961,0.903988,0.998722,16.58013
425,tales,corni,0.976049,0.887427,0.998715,16.67041
269,rales,tonic,0.978258,0.887163,0.998702,16.842333
290,reans,doilt,0.975219,0.898053,0.99869,16.991361
184,lares,tonic,0.978574,0.886516,0.998688,17.012959


## Positional Letter Frequency (LF)

In [180]:
def compute_pos_letter_freq(wordset):
    """Count how often each letter appears at each of the five positions.

    Returns a dict keyed by position 0-4; each value maps a letter to the
    number of words in `wordset` that have it at that position.
    """
    return {
        position: wordset.word.str[position].value_counts().to_dict()
        for position in range(5)
    }

In [181]:
def compute_pos_score(letters, pos_scores):
    """Sum the positional counts of each letter in `letters`.

    `pos_scores[i]` maps a letter to its count at position i; letters with
    no entry at their position contribute 0.
    """
    return sum(
        pos_scores[position].get(letter, 0)
        for position, letter in enumerate(letters)
    )

In [182]:
# Positional letter counts over the full word list, then one score per
# word, ranked from highest to lowest.
pos_scores = compute_pos_letter_freq(wordle)
pos_lf_scores = (
    pd.DataFrame({
        'word': wordle.word,
        'score': wordle.word.apply(compute_pos_score, pos_scores=pos_scores),
    })
    .sort_values('score', ascending=False)
)

In [187]:
# Positional-LF second-guess lookup, restricted to the first 20 words of `search`.
lookups_plf = Parallel(n_jobs=5, verbose=1)(
    delayed(find_second_word)(input_word, pos_lf_scores)
    for input_word in search.word[:20]
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  20 out of  20 | elapsed:    0.1s finished


In [188]:
# One row per (first_word, second_word) pair: explode the list of second words.
df_lookups_plf = (
    pd.DataFrame(lookups_plf, columns=['first_word', 'second_word'])
    .explode('second_word')
)
df_lookups_plf.head()

Unnamed: 0,first_word,second_word
0,soare,tinty
0,soare,pinny
0,soare,bunty
0,soare,tinny
0,soare,minty


In [192]:
# Evaluate every positional-LF (first_word, second_word) pair against every answer word.
raw_plf = Parallel(n_jobs=5, verbose=1)(
    delayed(compute_n_filtered)(pair.first_word, solution, pair.second_word)
    for pair in df_lookups_plf.itertuples(index=False)
    for solution in wordle_answers.word
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done 150 tasks      | elapsed:    0.5s
[Parallel(n_jobs=5)]: Done 2550 tasks      | elapsed:    8.0s
[Parallel(n_jobs=5)]: Done 6550 tasks      | elapsed:   21.3s
[Parallel(n_jobs=5)]: Done 12150 tasks      | elapsed:   40.6s
[Parallel(n_jobs=5)]: Done 19350 tasks      | elapsed:  1.1min
[Parallel(n_jobs=5)]: Done 28150 tasks      | elapsed:  1.7min
[Parallel(n_jobs=5)]: Done 38550 tasks      | elapsed:  2.3min
[Parallel(n_jobs=5)]: Done 50550 tasks      | elapsed:  3.1min
[Parallel(n_jobs=5)]: Done 64150 tasks      | elapsed:  3.9min
[Parallel(n_jobs=5)]: Done 79350 tasks      | elapsed:  4.9min
[Parallel(n_jobs=5)]: Done 96150 tasks      | elapsed:  5.9min
[Parallel(n_jobs=5)]: Done 114550 tasks      | elapsed:  7.1min
[Parallel(n_jobs=5)]: Done 134550 tasks      | elapsed:  8.3min
[Parallel(n_jobs=5)]: Done 156150 tasks      | elapsed:  9.6min
[Parallel(n_jobs=5)]: Done 179350 tasks     

In [193]:
# One row per evaluated (pair, solution) task; the column names follow the
# order of the tuple returned by compute_n_filtered.
df_plf = pd.DataFrame(raw_plf, columns=['first_word', 'second_word', 'solution', 'size1', 'size2', 'fb1', 'fb2'])

In [194]:
# Relative reductions of the candidate set:
#   dec1  - after the first guess, relative to the full word list
#   dec12 - from the first guess to the second guess
#   dec02 - after both guesses, relative to the full word list
# The starting size is taken from `wordle` (the default wordset that
# compute_n_filtered filtered) instead of a hard-coded 12972, so the metrics
# stay correct if the word lists change.
df_plf['dec1'] = 1 - df_plf['size1'] / len(wordle)
df_plf['dec12'] = 1 - df_plf['size2'] / df_plf['size1']
df_plf['dec02'] = 1 - df_plf['size2'] / len(wordle)

In [195]:
# Average the metrics over all solutions for each word pair, show how the
# reduction metrics correlate, then list the ten best pairs by overall reduction.
results_plf = (
    df_plf
    .groupby(['first_word', 'second_word'])[['dec1', 'dec12', 'dec02', 'size2']]
    .mean()
    .reset_index()
)
display(results_plf[['dec1', 'dec12', 'dec02']].corr())
results_plf.sort_values('dec02', ascending=False).head(10)

Unnamed: 0,dec1,dec12,dec02
dec1,1.0,-0.07677,0.065268
dec12,-0.07677,1.0,0.953956
dec02,0.065268,0.953956,1.0


Unnamed: 0,first_word,second_word,dec1,dec12,dec02,size2
88,tares,doily,0.978098,0.865692,0.99857,18.552484
82,tales,corny,0.976049,0.878695,0.99854,18.938661
48,rates,doily,0.977445,0.868179,0.998535,19.009503
62,salet,corny,0.975465,0.872161,0.998525,19.136933
93,tears,doily,0.975517,0.863991,0.998495,19.521814
7,aloes,dirty,0.975959,0.885618,0.998456,20.02635
98,teras,doily,0.975572,0.869453,0.998449,20.123542
84,tales,porny,0.976049,0.877232,0.998397,20.793521
13,arles,monty,0.978113,0.862359,0.998388,20.907991
14,arles,ponty,0.978113,0.870153,0.998379,21.027214


In [197]:
# Display the loaded step-1 table for reference.
search

Unnamed: 0,word,decrease,space_improvement,glf,plf
0,soare,0.978831,0.245669,1.0,0.0
1,lares,0.978574,0.202186,1.0,1.0
2,rales,0.978258,0.199811,1.0,1.0
3,arles,0.978113,0.199595,1.0,0.0
4,tares,0.978098,0.206343,1.0,1.0
...,...,...,...,...,...
95,tames,0.968658,0.205373,0.0,1.0
96,pales,0.968576,0.208267,0.0,1.0
97,elans,0.968556,0.198146,1.0,0.0
98,spaer,0.968528,0.231570,1.0,0.0
