# Full Vectorisation

In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
import time

from joblib import Parallel, delayed

In [2]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Load Data

In [3]:
# Load candidates
with open('data/wordle-candidates.json', 'r') as file:
    wordle_candidates = json.load(file)

# Load answers
with open('data/wordle-answers.json', 'r') as file:
    wordle_answers = json.load(file)

wordle_candidates = pd.DataFrame(wordle_candidates['words'], columns=['word'])
wordle_answers = pd.DataFrame(wordle_answers['words'], columns=['word'])
wordle_candidates['is_answer'] = 0
wordle_answers['is_answer'] = 1
wordle = wordle_candidates.append(wordle_answers).reset_index(drop=True)

In [4]:
# Load all words
words_all = pd.read_table('data/archive/en_words_1_5-5.txt', delimiter=' ', header=None, index_col=None,
                         names=['word_len', 'word_freq', 'n_articles']).reset_index()
words_all = words_all.rename(columns={'index': 'word'})

# Filter by english
alphabet = list('abcdefghijklmnopqrstuvwxyz')
words_all = words_all.loc[words_all.word.apply(lambda x: all([l in alphabet for l in x]))].reset_index(drop=True)

## Prepare Artifacts

In [5]:
alpha_dict = {l: i for i, l in enumerate(list('abcdefghijklmnopqrstuvwxyz'))}

In [6]:
# Initialise solutions vector
solutions = np.zeros((wordle_answers.shape[0], 26, 5), dtype='int8')
for i, word in enumerate(wordle_answers.word):
    for j, l in enumerate(word):
        solutions[i, alpha_dict[l], j] = 1
        
        
# Initialise candidates vector
candidates = np.zeros((wordle.shape[0], 26, 5), dtype='int8')
for i, word in enumerate(wordle.word):
    for j, l in enumerate(word):
        candidates[i, alpha_dict[l], j] = 1

## Game Logic

In [7]:
def get_feedback(input_word, solution):
    output = ''
    for i in range(5):
        if input_word[i] == solution[i]:
            output += 'G'
        elif input_word[i] in solution:
            output += 'Y'
        else:
            output += 'X'
    return output

In [8]:
def filter_wordset(input_word, feedback, wordset):
    newset = wordset.copy()
    for i in range(5):
        if feedback[i] == 'G':
            newset = newset.loc[newset.word.str[i] == input_word[i]]
        elif feedback[i] == 'Y':
            newset = newset.loc[newset.word.str.contains(input_word[i]) & newset.word.apply(lambda x: x[i] != input_word[i])]
        else:
            newset = newset.loc[~newset.word.str.contains(input_word[i])]
    return newset

## Vector Ops

In [9]:
def init_vec(word):
    mat = np.zeros((26, 5))
    for i, l in enumerate(word):
        mat[alpha_dict[l], i] = 1
    return mat

In [10]:
def get_scores(word, mask):
    word_vec = init_vec(word)
    solutions_masked = solutions[mask]
    greens = solutions_masked * word_vec
    yellows = word_vec * (
        (solutions_masked.sum(axis=2) >= word_vec.sum(axis=1)) & 
        (word_vec.sum(axis=1) > 0)) \
        .reshape(np.sum(mask), 26, 1) - greens
    greys = word_vec - greens - yellows
    scores = np.array([np.sum(greens, axis=(1,2)), np.sum(yellows, axis=(1,2)), np.sum(greys, axis=(1,2))]).T
        
    return scores

In [11]:
def get_final_scores(word, mask):
    scores = get_scores(word, mask)
    df_scores = pd.DataFrame(scores, columns=['g', 'y', 'x'])
    df_scores['score'] = df_scores.g * 2 + df_scores.y
    
    return df_scores.score.mean()

In [12]:
def get_n_cands(word, filter_mask, candidate_mask):
    return np.sum((np.sum((filter_mask * solutions[candidate_mask]) == solutions[candidate_mask], axis=(-2,-1)) == 130))

### Filter Mask

In [13]:
def init_candidate_mask():
    return np.array([True] * wordle.shape[0])

In [14]:
def init_filter_mask():
    filter_mask = np.ones((26,5))
    return filter_mask

In [15]:
def update_filter_mask(input_word, feedback, mask):
    wv = init_vec(input_word)
    row_idx = [alpha_dict[l] for l in input_word]
    output = mask.copy()
    for i, (fb, r) in enumerate(zip(feedback, row_idx)):
        # Green
        if fb == 'G':
            output[:, i] = 0
            output[r, i] = 1
        # Yellow
        elif fb == 'Y':
            output[r, i] = 0
        # Grey
        elif fb == 'X':
            output[r, :] = 0
    return output

In [16]:
def update_candidate_mask(input_word, feedback, wordset, mask):
    newmask = mask.copy()
    for i in range(5):
        if feedback[i] == 'G':
            newmask[~wordset.word.str[i].eq(input_word[i])] = False
        elif feedback[i] == 'Y':
            newmask[~(wordset.word.str.contains(input_word[i]) & wordset.word.apply(lambda x: x[i] != input_word[i]))] = False
        elif feedback[i] == 'X':
            newmask[wordset.word.str.contains(input_word[i])] = False
            
    return newmask

## Testing

### Function to Generate G/Y/X Counts
```py
# Compute greens, yellows, and greys
greens = candidates * word_vec
yellows = word_vec * (
    (candidates.sum(axis=2) >= word_vec.sum(axis=1)) & 
    (word_vec.sum(axis=1) > 0)) \
    .reshape(12972, 26, 1) - greens
greys = word_vec - greens - yellows

# Set up GYX tensor
gyx_reshaped = np.stack([greens, yellows, greys], axis=1)
gyx_reshaped = gyx_reshaped.reshape(12972, 390)

# Compute raw candidate data
ncands = np.apply_along_axis(compute_cands, 1, gyx_reshaped)
ncands_max = np.max(ncands)
ncands_mean = np.mean(ncands)
```

In [55]:
input_word = 'saine'
iv = init_vec(input_word)
iv_tensor = tf.constant(iv, dtype='int8')
candidates_tensor = tf.constant(candidates, dtype='int8')
solutions_tensor = tf.constant(solutions, dtype='int8')

In [56]:
# Green locations
greens_tensor = iv_tensor * candidates_tensor

# Yellow locations
yellows_tensor = iv_tensor * (
    tf.reshape(
        tf.math.multiply(
            tf.cast(
                tf.math.greater_equal(tf.reduce_sum(candidates_tensor, axis=-1),tf.reduce_sum(iv_tensor, axis=-1)),
                dtype='int8'
            ),
            tf.cast(
                tf.math.greater(tf.reduce_sum(iv_tensor, axis=-1), 0),
                dtype='int8'
            )
        ), shape=(12972, 26,1)
    ) - \
    greens_tensor
)

# Grey locations
greys_tensor = iv_tensor - greens_tensor - yellows_tensor

# Set up tensor
gyx_reshaped = tf.stack([greens_tensor, yellows_tensor, greys_tensor], axis=1)
gyx_reshaped = tf.reshape(gyx_reshaped, (12972, 390))

In [57]:
# Function to check GYX against all solutions
def compute_cands(gyx_triplet):
    gyx = tf.reshape(gyx_triplet, (3, 26, 5))

    # Green checks
    green_boolean = tf.reduce_sum(
        tf.cast((gyx[0] * solutions_tensor) == gyx[0], dtype='int8'),
        axis=(-2,-1)
    ) == 130

    # Yellow avoid: All yellow locations are zero
    yellow_avoid = tf.reduce_sum(
        tf.cast(gyx[1] * solutions_tensor == 0, dtype='int8'),
        axis=(-2, -1)
    ) == 130

    # Yellow present: 
    # 1. Compute row sums for yellow vector
    # 2. Select rows with at least one yellow in each solution word vector
    # 3. Compute row sums for solution vector to check there are at least one
    # 4. Check that there are two
    yellow_sums = tf.reduce_sum(gyx[1], axis=-1)
    yellow_present = tf.reduce_sum(
        tf.cast(tf.reduce_sum(
            tf.cast(solutions[:, yellow_sums >= 1, :], dtype='int8'),
            axis=-1
        ) >= 1, dtype='int8'),
        axis=-1
    ) == 2

    # Combine yellow checks
    yellow_boolean = tf.math.logical_and(yellow_present, yellow_avoid)

    # Grey checks
    grey_boolean = tf.reduce_sum(
        tf.cast(tf.reduce_sum(gyx[2], axis=-1, keepdims=True) * solutions == 0, dtype='int8'),
        axis=(-2,-1)
    ) == 130

    # Count no. of candidates
    green_boolean = tf.cast(green_boolean, dtype='int8')
    yellow_boolean = tf.cast(yellow_boolean, dtype='int8')
    grey_boolean = tf.cast(grey_boolean, dtype='int8')
    combined_boolean = green_boolean * yellow_boolean * grey_boolean

    return tf.reduce_sum(combined_boolean)

In [None]:
t0 = time.time()
gyx_dataset = tf.data.Dataset.from_tensor_slices(gyx_reshaped)
batched_gyx = gyx_dataset.batch(100)
ncands = []
counter = 0
for batch in batched_gyx:
    if counter % 10 == 0:
        print(f'Running batch {counter} - {time.time()-t0}')
    batch_ncands = [compute_cands(triplet) for triplet in batch]
    ncands += batch_ncands
    counter += 1

Running batch 0 - 0.12418174743652344
Running batch 10 - 20.804954767227173
Running batch 20 - 44.57127046585083


In [54]:
t0 = time.time()
gyx_dataset = tf.data.Dataset.from_tensor_slices(gyx_reshaped)
batched_gyx = gyx_dataset.batch(100)
ncands = []
counter = 0
for batch in batched_gyx:
    if counter % 10 == 0:
        print(f'Running batch {counter} - {time.time()-t0}')
    batch_ncands = [compute_cands(triplet) for triplet in batch]
    ncands += batch_ncands
    counter += 1

Running batch 0 - 0.010282278060913086
Running batch 10 - 12.888877153396606
Running batch 20 - 25.471890687942505
Running batch 30 - 38.16497611999512
Running batch 40 - 50.7547082901001
Running batch 50 - 63.46690559387207
Running batch 60 - 76.18823003768921
Running batch 70 - 88.89395046234131
Running batch 80 - 101.49941563606262
Running batch 90 - 113.98473882675171
Running batch 100 - 126.68144369125366
Running batch 110 - 139.22911143302917
Running batch 120 - 151.6520116329193


In [45]:
np.mean(ncands), np.max(ncands)

(17.039161270428615, 288)

In [35]:
tf.reduce_mean(tf.cast(ncands_tensor, dtype='float32'))

<tf.Tensor: shape=(), dtype=float32, numpy=6.6>

In [35]:
tf.reduce_max(tf.cast(ncands_tensor, dtype='float32'))

<tf.Tensor: shape=(), dtype=float32, numpy=6.6>

In [432]:
t0 = time.time()
scores = []
batched_gyx = gyx_dataset.batch(100)
counter = 0
for batch in batched_gyx:
    if counter % 10 == 0:
        
    scores_batch = [compute_gyx(x, solutions_subset_tensor) for x in batch]
    scores += scores_batch
    counter += 1
    
t1 = time.time()
print(t1-t0)

Running batch 0 - 0.0039823055267333984
4.7484471797943115


In [505]:
tf.reshape(greys_pos_tensor)

<tf.Tensor: shape=(26,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0], dtype=int32)>

<tf.Tensor: shape=(1, 26, 5), dtype=int32, numpy=
array([[[1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 1, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 1, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1],
        [1, 1, 0, 0, 1]]], dtype=int32)>

In [794]:
tf.expand_dims(tf.squeeze(tf.where(greens_pos_tensor==1)), 0)

<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[2, 3]])>

In [815]:
k=3
greens_tensor = gyx_tensor[k][0]
yellows_tensor = gyx_tensor[k][1]
greys_tensor = gyx_tensor[k][2]    

greens_pos_tensor = tf.reduce_sum(greens_tensor, axis=-2)
yellows_pos_tensor = tf.reduce_sum(yellows_tensor, axis=-1)
greys_pos_tensor = tf.reduce_sum(greys_tensor, axis=-1)

global filter_mask_tensor
filter_mask_tensor = tf.Variable(tf.ones((26,5), dtype='int32'))

# Update greens
green_totals = tf.squeeze(tf.where(greens_pos_tensor==1))
for i in green_totals.numpy():
    filter_mask_tensor = tf.cond(
        tf.greater(i, 0),
        lambda: filter_mask_tensor[:, i].assign(greens_tensor[:, i]),
        lambda: filter_mask_tensor[: i]
    )

# Update yellows
filter_mask_tensor = tf.cond(
    tf.greater(tf.reduce_sum(yellows_pos_tensor), 0),
    lambda: tf.Variable(tf.where(
        tf.reshape(yellows_pos_tensor, (26, 1)) == 1, 0 + yellows_tensor, filter_mask_tensor
    )),
    lambda: filter_mask_tensor
)


ok: 2
ok: 3


In [805]:
for i in green_totals.numpy():
    print(i)

2
3


In [812]:
greens_tensor[:, 3]

<tf.Tensor: shape=(26,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0], dtype=int32)>