In [62]:
from FlagEmbedding import FlagModel
import pickle
from scipy.spatial import KDTree
import random
import numpy as np
import itertools

In [2]:
def generate_n_cont_set(word, n):
    """Collect every run of ``n`` consecutive characters that occurs in ``word``.

    Returns a set of substrings; the set is empty when ``word`` is shorter
    than ``n``.
    """
    last_start = len(word) - n
    return {word[start:start + n] for start in range(last_start + 1)}

In [3]:
def shares_n_cont_letters(a: str, b: str, n: int):
    """Tell whether ``a`` and ``b`` have a run of ``n`` consecutive characters in common.

    Returns True as soon as one length-``n`` substring of ``b`` is also a
    substring of ``a``; False otherwise (including when either string is
    shorter than ``n``).
    """
    chunks_of_a = {a[start:start + n] for start in range(len(a) - n + 1)}
    return any(
        b[start:start + n] in chunks_of_a
        for start in range(len(b) - n + 1)
    )

In [4]:
def rotate_grid(grid):
    """Rotate a grid of coordinate tuples by 90 degrees using ``np.rot90``.

    Args:
        grid: list of rows, each row a list of equally sized integer tuples
            (e.g. ``(row, col)`` pairs).

    Returns:
        A new list of rows holding the rotated cells as tuples of plain
        Python ints.
    """
    # .tolist() converts NumPy scalars to native ints, so the tuples print as
    # (0, 1) rather than (np.int64(0), np.int64(1)) and stay JSON-serialisable.
    rotated = np.rot90(np.array(grid)).tolist()
    return [[tuple(cell) for cell in row] for row in rotated]

In [5]:
# Embedding model used for every word/query encoding in this notebook.
# NOTE(review): use_fp16=True presumably trades a little precision for speed — confirm
# against the FlagEmbedding documentation.
model = FlagModel('BAAI/bge-small-en-v1.5',
                  query_instruction_for_retrieval="Generate a representation for this word for retrieving related words:",
                  use_fp16=True)

In [6]:
# Read the raw word list, one entry per line (trailing newlines are kept here).
# The encoding is spelled out so the result does not depend on the platform locale.
with open("google-10000-english-no-swears.txt", encoding="utf-8") as f:
    all_words = f.readlines()

In [7]:
len(all_words)

9894

In [8]:
# Keep only the entries that are longer than three characters once stripped
words = [entry.strip() for entry in all_words if len(entry.strip()) > 3]

In [20]:
embeddings = model.encode(words)

Inference Embeddings: 100%|██████████| 35/35 [00:10<00:00,  3.30it/s]


In [9]:
# Histogram: word length -> how many words have that length
lens = {}
for word in words:
    word_len = len(word)
    lens[word_len] = lens.get(word_len, 0) + 1

In [10]:
lens

{4: 1100,
 5: 1367,
 6: 1491,
 11: 374,
 7: 1449,
 8: 1157,
 9: 904,
 10: 608,
 13: 101,
 12: 207,
 14: 39,
 15: 10,
 18: 1,
 16: 2}

In [11]:
# Reuse the cached embeddings when the file exists; otherwise compute and cache them,
# so the cell also works on a fresh checkout.
# NOTE: pickle.load can execute arbitrary code — only load a cache file you created yourself.
try:
    with open("embeddings.pkl", "rb") as f:
        embeddings = pickle.load(f)
except FileNotFoundError:
    embeddings = model.encode(words)
    with open("embeddings.pkl", "wb") as f:
        pickle.dump(embeddings, f)

# Row i must describe words[i]; a stale cache would silently map neighbours to wrong words.
assert len(embeddings) == len(words), "embeddings.pkl does not match the current word list"

In [12]:
embeddings.shape

(8810, 384)

In [13]:
tree = KDTree(embeddings)

In [14]:
query = "Photography"

In [15]:
query_embedding = model.encode(query)

In [16]:
# Get the 1000 most relevant words/phrases
dd, ii = tree.query(query_embedding, k=1000)

In [17]:
# Choose spangrammables: neighbours whose letter count (non-letters ignored) is 6 to 13
spangrammable = [
    words[idx].strip()
    for idx in ii
    if 6 <= sum(1 for ch in words[idx].strip() if ch.isalpha()) <= 13
]

In [42]:
spangram = random.choice(spangrammable[:100])
spangram

'recordings'

In [43]:
subembeddings = [embeddings[i] for i in ii]
subwords = [words[i] for i in ii]
subtree = KDTree(subembeddings)

In [44]:
# Get words/phrases the most relevant to spangram
ddd, iii = subtree.query(model.encode(spangram), k=250)

In [45]:
n = 4
candidates: list[str] = []
candidate_n_cont_pool: set[str] = set()
for idx in iii:
    word = subwords[idx].strip().lower()
    word_set = generate_n_cont_set(word, n)
    # Accept a word only when none of its n-letter runs was contributed by an
    # earlier (closer) candidate.
    if candidate_n_cont_pool.isdisjoint(word_set):
        candidates.append(word)
        candidate_n_cont_pool |= word_set

In [46]:
candidates

['recordings',
 'audio',
 'tapes',
 'footage',
 'voices',
 'albums',
 'samples',
 'videos',
 'listening',
 'documents',
 'songs',
 'photographs',
 'clips',
 'music',
 'vids',
 'sound',
 'sessions',
 'cameras',
 'performing',
 'tracks',
 'stories',
 'reproduced',
 'gallery',
 'files',
 'media',
 'singing',
 'collection',
 'press',
 'acoustic',
 'streams',
 'pictures',
 'papers',
 'lessons',
 'images',
 'microphone',
 'experiences',
 'highlights',
 'journals',
 'reporters',
 'artists',
 'outputs',
 'speech',
 'diary',
 'observe',
 'techniques',
 'source',
 'scenes',
 'livecam',
 'uploaded',
 'browse',
 'vocal',
 'mpegs',
 'films',
 'snapshot',
 'noise',
 'watching',
 'slides',
 'during',
 'oral',
 'webcams',
 'poems',
 'show',
 'tour',
 'exhibits',
 'prints',
 'arrivals',
 'labels',
 'recent',
 'thumbnails',
 'stereo',
 'fotos',
 'these',
 'examines',
 'surveillance',
 'from',
 'data',
 'library',
 'pulse',
 'accessed',
 'draws',
 'practice',
 'texts',
 'remarks',
 'memory',
 'viewer',
 

In [50]:
# Separate the candidates into plain words and phrases.
# The first candidate is skipped (presumably it is the spangram itself — verify).
plain_words, phrases = [], []
for candidate in candidates[1:]:
    if len(candidate) > 3:
        bucket = plain_words if candidate.isalpha() else phrases
        bucket.append(candidate)

In [51]:
plain_lengths = [len(plain_word) for plain_word in plain_words]

In [52]:
spangram = "".join([char for char in spangram if char.isalpha()])
spangram

'recordings'

In [53]:
max_words = (48 - len(spangram))//4
max_words

9

In [54]:
num_words = random.choice(list(range(6, max_words + 1)))

In [55]:
num_words

6

In [56]:
budget = 48 - len(spangram)

In [57]:
# Randomly pick word lengths (without replacement from plain_lengths) until they
# sum to exactly `budget`: grow while under budget, drop picks when over it.
chosen = []
plain_lengths_cpy = plain_lengths[:]
while sum(chosen) != budget:
    remaining = budget - sum(chosen)
    if remaining > 0:
        if remaining in plain_lengths_cpy:
            # One available length closes the gap exactly.
            chosen.append(remaining)
            break
        if not plain_lengths_cpy:
            # Every length is already in `chosen` and the total is still short,
            # so no selection can ever reach the budget.
            raise ValueError("plain_words do not contain enough letters to fill the grid")
        new_chosen = random.choice(plain_lengths_cpy)
        plain_lengths_cpy.remove(new_chosen)
        chosen.append(new_chosen)
    else:
        overshoot = -remaining
        if overshoot in chosen:
            # Dropping a single pick lands exactly on the budget.
            chosen.remove(overshoot)
            break
        # Back off by returning one to three random picks to the pool.
        for attempt in range(random.choice([1, 1, 1, 2, 2, 3])):
            if not chosen:
                # Nothing left to give back; avoids random.choice on an empty list.
                break
            to_remove = random.choice(chosen)
            chosen.remove(to_remove)
            plain_lengths_cpy.append(to_remove)


In [58]:
chosen

[8, 7, 12, 5, 6]

In [59]:
# Bucket the plain words by their length
word_lens: dict[int, list[str]] = {}
for word in plain_words:
    word_lens.setdefault(len(word), []).append(word)

# Draw one not-yet-used word for each chosen length; the spangram stays first
chosen_words = [spangram]
for length in chosen:
    picked = random.choice(word_lens[length])
    word_lens[length].remove(picked)
    chosen_words.append(picked)

In [60]:
chosen_words

['recordings', 'arrivals', 'footage', 'transparency', 'pulse', 'slides']

In [34]:
# A spangram shorter than 8 letters is always laid out "ltr"; longer ones pick
# either direction at random.
spangram_direction = "ltr" if len(spangram) < 8 else random.choice(["ltr", "ttb"])

In [35]:
spangram_direction

'ltr'

In [36]:
# METHOD 1 - COIL
# grid = []
# for i in range(8):
#     row = []
#     for j in range(6):
#         row.append((i,j))
#     grid.append(row)

# if spangram_direction == "ttb":
#     grid = rotate_grid(grid)
# coord_lst = grid.pop(0)
# while len(grid) > 0:
#     grid = rotate_grid(grid)
#     coord_lst.extend(grid.pop(0))

In [145]:
# METHOD 2 - SNAKE
# Start from an 8x6 grid whose cells hold their own (row, col) coordinates
grid = [[(r, c) for c in range(6)] for r in range(8)]

if spangram_direction == "ttb":
    grid = rotate_grid(grid)

# Walk the rows in alternating directions, consuming the grid as we go
coord_lst = []
row = 0
while grid:
    line = grid.pop(0)
    coord_lst.extend(line if row % 2 == 0 else line[::-1])
    row += 1

In [146]:
non_spangrams = chosen_words[:]
non_spangrams.remove(spangram)

# Length of one snake row in the spangram's direction (loop-invariant, so computed once)
dir_len = 6 if spangram_direction == "ltr" else 8

# A combo is a set of words placed before the spangram. It is valid when the
# spangram then either starts at a row boundary or is long enough to run past
# the end of the following row.
valid_combos = []
for combo_size in range(len(non_spangrams) + 1):  # not `n`: that name is the n-gram size
    for combo in itertools.combinations(non_spangrams, combo_size):
        row_prefix_len = sum(len(item) for item in combo) % dir_len
        if row_prefix_len == 0 or row_prefix_len + len(spangram) >= 2 * dir_len:
            valid_combos.append(combo)

# Prefer the combos between the first and last entries (the first is always the
# empty combo, i.e. spangram placed first). Fall back to every valid combo when
# no such interior option exists instead of crashing on an empty slice.
interior_combos = valid_combos[1:-1]
chosen_combo = list(random.choice(interior_combos or valid_combos))
random.shuffle(chosen_combo)
for word in chosen_combo:
    non_spangrams.remove(word)
random.shuffle(non_spangrams)
chosen_words = chosen_combo + [spangram] + non_spangrams
spangram_idx = chosen_words.index(spangram)

In [147]:
coords: list[list[tuple[int,int]]] = []

# Each word takes the next len(word) cells of the snake path. Slicing instead of
# pop(0) leaves coord_lst intact, so this cell can be re-run on its own.
total_letters = sum(len(word) for word in chosen_words)
if total_letters > len(coord_lst):
    raise ValueError(
        f"chosen_words need {total_letters} cells but coord_lst only has {len(coord_lst)}"
    )

cursor = 0
for word in chosen_words:
    word_coords = coord_lst[cursor:cursor + len(word)]
    cursor += len(word)
    # Randomly lay the word forwards or backwards along its stretch of the path
    if random.choice([True, False]):
        coords.append(word_coords)
    else:
        coords.append(list(reversed(word_coords)))

In [148]:
def get_word_letter_idx(coords, letter_coords, words):
    """Find which word and which letter occupy the cell ``letter_coords``.

    Args:
        coords: per-word lists of cells, aligned with ``words``.
        letter_coords: the cell to look up.
        words: the words whose letters sit on the cells in ``coords``.

    Returns:
        ``(letter_idx, letter, word_idx, word)`` for the first word that
        contains the cell, or None when no word contains it.
    """
    for word_idx, word_coords in enumerate(coords):
        try:
            letter_idx = word_coords.index(letter_coords)
        except ValueError:
            # This word does not cover the cell; try the next one.
            continue
        word = words[word_idx]
        return letter_idx, word[letter_idx], word_idx, word
    return None

In [149]:
def check_word_continuity(word_coords: list[tuple[int, int]]) -> bool:
    """Check that each cell of a word touches the next one.

    Args:
        word_coords: the word's cells in letter order, one ``(row, col)`` pair
            per letter.

    Returns:
        True when every consecutive pair of cells differs by at most 1 in both
        row and column (diagonal steps allowed); also True for words of zero
        or one cell. False otherwise.
    """
    return all(
        abs(prev[0] - nxt[0]) <= 1 and abs(prev[1] - nxt[1]) <= 1
        for prev, nxt in zip(word_coords, word_coords[1:])
    )

In [150]:
# METHOD 1
# def spangram_valid(word_coords:list[list[tuple[int,int]]], direction: str):
#     first, last = word_coords[0], word_coords[-1]
#     if direction == "ltr":
#         return first[1] == 0 and last[1] == 5
#     else:
#         return first[0] == 0 and last[0] == 7

# METHOD 2
def spangram_valid(word_coords: list[tuple[int, int]], direction: str,
                   n_rows: int = 8, n_cols: int = 6) -> bool:
    """Check that the spangram touches both opposite edges of the grid.

    Args:
        word_coords: the spangram's ``(row, col)`` cells.
        direction: ``"ltr"`` requires a cell in the first and in the last
            column; any other value requires a cell in the first and in the
            last row.
        n_rows: number of grid rows (default 8).
        n_cols: number of grid columns (default 6).

    Returns:
        True when both required edges are touched.
    """
    if direction == "ltr":
        axis, far_edge = 1, n_cols - 1
    else:
        axis, far_edge = 0, n_rows - 1
    # Project the cells onto the relevant axis once instead of once per edge.
    touched = {coord[axis] for coord in word_coords}
    return 0 in touched and far_edge in touched

In [152]:
def shuffle_grid(letter_coords, words, n):
    """Randomise the layout by swapping neighbouring cells while keeping words traceable.

    Each iteration picks a random cell of the 8x6 grid and tries to exchange it
    with one of its neighbours (up to eight, tried in random order). A swap is
    accepted only when every affected word still passes
    ``check_word_continuity`` and, if the spangram is affected, ``spangram_valid``.

    Args:
        letter_coords: one list of ``(row, col)`` cells per word, aligned with
            ``words``. Every grid cell is presumably covered by some word (the
            lookup result is unpacked without a None check) — verify at the caller.
        words: the words laid out on the grid, in the same order as
            ``letter_coords``.
        n: number of accepted swaps to perform. Rejected attempts are not
            counted, so the loop runs until ``n`` swaps have succeeded.

    Returns:
        A new list of per-word coordinate lists; ``letter_coords`` itself is
        not modified.

    Note:
        Reads the module-level ``spangram_idx`` and ``spangram_direction``.
    """
    letter_coords_cpy = [word[:] for word in letter_coords]
    shuffles = 0

    while shuffles < n:
        a_coord = random.randint(0, 7), random.randint(0, 5)
        a_letter_idx, _a_letter, a_word_idx, _a_word = get_word_letter_idx(
            letter_coords_cpy, a_coord, words
        )

        # In-bounds neighbours of a_coord, excluding a_coord itself
        b_candidate_coords = []
        for i in range(-1, 2):
            row = a_coord[0] + i
            if row < 0 or row >= 8:
                continue
            for j in range(-1, 2):
                col = a_coord[1] + j
                if col < 0 or col >= 6 or a_coord == (row, col):
                    continue

                b_candidate_coords.append((row, col))

        random.shuffle(b_candidate_coords)
        for b_coord in b_candidate_coords:
            # Look the neighbour up in the `words` argument, not a notebook global.
            b_letter_idx, _b_letter, b_word_idx, _b_word = get_word_letter_idx(
                letter_coords_cpy, b_coord, words
            )
            if a_word_idx == b_word_idx:
                # Both cells belong to the same word: exchange them inside one path.
                possible_word_coords = list(letter_coords_cpy[a_word_idx])
                possible_word_coords[a_letter_idx] = b_coord
                possible_word_coords[b_letter_idx] = a_coord
                if a_word_idx == spangram_idx and not spangram_valid(possible_word_coords, spangram_direction):
                    continue
                if check_word_continuity(possible_word_coords):
                    letter_coords_cpy[a_word_idx] = possible_word_coords
                    shuffles += 1
                    break
            else:
                # Cells belong to different words: each word takes over the other's cell.
                possible_a_word_coords = letter_coords_cpy[a_word_idx][:]
                possible_a_word_coords[a_letter_idx] = b_coord

                possible_b_word_coords = letter_coords_cpy[b_word_idx][:]
                possible_b_word_coords[b_letter_idx] = a_coord

                if a_word_idx == spangram_idx and not spangram_valid(possible_a_word_coords, spangram_direction):
                    continue

                if b_word_idx == spangram_idx and not spangram_valid(possible_b_word_coords, spangram_direction):
                    continue

                if check_word_continuity(possible_a_word_coords) and check_word_continuity(possible_b_word_coords):
                    letter_coords_cpy[a_word_idx] = possible_a_word_coords
                    letter_coords_cpy[b_word_idx] = possible_b_word_coords
                    shuffles += 1
                    break
    return letter_coords_cpy

In [153]:
new_letter_coords = shuffle_grid(coords, chosen_words, 100000)

In [158]:
new_letter_coords

[[(0, 0), (0, 1), (0, 2), (1, 3), (2, 4)],
 [(1, 0), (1, 1), (1, 2), (0, 3), (0, 4), (0, 5), (1, 5)],
 [(2, 5),
  (1, 4),
  (2, 3),
  (3, 4),
  (3, 3),
  (2, 2),
  (3, 1),
  (2, 1),
  (2, 0),
  (3, 0)],
 [(5, 0), (4, 0), (4, 1), (4, 2), (3, 2), (4, 3)],
 [(6, 1),
  (6, 2),
  (5, 1),
  (5, 2),
  (6, 3),
  (5, 3),
  (6, 4),
  (5, 4),
  (5, 5),
  (4, 5),
  (4, 4),
  (3, 5)],
 [(7, 0), (6, 0), (7, 1), (7, 2), (7, 3), (7, 4), (6, 5), (7, 5)]]

In [154]:
# Blank 8x6 board, then drop every letter onto the cell assigned to it
grid = [[""] * 6 for _row in range(8)]
for word_coords, word in zip(new_letter_coords, chosen_words):
    for (r, c), letter in zip(word_coords, word):
        grid[r][c] = letter

In [155]:
grid

[['p', 'u', 'l', 't', 'a', 'g'],
 ['f', 'o', 'o', 's', 'e', 'e'],
 ['g', 'n', 'd', 'c', 'e', 'r'],
 ['s', 'i', 'e', 'r', 'o', 'y'],
 ['l', 'i', 'd', 's', 'c', 'n'],
 ['s', 'a', 'n', 'p', 'r', 'e'],
 ['r', 't', 'r', 's', 'a', 'l'],
 ['a', 'r', 'i', 'v', 'a', 's']]

In [156]:
print("\n\n".join(["  ".join(row) for row in grid]))

p  u  l  t  a  g

f  o  o  s  e  e

g  n  d  c  e  r

s  i  e  r  o  y

l  i  d  s  c  n

s  a  n  p  r  e

r  t  r  s  a  l

a  r  i  v  a  s


In [159]:
spangram_idx

2