In [27]:
from FlagEmbedding import FlagModel
import pickle
from scipy.spatial import KDTree
import random
import itertools
import numpy as np

In [2]:
def generate_n_cont_set(word, n):
    """Return the set of every contiguous substring of `word` with length `n`.

    A word shorter than `n` yields an empty set.
    """
    last_start = len(word) - n
    return {word[start:start + n] for start in range(last_start + 1)}

In [3]:
def shares_n_cont_letters(a: str, b: str, n: int):
    """Tell whether `a` and `b` have at least one length-`n` substring in common.

    Returns True as soon as any `n`-letter window of `b` also occurs as an
    `n`-letter window of `a`, otherwise False.
    """
    windows_of_a = {a[start:start + n] for start in range(len(a) - n + 1)}
    return any(
        b[start:start + n] in windows_of_a
        for start in range(len(b) - n + 1)
    )

In [401]:
def rotate_grid(grid):
    """Rotate a rectangular grid of coordinate tuples 90 degrees counter-clockwise.

    Parameters:
        grid: list of rows, each row a list of equally sized tuples.

    Returns:
        A new list of rows (the input is not modified) whose cells are tuples
        of plain Python numbers. Going through ``.tolist()`` keeps NumPy scalar
        types such as ``np.int64`` from leaking into the coordinates.
    """
    # np.array turns the rows-of-tuples into a 3-D array; rot90 rotates the
    # first two axes (rows/columns) and leaves each coordinate pair intact.
    rotated = np.rot90(np.array(grid))
    return [[tuple(cell) for cell in row] for row in rotated.tolist()]

In [4]:
# Load the 'BAAI/bge-small-en-v1.5' embedding model through FlagEmbedding.
# The retrieval instruction below is stored on the model object.
# use_fp16=True presumably enables half-precision inference — TODO confirm
# against the FlagEmbedding documentation.
model = FlagModel('BAAI/bge-small-en-v1.5',
                  query_instruction_for_retrieval="Generate a representation for this word for retrieving related words:",
                  use_fp16=True)



In [33]:
# Read the vocabulary, one entry per line. Each entry keeps its trailing
# newline, which is why later cells call .strip() before using a word.
with open("words.txt") as vocab_file:
    words = list(vocab_file)

In [6]:
# Embed every vocabulary line (raw, including its trailing newline).
# Slow: the recorded run of this cell took about 5 min 50 s.
embeddings = model.encode(words)

Inference Embeddings: 100%|██████████| 1823/1823 [05:50<00:00,  5.20it/s]


In [7]:
# Cache the embeddings on disk so the slow encode step need not be repeated.
# "wb+" truncates the file and opens it read/write; only writing happens here.
with open("embeddings.pkl", "wb+") as f:
    pickle.dump(embeddings, f)

In [8]:
# Reload the cached embeddings, replacing the in-memory value.
# SECURITY: pickle.load can execute arbitrary code; only load an
# embeddings.pkl that this notebook itself produced.
with open("embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

In [9]:
embeddings.shape

(466550, 384)

In [10]:
# Spatial index over all vocabulary embeddings for nearest-neighbour lookups.
tree = KDTree(embeddings)

In [485]:
# Seed text: its nearest vocabulary neighbours feed every later selection step.
query = "Photography"

In [486]:
# Embed the seed text with the same call used for the vocabulary.
# NOTE(review): the model was given query_instruction_for_retrieval, but
# encode() may not apply it (FlagEmbedding also offers encode_queries()) —
# confirm which one is intended.
query_embedding = model.encode(query)

In [487]:
# Get the 1000 vocabulary entries whose embeddings lie closest to the query.
# dd holds the distances, ii the matching indices into `words`/`embeddings`.
dd, ii = tree.query(query_embedding, k=1000)

In [488]:
# Collect pangram candidates among the query hits: entries containing exactly
# one hyphen and between 6 and 13 alphabetic characters (inclusive).
pangrammable = []
for idx in ii:
    entry = words[idx].strip()
    letter_count = sum(1 for ch in entry if ch.isalpha())
    if entry.count("-") == 1 and 6 <= letter_count <= 13:
        pangrammable.append(entry)

In [489]:
# Pick the pangram at random from the first (closest) 100 candidates.
# random.choice raises IndexError if no candidate passed the filter above.
pangram = random.choice(pangrammable[:100])
pangram

'sea-framing'

In [490]:
# Restrict the search space to the query hits and index them separately, so
# positions returned by `subtree` refer to `subwords` / `subembeddings`.
subembeddings = []
subwords = []
for idx in ii:
    subembeddings.append(embeddings[idx])
    subwords.append(words[idx])
subtree = KDTree(subembeddings)

In [491]:
# Among the query hits, get the 500 entries closest to the chosen pangram.
# ddd holds the distances, iii the indices into `subwords`.
ddd, iii = subtree.query(model.encode(pangram), k=500)

In [492]:
# Walk the pangram's neighbours in query order and keep a word only if none of
# its n-letter substrings has been seen in an already accepted word.
# Words shorter than n have no such substrings and are therefore always kept.
n = 4
candidates: list[str] = []
candidate_n_cont_pool: set[str] = set()
for idx in iii:
    word = subwords[idx].strip().lower()
    word_set = generate_n_cont_set(word, n)
    if candidate_n_cont_pool.isdisjoint(word_set):
        candidates.append(word)
        candidate_n_cont_pool |= word_set

In [493]:
candidates

['sea-framing',
 'oceanographies',
 'photofinishing',
 'picture-hanging',
 'watercolour',
 'artwork',
 'raying',
 'scenes',
 'portfolio',
 'stoving',
 'canons',
 'imagery',
 'painting',
 'optics',
 'shadings',
 'foregallery',
 'captivities',
 'shutters',
 'landscape',
 'lensing',
 'filmizing',
 'cannon-shot',
 'magnification',
 'visually',
 'techniques',
 'aerial',
 'piccaninnies',
 'backdrop',
 'close-up',
 'vision-directed',
 'outdoor',
 'artistic',
 'multicamerate',
 'art',
 'monocular',
 'perspectively',
 'outside',
 'semiexposed',
 'piceous',
 'short-cropped',
 'retouches',
 'picoted',
 'reflectance',
 'montages',
 'kodaking',
 'shoots',
 'tripodial',
 'saliencies',
 'pixels',
 'art.',
 'stills',
 'exhibited',
 'thumbnails',
 'camaraderies',
 'prints',
 'panoramas',
 'cams',
 'pic']

In [494]:
# Separate the candidates into plain words and phrases
plain_words = []
phrases = []
for candidate in candidates:
    if len(candidate) <= 3:
        continue
    if candidate.isalpha():
        plain_words.append(candidate)
    else:
        phrases.append(candidate)

In [495]:
plain_words

['oceanographies',
 'photofinishing',
 'watercolour',
 'artwork',
 'raying',
 'scenes',
 'portfolio',
 'stoving',
 'canons',
 'imagery',
 'painting',
 'optics',
 'shadings',
 'foregallery',
 'captivities',
 'shutters',
 'landscape',
 'lensing',
 'filmizing',
 'magnification',
 'visually',
 'techniques',
 'aerial',
 'piccaninnies',
 'backdrop',
 'outdoor',
 'artistic',
 'multicamerate',
 'monocular',
 'perspectively',
 'outside',
 'semiexposed',
 'piceous',
 'retouches',
 'picoted',
 'reflectance',
 'montages',
 'kodaking',
 'shoots',
 'tripodial',
 'saliencies',
 'pixels',
 'stills',
 'exhibited',
 'thumbnails',
 'camaraderies',
 'prints',
 'panoramas',
 'cams']

In [496]:
# Length of each plain word, in the same order as `plain_words`.
plain_lengths = list(map(len, plain_words))

In [497]:
# Reduce the pangram to the letters that will be placed on the grid: drop the
# hyphen (and any other non-letter) and lower-case it, so it matches the other
# words, which are lower-cased when the candidates are built.
pangram = "".join(char for char in pangram if char.isalpha()).lower()
pangram

'seaframing'

In [498]:
# Upper bound on how many further words could fit. 48 matches the 8x6 grid
# built later; the pangram occupies len(pangram) cells. The divisor 4 is
# presumably the minimum plain-word length (shorter candidates are dropped).
max_words = (48 - len(pangram))//4
max_words

9

In [499]:
# Draw a word count between 6 and max_words inclusive.
# NOTE(review): no later cell in the visible notebook reads num_words.
num_words = random.choice(range(6, max_words + 1))

In [500]:
num_words

9

In [501]:
# Number of letters the remaining words must supply in total: the 48 cells of
# the 8x6 grid minus those taken by the pangram.
budget = 48 - len(pangram)

In [502]:
# Choose a multiset of plain-word lengths whose sum equals `budget` exactly.
#
# The search is a random walk: while under budget, add a random unused length
# (or finish at once if a single unused length closes the gap); when over
# budget, drop the exact overshoot if it is among the chosen lengths, otherwise
# put 1-3 random lengths back and try again.
#
# The walk has no convergence guarantee, so it is capped at MAX_PICK_ATTEMPTS
# iterations and the degenerate cases (nothing left to add or to remove) are
# handled explicitly instead of surfacing as an IndexError from random.choice.
MAX_PICK_ATTEMPTS = 100_000

chosen = []
plain_lengths_cpy = plain_lengths[:]
for _attempt in range(MAX_PICK_ATTEMPTS):
    remaining = budget - sum(chosen)
    if remaining == 0:
        break
    if remaining > 0:
        if remaining in plain_lengths_cpy:
            # A single unused length fills the gap exactly.
            chosen.append(remaining)
            break
        if not plain_lengths_cpy:
            # Every available length is already chosen and still falls short.
            raise ValueError(
                f"available word lengths sum to {sum(chosen)}, "
                f"which cannot reach the budget of {budget}"
            )
        new_chosen = random.choice(plain_lengths_cpy)
        plain_lengths_cpy.remove(new_chosen)
        chosen.append(new_chosen)
    else:
        overshoot = -remaining
        if overshoot in chosen:
            # Dropping one chosen length lands exactly on the budget.
            chosen.remove(overshoot)
            break
        # Back off by returning a few random lengths to the pool.
        for _step in range(random.choice([1, 1, 1, 2, 2, 3])):
            if not chosen:
                break
            to_remove = random.choice(chosen)
            chosen.remove(to_remove)
            plain_lengths_cpy.append(to_remove)

if sum(chosen) != budget:
    raise RuntimeError(
        f"no combination of word lengths summing to {budget} was found "
        f"within {MAX_PICK_ATTEMPTS} attempts"
    )


In [503]:
chosen

[6, 11, 8, 13]

In [504]:
# Group the plain words by length.
word_lens: dict[int, list[str]] = {}
for word in plain_words:
    word_lens.setdefault(len(word), []).append(word)

# The pangram always comes first; then draw one distinct word for every chosen
# length, removing it from its group so it cannot be drawn twice.
chosen_words = [pangram]
for length in chosen:
    same_length_words = word_lens[length]
    chosen_word = random.choice(same_length_words)
    same_length_words.remove(chosen_word)
    chosen_words.append(chosen_word)

In [505]:
chosen_words

['seaframing', 'pixels', 'semiexposed', 'painting', 'multicamerate']

In [506]:
# A pangram shorter than 8 letters is always laid out "ltr"; longer ones get
# "ltr" or "ttb" at random.
pangram_direction = "ltr" if len(pangram) < 8 else random.choice(["ltr", "ttb"])

In [507]:
pangram_direction

'ttb'

In [508]:
# Build the 8-row by 6-column grid of (row, col) coordinates.
grid = [[(row_idx, col_idx) for col_idx in range(6)] for row_idx in range(8)]

# For a "ttb" pangram, rotate once up front so the first strip peeled off
# below is a column of the original grid instead of its top row.
if pangram_direction == "ttb":
    grid = rotate_grid(grid)

# Peel the grid like a spiral: take the current first row, rotate what is
# left, and repeat until nothing remains. coord_lst ends up listing every cell
# once, in that order.
coord_lst = grid.pop(0)
while grid:
    grid = rotate_grid(grid)
    coord_lst.extend(grid.pop(0))

In [509]:
# Hand out the spiral-ordered cells to the words in turn: each word consumes
# one cell per letter from the front of coord_lst (which is emptied as a
# side effect).
coords: list[list[tuple[int,int]]] = [
    [coord_lst.pop(0) for _letter in word]
    for word in chosen_words
]

In [510]:
def get_word_letter_idx(coords, letter_coords, words):
    """Find which word, and which letter of it, occupies a grid cell.

    Parameters:
        coords: per-word lists of cell coordinates, aligned with `words`.
        letter_coords: the cell to look up.
        words: the words whose letters sit on the cells in `coords`.

    Returns:
        ``(letter_idx, letter, word_idx, word)`` for the first word whose
        coordinate list contains `letter_coords`.

    Raises:
        ValueError: if no word occupies `letter_coords`. The message carries
            the full lookup state for debugging.
    """
    for word_idx, word_coords in enumerate(coords):
        try:
            letter_idx = word_coords.index(letter_coords)
        except ValueError:
            # This word does not cover the cell; try the next one.
            continue
        word = words[word_idx]
        return letter_idx, word[letter_idx], word_idx, word
    raise ValueError(
        f"no word occupies cell {letter_coords!r} "
        f"(coords: {coords!r}; words: {words!r})"
    )

In [511]:
def check_word_continuity(word_coords: list[tuple[int, int]]) -> bool:
    """Check that consecutive letters of one word sit on touching cells.

    Parameters:
        word_coords: the (row, col) cell of each letter of a single word, in
            letter order.

    Returns:
        True when every pair of consecutive cells differs by at most 1 in both
        row and column (horizontal, vertical or diagonal neighbours); also True
        for an empty or single-cell path.
    """
    return all(
        abs(here[0] - there[0]) <= 1 and abs(here[1] - there[1]) <= 1
        for here, there in zip(word_coords, word_coords[1:])
    )

In [512]:
def pangram_valid(word_coords: list[tuple[int, int]], direction: str,
                  n_rows: int = 8, n_cols: int = 6) -> bool:
    """Check that the pangram still spans the grid from edge to edge.

    Parameters:
        word_coords: the (row, col) cell of each pangram letter, in order.
        direction: "ltr" requires the first letter in column 0 and the last in
            the final column; any other value is treated as top-to-bottom and
            requires the first letter in row 0 and the last in the final row.
        n_rows: number of grid rows (default 8, the notebook's grid).
        n_cols: number of grid columns (default 6, the notebook's grid).

    Returns:
        True when the span condition holds; False otherwise, including for an
        empty coordinate list.
    """
    if not word_coords:
        return False
    first, last = word_coords[0], word_coords[-1]
    if direction == "ltr":
        return bool(first[1] == 0 and last[1] == n_cols - 1)
    return bool(first[0] == 0 and last[0] == n_rows - 1)

In [513]:
def shuffle_grid(letter_coords, words, n, direction=None):
    """Scramble the letter layout with `n` random swaps of neighbouring cells.

    Each accepted swap exchanges the letters on two touching cells of the
    8x6 grid. A swap is accepted only if every affected word remains a
    continuous path (see check_word_continuity) and, when the word at index 0
    (the pangram) is affected, it still spans the grid (see pangram_valid).

    Parameters:
        letter_coords: per-word lists of (row, col) cells, aligned with
            `words`; index 0 is treated as the pangram. Not modified.
        words: the words laid out on the grid.
        n: number of accepted swaps to perform.
        direction: pangram direction passed to pangram_valid. When omitted,
            the notebook-level variable `pangram_direction` is used, which is
            what this function always read before the parameter existed.

    Returns:
        A new list of per-word coordinate lists describing the shuffled layout.

    Note:
        Rejected swaps are not counted, so the loop runs until `n` swaps have
        been accepted; there is no cap on the number of attempts.
    """
    if direction is None:
        direction = pangram_direction

    letter_coords_cpy = [word[:] for word in letter_coords]
    shuffles = 0

    while shuffles < n:
        # Pick a random cell and find the letter that currently sits on it.
        a_coord = random.randint(0, 7), random.randint(0, 5)
        a_letter_idx, _, a_word_idx, _ = get_word_letter_idx(letter_coords_cpy, a_coord, words)

        # All in-bounds cells touching a_coord (up to 8), tried in random order.
        b_candidate_coords = [
            (row, col)
            for row in range(a_coord[0] - 1, a_coord[0] + 2)
            if 0 <= row < 8
            for col in range(a_coord[1] - 1, a_coord[1] + 2)
            if 0 <= col < 6 and (row, col) != a_coord
        ]
        random.shuffle(b_candidate_coords)

        for b_coord in b_candidate_coords:
            # Look up against the `words` argument, not a notebook global.
            b_letter_idx, _, b_word_idx, _ = get_word_letter_idx(letter_coords_cpy, b_coord, words)

            if a_word_idx == b_word_idx:
                # Both cells belong to the same word: swap two of its letters.
                possible_word_coords = letter_coords_cpy[a_word_idx][:]
                possible_word_coords[a_letter_idx] = b_coord
                possible_word_coords[b_letter_idx] = a_coord
                if a_word_idx == 0 and not pangram_valid(possible_word_coords, direction):
                    continue
                if check_word_continuity(possible_word_coords):
                    letter_coords_cpy[a_word_idx] = possible_word_coords
                    shuffles += 1
                    break
            else:
                # The cells belong to two words: each word's letter moves to
                # the other cell.
                possible_a_word_coords = letter_coords_cpy[a_word_idx][:]
                possible_a_word_coords[a_letter_idx] = b_coord

                possible_b_word_coords = letter_coords_cpy[b_word_idx][:]
                possible_b_word_coords[b_letter_idx] = a_coord

                if a_word_idx == 0 and not pangram_valid(possible_a_word_coords, direction):
                    continue

                if b_word_idx == 0 and not pangram_valid(possible_b_word_coords, direction):
                    continue

                if check_word_continuity(possible_a_word_coords) and check_word_continuity(possible_b_word_coords):
                    letter_coords_cpy[a_word_idx] = possible_a_word_coords
                    letter_coords_cpy[b_word_idx] = possible_b_word_coords
                    shuffles += 1
                    break
    return letter_coords_cpy

In [514]:
# Scramble the initial spiral layout with 100000 accepted neighbour swaps.
new_letter_coords = shuffle_grid(coords, chosen_words, 100000)

In [515]:
# Render the shuffled layout: start from an empty 8x6 grid and write every
# letter of every word onto the cell assigned to it.
grid = [[""] * 6 for _ in range(8)]
for word, word_coords in zip(chosen_words, new_letter_coords):
    for (row_idx, col_idx), letter in zip(word_coords, word):
        grid[row_idx][col_idx] = letter

In [516]:
grid

[['e', 'e', 'x', 'p', 'o', 's'],
 ['s', 'm', 'i', 'd', 'e', 's'],
 ['u', 'l', 'i', 'l', 'e', 'a'],
 ['m', 't', 'e', 'c', 's', 'f'],
 ['a', 'm', 'a', 'x', 'i', 'r'],
 ['p', 'i', 'e', 'r', 'a', 'p'],
 ['g', 't', 'n', 'a', 'm', 'i'],
 ['i', 'n', 't', 'e', 'n', 'g']]

In [517]:
# Pretty-print the grid: letters separated by two spaces, rows by a blank line.
print("\n\n".join(["  ".join(row) for row in grid]))

e  e  x  p  o  s

s  m  i  d  e  s

u  l  i  l  e  a

m  t  e  c  s  f

a  m  a  x  i  r

p  i  e  r  a  p

g  t  n  a  m  i

i  n  t  e  n  g


In [518]:
chosen_words

['seaframing', 'pixels', 'semiexposed', 'painting', 'multicamerate']