In [9]:
import itertools
import os

import gensim
import gensim.downloader
import numpy as np


In [10]:
# word2vec_path = "models/GoogleNews-vectors-negative300-SLIM"
# w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
#     word2vec_path + ".bin.gz", binary=True
# )
# w2v_model.save(word2vec_path + ".kv")
# del word2vec_path
# del w2v_model


In [None]:
# Name of the embedding to use. The commented-out lines are the other
# candidates kept here for quick switching.
# model = "glove-wiki-gigaword-300"
# model = "word2vec-google-news-300" # too big
model = "GoogleNews-vectors-negative300-SLIM"
# model = "fasttext-wiki-news-subwords-300"
# model = "glove-twitter-200"

# The vectors are cached as a gensim-native .kv file under models/ so later
# runs skip the download.
model_cache = os.path.join("models", f"{model}.kv")

word2vec: gensim.models.KeyedVectors
if os.path.isfile(model_cache):
    # Checking the file directly (instead of listing the directory) avoids a
    # FileNotFoundError when models/ has not been created yet.
    word2vec = gensim.models.KeyedVectors.load(model_cache)
else:
    # NOTE(review): the SLIM name does not look like a gensim-data identifier;
    # presumably its .kv file is expected to exist already -- confirm.
    word2vec = gensim.downloader.load(model)
    # Make sure the cache directory exists before writing into it.
    os.makedirs("models", exist_ok=True)
    word2vec.save(model_cache)

In [12]:
# Sanity check of the loaded vectors: the five nearest neighbours of
# "king" + "sailboat" - "sparrow".
word2vec.most_similar(positive=["king", "sailboat"], negative=["sparrow"], topn=5)


[('yacht', 0.5311225056648254),
 ('boat', 0.48425018787384033),
 ('catamaran', 0.4792361259460449),
 ('powerboat', 0.4713691473007202),
 ('ketch', 0.4468856751918793)]

In [13]:
# Create a codenames board
# Read one candidate word per line, trimmed and lower-cased.
with open("wordlist-eng.txt", encoding="utf-8") as f:
    words = [line.strip().lower() for line in f]
# Drop blank lines (they would otherwise become "" entries that can land on
# the board) and repeated words (replace=False only prevents picking the same
# list position twice, not the same word listed twice). dict.fromkeys keeps
# the first occurrence and preserves file order.
words = list(dict.fromkeys(word for word in words if word))
# Draw 25 distinct words and lay them out as a 5x5 grid.
board = np.random.choice(words, size=(5, 5), replace=False)
print(board)

[['mammoth' 'ray' 'change' 'knight' 'point']
 ['brush' 'drop' 'swing' 'mouth' 'crown']
 ['bow' 'bill' 'net' 'ham' 'kid']
 ['tablet' 'plot' 'chair' 'mexico' 'shoe']
 ['tooth' 'post' 'pin' 'lead' 'pirate']]


In [14]:
# Create a team card
# Draw 18 distinct board positions out of the 25; the draw order decides the
# roles: the first 8 are red, the next 9 are blue, the last one is the assassin.
special_words = np.random.choice(range(25), size=18, replace=False)
red_picks, blue_picks, assassin_pick = (
    special_words[:8],
    special_words[8:17],
    special_words[17],
)

# Index into a flattened copy of the board so positions are plain integers.
flat_board = board.flatten()
red_words = flat_board[red_picks]
blue_words = flat_board[blue_picks]
assassin = flat_board[assassin_pick]

print(
    f"Red Words: {red_words}",
    f"Blue Words: {blue_words}",
    f"Assassin: {assassin}",
    sep="\n",
)


Red Words: ['lead' 'chair' 'drop' 'swing' 'knight' 'tablet' 'pin' 'mammoth']
Blue Words: ['pirate' 'post' 'net' 'change' 'point' 'brush' 'crown' 'ham' 'mexico']
Assassin: mouth


In [15]:
def has_common_substring(a, b, min_length=3):
    """
    Check if any substring of length >= min_length from string `a` is in string `b`.

    The comparison is case-sensitive.

    :param a: First string
    :param b: Second string
    :param min_length: Minimum length of the substring (default: 3)
    :return: True if any substring of length >= min_length in `a` is in `b`, otherwise False
    """
    if min_length <= 0:
        # The empty substring is contained in every string, so any non-empty
        # `a` matches; an empty `a` yields no substrings to test.
        return len(a) > 0
    # If a longer substring of `a` occurs in `b`, so does its first
    # `min_length` characters, so testing the fixed-width windows is enough.
    return any(
        a[start:start + min_length] in b
        for start in range(len(a) - min_length + 1)
    )


In [None]:
# generate a 2-word clue for red team
# Placeholder clue: [clue word, similarity score, (target word, target word)].
clue = ["", 0, ("", "")]
# Words a red-team clue has to steer clear of. `assassin` is a single string,
# so it is wrapped in a list; list(assassin) would split it into characters.
bad_words = tuple(list(blue_words) + [assassin])


def find_best_hint(words):
    """
    Find the best single-word clue linking any two of the given words.

    For every pair of `words`, the 15 nearest neighbours of the pair are
    fetched from `word2vec`, and the first one that is a legal clue is kept
    as that pair's candidate. The candidate with the highest similarity
    score over all pairs wins.

    A neighbour is rejected as a clue when it is itself a word on the board
    or when it shares a substring with either target word (see
    `has_common_substring`). Both checks use the lower-cased neighbour,
    because the board words are lower-cased when the word list is read.

    :param words: Board words the clue should point at (e.g. one team's words)
    :return: ``[clue_word, score, (word_a, word_b)]``; the placeholder
        ``["", 0, ("", "")]`` if no pair produced a usable clue
    """
    best = ["", 0, ("", "")]
    for subset in itertools.combinations(words, 2):
        similars = word2vec.most_similar(positive=list(subset), topn=15)
        # Reset per pair so a rejected pair cannot reuse the previous pair's pick.
        similar = None
        for candidate, score in similars:
            lowered = candidate.lower()
            # make sure the clue is not a word on the board
            if lowered in flat_board:
                continue
            # make sure the clue is not a form of one of the prompts
            if any(has_common_substring(lowered, target) for target in subset):
                continue

            # if everything is good, pick this word as our clue
            similar = (candidate, score)
            break
        if similar is not None and similar[1] > best[1]:
            best = [similar[0], similar[1], subset]
    return best


clue = find_best_hint(red_words)
print("best clue:", clue)


('pirate', 'post') ('buccaneers', 0.4359492361545563)
('pirate', 'net') ('buccaneer', 0.45381754636764526)
('pirate', 'change') ('alter', 0.48260003328323364)
('pirate', 'point') ('Arrr', 0.40302774310112)
('pirate', 'brush') ('lubbers', 0.44780853390693665)
('pirate', 'crown') ('title', 0.5656847953796387)
('pirate', 'ham') ('eggers', 0.5003951787948608)
('pirate', 'mexico') ('caribbean', 0.5270410180091858)
('post', 'net') ('period', 0.4296586811542511)
('post', 'change') ('alter', 0.5471648573875427)
('post', 'point') ('juncture', 0.388338565826416)
('post', 'brush') ('deadfall', 0.38540056347846985)
('post', 'crown') ('title', 0.5740607976913452)
('post', 'ham') ('turkey', 0.4413197636604309)
('post', 'mexico') ('usa', 0.5434982776641846)
('net', 'change') ('adjustment', 0.5015605688095093)
('net', 'point') ('goal', 0.5194658637046814)
('net', 'brush') ('forehanded', 0.43671202659606934)
('net', 'crown') ('title', 0.5637132525444031)
('net', 'ham') ('escalopes', 0.4732958972454071)

In [24]:
def cos_similar(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product


# find the best 2-word pair
def find_best_subset(word_list: list, set_size=2):
    """Finds the best subset of words by consecutive removal of the worst word.

    Repeatedly asks `word2vec.doesnt_match` for the odd one out among the
    remaining words and drops it, until only `set_size` words are left.
    The caller's list is not modified; a new list is returned. A list that
    already has `set_size` words or fewer is returned as an unchanged copy.

    Note: a later definition in this cell reuses the name `find_best_subset`
    and shadows this one once it has been executed.

    :param word_list: Candidate words
    :param set_size: Number of words to keep (default: 2)
    :return: List of the surviving words, in their original relative order
    """
    remaining = list(word_list)
    while len(remaining) > set_size:
        remaining.remove(word2vec.doesnt_match(remaining))
    return remaining


# Report the greedy (worst-word-removal) result for several subset sizes.
# Each call gets a fresh list built from red_words.
print("words:", red_words)
for label, size in (
    ("worst removal:", 2),
    ("best pair:", 2),
    ("best 3:", 3),
    ("best 4:", 4),
):
    print(label, find_best_subset(list(red_words), size))


def find_best_subset(word_list: list, set_size=2):
    """Finds the best subset of words by rating all possible subsets.

    Every combination of `set_size` words is scored by the mean cosine
    similarity between each member's vector and the combination's centroid.
    The first combination with the strictly highest score above 0 wins.

    :param word_list: Candidate words
    :param set_size: Number of words per subset (default: 2)
    :return: ``(subset, score)``; ``([], 0)`` if no subset scored above 0
    """

    def cohesion(group):
        # Mean similarity of the group's vectors to their own centroid.
        vectors = [word2vec[member] for member in group]
        center = np.mean(vectors, axis=0)
        return np.mean([cos_similar(vector, center) for vector in vectors])

    winner, top_score = [], 0
    for group in itertools.combinations(word_list, set_size):
        score = cohesion(group)
        if score > top_score:
            winner, top_score = group, score
    return (winner, top_score)


# Report the exhaustive (all-combinations) result for the same subset sizes.
print("best grouping:")
for label, size in (("best pair:", 2), ("best 3:", 3), ("best 4:", 4)):
    print(label, find_best_subset(list(red_words), size))

words: ['lead' 'chair' 'drop' 'swing' 'knight' 'tablet' 'pin' 'mammoth']
worst removal: ['drop', 'swing']
best pair: ['drop', 'swing']
best 3: ['drop', 'swing', 'pin']
best 4: ['lead', 'drop', 'swing', 'pin']
best grouping:
best pair: (('chair', 'knight'), 0.7760649)
best 3: (('drop', 'swing', 'pin'), 0.6725464)
best 4: (('lead', 'drop', 'swing', 'pin'), 0.6030851)


In [None]:
# Start by finding the best cluster of 2, 3, or 4 words.

In [None]:
# TODO: get OpenCV up and running