## Generate Codenames Clues using FastText

In [1]:
## Import 

import numpy as np
from numpy.linalg import norm
from gensim.models import KeyedVectors
import gensim.downloader
from random import shuffle
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from tqdm.notebook import tqdm

In [2]:
## Load fastText vectors

# Module-level KeyedVectors instance loaded from disk (path is relative to
# the notebook's working directory). remove_unseen_words() consults its
# key_to_index mapping to decide which words have a vector.
fasttext = KeyedVectors.load('../data/fasttext_vectors.kv')

In [3]:
## Parameters

# Text file read line by line by load_codename_words(); each line is
# lower-cased, stripped, and has its spaces removed to form one game word.
CODENAME_WORDS_FILE = '../data/codename_words.txt'
# CSV read by load_hint_words() with the first column as index; the hint
# words are taken from its `hints` column.
HINT_WORDS_FILE = '../data/hint_words.csv'
# are_similar() treats two words as similar when their longest common
# substring has at least this many characters.
SUBSTR_LEN = 4
# NOTE(review): no function in this file reads this module-level value;
# CodenamesClueGenerator.give_hints() uses the class attribute SIM_THRESH.
SIM_THRESH = 0.50

In [4]:
## Utility Functions

def cosine_similarity(x, y):
    """Return the cosine similarity of two word vectors.

    The value is the dot product of ``x`` and ``y`` divided by the product
    of their Euclidean norms.

    Args:
        x: First vector (array-like of numbers).
        y: Second vector, same length as ``x``.

    Returns:
        The normalized dot product, or ``0.0`` when either vector has zero
        norm. The similarity is undefined in that case; without the guard
        the division yields NaN and a NumPy RuntimeWarning.
    """
    denom = norm(x) * norm(y)
    if denom == 0:
        return 0.0
    return np.dot(x, y) / denom


def long_comm_substr(s1, s2):
    """Return the length of the longest common substring of two strings.

    Uses the row-by-row dynamic-programming recurrence: the common run
    ending at ``s1[i]`` and ``s2[j]`` is one longer than the run ending at
    ``s1[i - 1]`` and ``s2[j - 1]`` when the two characters match, and zero
    otherwise.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Length of the longest contiguous run of characters present in both
        strings; ``0`` when either string is empty or nothing is shared.
    """
    best = 0
    # prev_row[j + 1] is the length of the common run ending at the previous
    # character of s1 and at s2[j]; index 0 is a zero sentinel.
    prev_row = [0] * (len(s2) + 1)
    for ch1 in s1:
        row = [0] * (len(s2) + 1)
        for j, ch2 in enumerate(s2):
            if ch1 == ch2:
                run = prev_row[j] + 1
                row[j + 1] = run
                if run > best:
                    best = run
        prev_row = row
    return best


def are_similar(porter, lancaster, w1, w2):
    """Return True if two words are "similar".

    Two words are similar when any of the following holds, checked in this
    order (cheapest first):

    1. one word contains the other;
    2. they have the same Porter stem;
    3. they have the same Lancaster stem;
    4. their longest common substring has at least ``SUBSTR_LEN`` characters
       (module-level constant).

    Args:
        porter: Object with a ``stem(word)`` method (an NLTK PorterStemmer
            in this file).
        lancaster: Object with a ``stem(word)`` method (an NLTK
            LancasterStemmer in this file).
        w1: First word.
        w2: Second word.

    Returns:
        bool: True if the words are similar by any rule above.
    """
    if w2 in w1 or w1 in w2:
        return True
    if porter.stem(w1) == porter.stem(w2):
        return True
    if lancaster.stem(w1) == lancaster.stem(w2):
        return True
    return long_comm_substr(w1, w2) >= SUBSTR_LEN


In [5]:
## Main functions
    
def load_hint_words():
    """Read the candidate hint words from the hint-words CSV.

    The CSV at ``HINT_WORDS_FILE`` is loaded with its first column as the
    index, and the ``hints`` column is converted to a list. Words without a
    fastText vector are then dropped via ``remove_unseen_words``.

    Note: Later change this to the list of words in fasttext.

    Returns:
        list: Hint words that are present in the fastText model.
    """
    hints_table = pd.read_csv(HINT_WORDS_FILE, index_col=0)
    candidates = hints_table.hints.tolist()
    return remove_unseen_words(candidates)


def load_codename_words():
    """Read the Codenames word list and normalize it for lookup.

    Every line of ``CODENAME_WORDS_FILE`` is lower-cased, stripped of
    surrounding whitespace, and has its spaces removed so that compound
    words become a single token. Words without a fastText vector are then
    dropped via ``remove_unseen_words``.

    Returns:
        list: Normalized game words that are present in the fastText model.
    """
    def _normalize(raw_word):
        # Lower-case, trim, then glue compound words together.
        return raw_word.lower().strip().replace(" ", "")

    with open(CODENAME_WORDS_FILE, 'r') as handle:
        raw_lines = handle.readlines()

    normalized = list(map(_normalize, raw_lines))

    # Keep only the words that have a fastText vector.
    return remove_unseen_words(normalized)


def get_example_board(codenames_words):
    """Deal a random example board from the given word list.

    A copy of ``codenames_words`` is shuffled (the argument itself is left
    untouched) and the first 25 words are split into four groups.

    Args:
        codenames_words: List of at least 25 game words.

    Returns:
        list: ``[team_words, enemy_words, neutral_words, double_agent_word]``
        holding 9, 8, 7 and 1 words respectively; each entry is a list.
    """
    deck = codenames_words.copy()
    shuffle(deck)

    team = deck[:9]            # team words - 9
    enemy = deck[9:17]         # enemy words - 8
    neutral = deck[17:24]      # neutral words - 7
    double_agent = [deck[24]]  # double agent word - 1

    return [team, enemy, neutral, double_agent]


def remove_unseen_words(inlist):
    """Return a copy of ``inlist`` without the words unknown to fastText.

    A word is kept when it is a key of the module-level ``fasttext`` model's
    ``key_to_index`` mapping. Each dropped word is reported on stdout. The
    list passed in is not modified.

    Args:
        inlist: Iterable of words to filter.

    Returns:
        list: The words of ``inlist`` found in the model, in original order.
    """
    kept = []
    for word in inlist:
        if word not in fasttext.key_to_index:
            print("[remove_unseen_words] OUTPUT: %s not found in fasttext -> excluded from words list"%word)
            continue
        kept.append(word)
    return kept


In [6]:
class CodenamesClueGenerator:
    """Suggest Codenames hints for one team from a precomputed similarity table.

    A board is a sequence of four lists:
    ``[team_words, enemy_words, neutral_words, double_agent_word]``.
    For every candidate hint (a row of ``sim_mat``) the generator counts how
    many of the board words most similar to that hint are team words whose
    similarity exceeds ``SIM_THRESH``, and proposes the hints with the
    highest count.
    """

    # Variables
    # Hint x board-word similarity table for the current board; replaced by
    # a DataFrame per instance whenever a board is set.
    game_similarity = []

    # NOTE(review): are_similar() reads the module-level SUBSTR_LEN, not this
    # class attribute.
    SUBSTR_LEN = 4
    # A team word only counts toward a hint's score when its similarity to
    # the hint is strictly greater than this value.
    SIM_THRESH = 0.5

    # Initialize stemmers for word similarity
    porter = PorterStemmer()
    lancaster = LancasterStemmer()

    # Load fastText vectors
    # NOTE(review): loaded once at class-definition time; no method of this
    # class references it.
    fasttext = KeyedVectors.load('../data/fasttext_vectors.kv')
    # Similarity table indexed by hint word, with one column per game word;
    # duplicate rows are dropped.
    sim_mat = pd.read_csv('../data/similarity_matrix.csv', index_col=0).drop_duplicates()

    # Functions
    def __init__(self, board):
        """Create a generator for ``board`` with an empty hint history.

        Args:
            board: ``[team_words, enemy_words, neutral_words,
                double_agent_word]``, each a list of words that are columns
                of ``sim_mat``.
        """
        self.past_hints = []
        self._set_board(board)

    def update(self, board):
        """Replace the current board, keeping the history of past hints.

        The similarity table is re-sliced from the full ``sim_mat`` so the
        new board may contain words that were not on the previous board.

        Args:
            board: Same structure as the ``board`` given to the constructor.
        """
        self._set_board(board)

    def _set_board(self, board):
        """Store ``board`` and derive the per-board lookup structures."""
        self.board = board

        # Unpack board info into separate lists
        (self.team_words, self.enemy_words,
         self.neutral_words, self.double_agent_word) = board

        # Get a list of all game words
        self.board_words = (self.team_words + self.enemy_words
                            + self.neutral_words + self.double_agent_word)

        # Get a similarity matrix for hint words x board words
        self.game_similarity = self.sim_mat.loc[:, self.board_words]

    def give_hints(self):
        """Find the hints that cover the most team words.

        For each hint, board words are visited in descending order of
        similarity; the hint's score is the number of leading words that
        belong to the team and exceed ``SIM_THRESH``. Hints already given
        (``past_hints``) or too similar to a board word (``valid_hint``)
        are skipped.

        The first of the best hints is appended to ``past_hints``.

        Returns:
            tuple: ``(best_words, best_score, best_corr_words)`` where
            ``best_words`` lists all hints tied for the best score and
            ``best_corr_words[i]`` lists the team words matched by
            ``best_words[i]``. When no hint is eligible the result is
            ``([], 0, [])`` and ``past_hints`` is left unchanged.
        """
        # Keep track of "best" hint words
        best_score = 0
        best_words = []
        best_corr_words = []

        team = set(self.team_words)
        threshold = self.SIM_THRESH

        # For each row (hint)
        for hint, similarities in self.game_similarity.iterrows():
            # Sort similarity scores for each game word x this hint
            ranked = similarities.sort_values(ascending=False)

            # Collect leading team words above the threshold; stop at the
            # first word that is not ours or not similar enough.
            corr_words = []
            for game_word, sim in ranked.items():
                if game_word not in team or not sim > threshold:
                    break
                corr_words.append(game_word)
            score = len(corr_words)

            # Cheap rejections first: stemming in valid_hint() is the
            # expensive part, so only run it for hints that can still win.
            if score < best_score:
                continue
            if hint in self.past_hints or not self.valid_hint(hint):
                continue

            if score > best_score:
                # New best: discard the previous ties.
                best_score = score
                best_words = [hint]
                best_corr_words = [corr_words]
            else:
                # Add this hint to list if tied
                best_words.append(hint)
                best_corr_words.append(corr_words)

        # Remember the first of the best hints so it is not proposed again.
        if best_words:
            self.past_hints.append(best_words[0])

        # Return all the tied-for-best hints
        return best_words, best_score, best_corr_words

    def remove_similar_hints(self, hint_words):
        """Return the hints that are not too similar to any board word.

        Note: the ``hint_words`` list passed in remains unchanged.
        Note: function not used in implementation; provided for test purposes.

        Args:
            hint_words: Iterable of candidate hint words.

        Returns:
            list: The hints for which ``valid_hint`` is True, in order.
        """
        return [h for h in tqdm(hint_words) if self.valid_hint(h)]

    def valid_hint(self, hint):
        """Return False if ``hint`` is too similar to any word on the board.

        Similarity is decided by ``are_similar`` using the class-level
        Porter and Lancaster stemmers.

        Args:
            hint: Candidate hint word.

        Returns:
            bool: True when no board word is similar to ``hint``.
        """
        return not any(
            are_similar(self.porter, self.lancaster, hint, board_word)
            for board_word in self.board_words
        )

In [7]:
## Test Cell

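# are_similar(PorterStemmer(), LancasterStemmer(), "happy", "happier")
# are_similar(PorterStemmer(), LancasterStemmer(), "clouds", "cloudy")
# are_similar(PorterStemmer(), LancasterStemmer(), "trouble", "troubling")
# are_similar(PorterStemmer(), LancasterStemmer(), "troubling", "trout")
# are_similar(PorterStemmer(), LancasterStemmer(), "nineteen", "eighteen")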


In [8]:
## Test Cell

# fasttext = KeyedVectors.load('../data/fasttext_vectors.kv')
# game_words = load_codename_words()
# board = get_example_board(game_words)

# CCG = CodenamesClueGenerator(board)

# hints = CCG.give_hints()
# print(hints)