# initialize.ipynb: Run this file once to initialize the program directory.
#### It will:
####   - Install the necessary python modules
####   - Load in the FastText model and save to a file for faster load in future --> ../data/fasttext_vectors.kv
####   - Create a similarity matrix for hint words --> ../data/similarity_matrix.csv


In [1]:
## Installs

!pip install tqdm
!pip install nltk
!pip install ipynb



In [2]:
# Libraries 

import numpy as np
from numpy.linalg import norm
from gensim.models import KeyedVectors
import gensim.downloader
from random import shuffle
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from tqdm.notebook import tqdm

In [3]:
## Download FastText model and save to file (Runtime: ~6 mins)

import gensim.downloader

fasttext = gensim.downloader.load('fasttext-wiki-news-subwords-300')
fasttext.save('../data/fasttext_vectors.kv')


In [4]:
# Loading FastText model

fasttext = KeyedVectors.load('../data/fasttext_vectors.kv')

In [5]:
# String paths to necessary files

CODENAME_WORDS_FILE = '../data/codename_words.txt'
HINT_WORDS_FILE = '../data/hint_words.csv'

In [6]:
def cosine_similarity(x,y):
# Calculates cosine similarity of two word vectors via a normalized dot product 
    dp = np.dot(x,y) / (norm(x)*norm(y))
    return dp

def create_sim_mat(fasttext, hint_words, game_words):
# Stores the similarities of hint/game word combinations in a .csv file

    # Create a pandas df to store similarities of each game word/hint word pair
    data = pd.DataFrame(np.zeros((len(hint_words), len(game_words))))

    # For each hint word
    for i in tqdm(range(len(hint_words))):

        # Find fasttext word vector for hint word
        hwv = fasttext[hint_words[i]]

        # For each game word
        for j in range(len(game_words)):

            # Find fasttext word vector for hint word
            gwv = fasttext[game_words[j]]

            # Calculate and store cos similarity of game word and hint word
            data.iloc[i, j] = cosine_similarity(gwv, hwv)

    print(data.head())
    
    # Save dataframe to .csv file
    data.index = hint_words
    data.columns = game_words
    data.index = data.index.str.lower()
    data.columns = data.columns.str.lower()
    data.to_csv('../data/similarity_matrix.csv')
    
def load_hint_words():
# Returns a list of hint words read from csv file
# Note: Later change this to the list of words in fasttext
    hint_words = pd.read_csv(HINT_WORDS_FILE, index_col=0).hints.tolist()
    hint_words = remove_unseen_words(hint_words)
    return hint_words


def load_codename_words():
# Returns a list of words from codenames 
# Turns words lower case and removes spaces from compound words
# Removes a word from the game if it's not in fasttext model
    with open(CODENAME_WORDS_FILE, 'r') as f:
        uppercase_words = f.readlines()

    # Turn words lower case and remove spaces from compound words
    game_words = [w.lower().strip().replace(" ", "") for w in uppercase_words]
    
    # Removes a word from game if it has no corresponding fasttext word vector
    game_words = remove_unseen_words(game_words)
    
    # Return a list of game words
    return game_words

def remove_unseen_words(inlist):
# Removes words from list that are not found in fasttext model
# Returns a "cleaned" copy of list
# Note: original list passed in remains unaltered
    outlist = []
    for w in inlist:
        if w in fasttext.key_to_index:
            outlist.append(w)
        else:
            print("[remove_unseen_words] OUTPUT: %s not found in fasttext -> excluded from words list"%w)
            continue
    return outlist


In [7]:
## Create similarity matrix and save to file (Runtime: ~18 mins)

# Load the hint words and the game (codenames) words
hint_words = load_hint_words()
game_words = load_codename_words()

# Store similarity scores of hint/game words in .csv file
create_sim_mat(fasttext, hint_words, game_words)


[remove_unseen_words] OUTPUT: pdas not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: zus not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: anaheim not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: mpegs not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: greensboro not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: usps not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: mrna not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: cdna not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: liechtenstein not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: swaziland not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: lochness not found in fasttext -> excluded from words list
[remove_unseen_words] OUTPUT: scubadiv

  0%|          | 0/8792 [00:00<?, ?it/s]

KeyboardInterrupt: 