# Creating a gold standard from existing Taboo cards

## Reading in and formatting the cards

The Taboo cards that our gold standard will be based on belong to Elizabeth's Canadian edition of Taboo, produced sometime in the 1990s or early 2000s.

In [7]:
# Name of the text file holding the transcribed Taboo cards; it is passed to
# read_in() further down in this cell.
FILENAME = "taboo_cards.txt"


def read_in(filename):
    """
    Load the transcribed Taboo card words stored in `filename`.

    The file is read as UTF-8. Every line is stripped of surrounding whitespace and
    lines that are empty after stripping are discarded. The surviving lines are
    returned as a list of (index, text) tuples, numbered from 0 in file order.
    """
    with open(filename, "r", encoding='utf-8') as handle:
        # Strip lazily, then keep only the entries that still have content.
        stripped = (raw.strip() for raw in handle)
        kept = [text for text in stripped if text != '']

    return list(enumerate(kept))
    
            
def format_cards(enum_list):
    """
    Turn an enumerated list (as produced by read_in()) into a dictionary of cards
    (key = MW, value = list of that card's TWs).

    Entries whose number is a multiple of six are taken as MWs. The entries in the
    next five list positions are that MW's TWs; the first character of each one
    (the leading dash) is dropped. Note that the first character is removed
    unconditionally, and that a card at the end of the list with fewer than five
    following entries simply gets a shorter TW list. If the same MW appears more
    than once, the later card replaces the earlier one.
    """
    return {
        headword: [entry[1:] for _, entry in enum_list[position + 1:position + 6]]
        for position, headword in enum_list
        if position % 6 == 0
    }
        

# Read the card file into an enumerated list of non-empty lines, then build the
# MW -> list-of-TWs dictionary that the following cells look words up in.
enum_lines = read_in(FILENAME)
cards = format_cards(enum_lines)

In [8]:
# Example: look up the five TWs stored for the MW 'syrup'
# (the value of this expression is shown as the cell output below).
cards['syrup']

['maple', 'pancakes', 'trees', 'sap', 'sweet']

## Computing semantic similarity with gensim

In [9]:
import gensim
# Load word vectors stored in binary word2vec format from a local file.
# NOTE(review): the file name suggests the 300-dimensional Google News vectors;
# the file must be present in the working directory - confirm where it comes from.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [29]:
# Commented-out single-pair similarity call, kept for reference:
# model.similarity('alpaca', 'dog')

# Example model similarity for MW / TWs: print the model's similarity score
# between the MW 'syrup' and each TW listed on its card.
for tw in cards['syrup']:
    print('Similarity of {0} to MW: {1}'.format(tw, model.similarity('syrup', tw)))

Similarity of maple to MW: 0.4812118709087372
Similarity of pancakes to MW: 0.4521259069442749
Similarity of trees to MW: 0.22967348992824554
Similarity of sap to MW: 0.40749743580818176
Similarity of sweet to MW: 0.35497915744781494


## Computing collocation measures