### What are the words we care about collecting?
Ignore the ones we already have token files for

In [1]:
import datasets
import os

words_to_collect = []


"""
1) the words we want to collect data for
"""
men = datasets.get_men()
verbsim = datasets.get_verbsim()
ws353_rel = datasets.get_ws353_rel()
bless = datasets.get_bless()

# get all the words
all_words = []
for dataset in [men, verbsim, ws353_rel]:
    for row in dataset:
        w1 = row['word1']
        w2 = row['word2']
        all_words.append(w1)
        all_words.append(w2)
        
unique_words = set(all_words)

for word in unique_words:
    pathname = os.path.join("./data/word_data/", word)
    # collect this new word unless its not new and we already have data for it
    if not os.path.isdir(pathname):
        words_to_collect.append(word)
        

print("Total words between 353_rel, simverb, and men: %s" % len(all_words))
print("Unique words between 353_rel, simverb, and men: %s" % len(unique_words))
print("New words that we don't have tokens collected yet for yada yada: %s" % len(words_to_collect))

"""
2) the layers we want to analzye
"""
layers = [0,1,5,11]

"""
3) The cluster sizes we want to analyze
"""
cluster_sizes = [1,3,5,7]

processed 3000 word pairs from MEN relatedness dataset
processed 130 word pairs from VerbSim dataset
processed 252 word pairs from WordSim relatedness dataset
processed 26554 word pairs from BLESS dataset
Total words between 353_rel, simverb, men, and bless: 6764
Unique words between 353_rel, simverb, men, and bless: 1164
New words that we don't have tokens collected yet for yada yada: 708


In [2]:
# collect token data...again

In [4]:
import grinders

grinders.collect_bnc_tokens_for_words(words_to_collect, override=True, outfile='bnc_words_with_similarity_tokens.csv')


# Sentences in BNC corpus: 6026276
shuffling indexes
done shuffling
[250087, 3235411, 5844650, 3531151, 5392874, 5323871, 1656272, 1046852, 406747, 1658543, 4051637, 3046040, 1870259, 5595167, 5202262, 4889351, 2494284, 1804113, 5927710, 5076530, 2626732, 3786115, 1931190, 235848, 2489126, 387482, 1298999, 1215573, 173734, 1522570, 1835438, 91688, 312228, 1572779, 230989, 2050391, 3979353, 151342, 5708478, 5882787, 4196364, 3761369, 4966045, 5885394, 4460127, 4337645, 1563753, 1787210, 3741152, 4736237]
Processed 100000 sentences
Processed 200000 sentences
Processed 300000 sentences
Processed 400000 sentences
Processed 500000 sentences
Processed 600000 sentences
Processed 700000 sentences
Processed 800000 sentences
Processed 900000 sentences
Processed 1000000 sentences
Processed 1100000 sentences
Processed 1200000 sentences
Processed 1300000 sentences
Processed 1400000 sentences
Processed 1500000 sentences
Processed 1600000 sentences
Processed 1700000 sentences
Processed 1800000 senten

### Sort dataset of BNC tokens into files for each word

In [5]:
import os
import csv

ALLWORDS_DIR = './data/word_data'

# you already have tokens collected for each word in simlex and wordsim
# now these tokens ought to be sorted into their own files

# ensure that there is a word_data directory to store in our words
# you have to delete it first with rm -rf if we are reloading
# os.mkdir(ALLWORDS_DIR)


# create files for each word we care about
for word in words_to_collect:
    word_dir = os.path.join(ALLWORDS_DIR, word)
    os.mkdir(word_dir)


# read in the big long file
with open('./data/bnc_words_with_similarity_tokens.csv', mode="r") as infile:
    fieldnames = ["word", "sentence", "POS", "id"]
    reader = csv.DictReader(infile, delimiter="\t", quoting=csv.QUOTE_NONNUMERIC, fieldnames=fieldnames)
    
    # split the big long file into smaller, sorted files that are easier to process one at a time
    for row in reader:
        
        word = row["word"]
        text = row["sentence"]
        pos = row["POS"]
        uid = "BNC_" + str(int(row["id"]))

        # open file for this word to spit tokens into
        token_file = os.path.join(ALLWORDS_DIR, word, "BNC_tokens.csv")
        with open(token_file, mode="a") as outfile:
            # finally, write all of the info with the vector to disk
            writer = writer = csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow([word, text, pos, uid])

In [None]:
# check that we have files for the new words

In [6]:
print(words_to_collect[:100])

['anxiety', 'sculpture', 'surfer', 'pumpkin', 'downtown', 'yellow', 'cafe', 'reading', 'sphere', 'sunny', 'discovery', 'dude', 'jump', 'crew', 'zebra', 'trade', 'sport', 'crisis', 'concert', 'parking', 'row', 'clock', 'toy', 'lighthouse', 'forecast', 'arrangement', 'musician', 'cake', 'bead', 'keyboard', 'burger', 'windmill', 'handle', 'race', 'outfit', 'wave', 'network', 'auto', 'isolate', 'snowman', 'call', 'recovery', 'drive', 'archive', 'depression', 'arrival', 'town', 'war', 'mirror', 'reptile', 'strawberry', 'propose', 'gasoline', 'welcome', 'sand', 'credit', 'miniature', 'store', 'boardwalk', 'dragonfly', 'pug', 'military', 'research', 'lizard', 'dissipate', 'cheerleader', 'refine', 'tropical', 'temple', 'piano', 'racing', 'stone', 'interior', 'environment', 'traffic', 'candle', 'fly', 'painting', 'pattern', 'bruise', 'rainbow', 'seagull', 'hamster', 'whale', 'terminate', 'statue', 'wing', 'stripe', 'fingerprint', 'gold', 'pepper', 'blue', 'weapon', 'lily', 'noodle', 'black', 'b

In [None]:
# how many of each new word do we have?

In [12]:
import numpy as np
ALLWORDS_DIR = './data/word_data'


token_counts = []
n = 0

collected_words = words_to_collect
for word in words_to_collect:
    wordfile = os.path.join(ALLWORDS_DIR, word, 'BNC_tokens.csv')   

    try:
        with open(wordfile, mode="r") as infile:
            fieldnames = ["word", "sentence", "POS", "id"]
            reader = csv.DictReader(infile, delimiter="\t", quoting=csv.QUOTE_NONNUMERIC, fieldnames=fieldnames)

            count = 0
            for row in reader:
                count +=1
            token_counts.append(count)
            n = n+1
    except:
        print("No tokens collected for %s" % word)
        collected_words.remove(word)
average = np.sum(token_counts) / n
print("number of unique words between sl999 and ws353 we've collected tokens for: %s" % n)
print("average number of tokens per word: %s" % average)


No tokens collected for ipod
No tokens collected for figure out
number of unique words between sl999 and ws353 we've collected tokens for: 704
average number of tokens per word: 97.20170454545455


In [None]:
# Calculate and save clusters for these new words.

In [13]:
import os, shutil
import numpy as np
import bert_helper
import csv
from sklearn.cluster import KMeans


"""
for each word file we have, do the following:
    for each layer we care about, calculate the token embedding at that layer for each token
        for each number of clusters we care about, calculate the centroids of those clusters
        
store results in a file, one for each word+layer+cluster_number combo, resulting in a file structure like the following:

word_data/
  |-airplane/
  | |- bnc_tokens.csv
  | |- layer_0_k_1_clusters.csv
  | |   ...
  | |- layer_0_k_7_clusters.csv
  | |   ...
  | |- layer_11_k_7_clusters.csv
  
each cluster file is a csv with the following fields:
    word
    layer
    cluster_size_k
    cluster_number
    centroid
    token_ids

"""

(model, tokenizer) = bert_helper.initialize()

i = 0
for word in collected_words:
    i+=1
    if i % 100 == 0:
        print("processed %s words" % i)
        print("calculating clusters for %s" % word)

    # it's more efficient to collect all the vectors for all the layers at once,
    # since we calculate the whole activation network at once for each token
    vectors = []

    
    # create a directory to store all our clustering results in
    data_dir = './data/word_data'
    results_dir = os.path.join(data_dir, word, 'analysis_results')    
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)
    os.makedirs(results_dir)
    
    # read in the tokens for this word
    pathname = os.path.join(data_dir, word, 'BNC_tokens.csv')
    with open(pathname, mode='r') as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=["word", "sentence", "tag", "uid"])
        
        data = [row for row in reader]

        # generate embeddings for each token
        for row in data:
            sentence = row["sentence"]
            vector = bert_helper.get_bert_vectors_for(word, sentence, model, tokenizer)
            # if the token was too long we may not have succeeded in generating embeddings for it, in which case we will throw it out
            if vector != None:
                row["embedding"] = vector
            else:
                row["embedding"] = None
        data = list(filter(lambda row: row["embedding"] != None, data))

        for layer in layers:
            layer_vectors = [row["embedding"][layer] for row in data]
        
            for k in cluster_sizes:
                if len(data) >= k:
                    # calculate clusters
                    kmeans_obj = KMeans(n_clusters=k)
                    kmeans_obj.fit(layer_vectors)
                    label_list = kmeans_obj.labels_
                    cluster_centroids = kmeans_obj.cluster_centers_


                    # store clusternumber with data
                    for index,datapoint in enumerate(data):
                        datapoint['cluster_number'] = label_list[index]

                    # generate outfile name
                    filename = "layer_" + str(layer) + "_clusters_k_equals_" + str(k) + ".csv"
                    outpath = os.path.join(results_dir, filename)


                    with open(outpath, mode='w') as disk:
                        writer = csv.DictWriter(disk, delimiter='\t', fieldnames=['word', 'clusternumber', 'centroid', 'sentence_uids'])


                        # retrieve centroid for each cluster and uids of sentences in cluster:
                        for clusternumber in range(k):
                            sentence_uids = []
                            for index, datapoint in enumerate(data):
                                if datapoint['cluster_number'] == clusternumber:
                                    sentence_uids.append(datapoint['uid'])
                            out_data = {'word': word,
                                        'clusternumber': clusternumber,
                                        'centroid': cluster_centroids[clusternumber],
                                        'sentence_uids': sentence_uids}

                            # store in file
                            # write dta for this cluster
                            writer.writerow(out_data)

                else:
                    print("not enough tokens to make %s clusters for word: %s" % (k, word))




INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/gabriellachronis/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO:pytorch_pretrained_bert.modeling:extracting archive file /Users/gabriellachronis/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/9m/vzvx58rs51v_x5nm620fz4xr0000gn/T/tmpen4lrv07
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 3

tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo



tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo



tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo



tokenized sequence too long
['[CLS]', 'oh', ':', 'car', 'of', 'menace', ',', 'car', 'of', 'b', '##light', 'cars', 'the', 'atmosphere', 'ign', '##ite', 'greenhouse', 'warming', ',', 'havoc', 'forming', 'parkinson', 'must', 'see', 'us', 'right', 'we', "'", 're', 'au', 'fai', '##t', 'with', 'entropy', 'ga', '##ia', ',', 'eco', '##sphere', ',', 'syn', '##ergy', 'words', 'for', 'green', '##ing', 'but', 'their', 'meaning', "'", 's', 'a', 'linguistic', 'mystery', 'oh', ':', 'politicians', 'must', 'in', '##vent', 'worship', 'of', 'environment', 'gen', '##uf', '##le', '##cting', 'by', 'reflecting', 'words', 'of', 've', '##rdan', '##t', 'ba', '##ffle', '##ment', 'now', 'our', 'water', "'", 's', 'unfit', 'to', 'drink', 'too', 'much', 'aluminium', 'and', 'zinc', 'no', 'solution', 'to', 'pollution', 'no', '-', 'one', 'can', 'stand', 'the', 'stink', 'oh', ':', 'public', 'fi', '##lth', 'and', 'pest', '##ile', '##nce', 'highlights', 'private', 'op', '##ule', '##nce', 'does', 'the', 'glitter', ',', 'cl



tokenized sequence too long
['[CLS]', 'bel', '##grave', '##s', 'league', 'division', 'one', ';', 'aston', 'villa', 'three', ',', 'tottenham', 'hotspur', 'two', ';', 'crystal', 'palace', 'two', ',', 'derby', 'county', 'one', ';', 'liverpool', 'two', ',', 'sunderland', 'one', ';', 'luton', 'town', 'ni', '##l', ',', 'norwich', 'city', 'one', ';', 'manchester', 'city', 'one', ',', 'wimbledon', 'one', ';', 'nottingham', 'forest', 'one', ',', 'manchester', 'united', 'one', ';', 'queens', 'park', 'rangers', 'one', ',', 'coventry', 'city', 'ni', '##l', ';', 'sheffield', 'united', 'one', ',', 'chelsea', 'ni', '##l', ';', 'southampton', 'three', ',', 'everton', 'four', ';', 'division', 'two', ';', 'barnsley', 'one', ',', 'charlton', 'athletic', 'one', ';', 'brighton', 'and', 'hove', 'albion', 'one', ',', 'blackburn', 'rovers', 'ni', '##l', ';', 'bristol', 'rovers', 'one', ',', 'not', '##ts', 'county', 'one', ';', 'leicester', 'city', 'four', ',', 'middlesbrough', 'three', ';', 'mil', '##wall', '



tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo



tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo



tokenized sequence too long
['[CLS]', 'bel', '##grave', '##s', 'league', 'division', 'one', ';', 'aston', 'villa', 'three', ',', 'tottenham', 'hotspur', 'two', ';', 'crystal', 'palace', 'two', ',', 'derby', 'county', 'one', ';', 'liverpool', 'two', ',', 'sunderland', 'one', ';', 'luton', 'town', 'ni', '##l', ',', 'norwich', 'city', 'one', ';', 'manchester', 'city', 'one', ',', 'wimbledon', 'one', ';', 'nottingham', 'forest', 'one', ',', 'manchester', 'united', 'one', ';', 'queens', 'park', 'rangers', 'one', ',', 'coventry', 'city', 'ni', '##l', ';', 'sheffield', 'united', 'one', ',', 'chelsea', 'ni', '##l', ';', 'southampton', 'three', ',', 'everton', 'four', ';', 'division', 'two', ';', 'barnsley', 'one', ',', 'charlton', 'athletic', 'one', ';', 'brighton', 'and', 'hove', 'albion', 'one', ',', 'blackburn', 'rovers', 'ni', '##l', ';', 'bristol', 'rovers', 'one', ',', 'not', '##ts', 'county', 'one', ';', 'leicester', 'city', 'four', ',', 'middlesbrough', 'three', ';', 'mil', '##wall', '



tokenized sequence too long
['[CLS]', 'right', 'so', ',', 'you', 'know', ',', 'there', 'are', 'those', 'who', 'would', 'teach', 'that', 'jesus', 'he', 'would', 'die', 'for', 'our', 'sins', 'and', 'he', "'", 's', 'forgiven', 'us', 'sins', ',', 'but', 'only', 'those', 'who', 'come', 'to', 'him', ',', 'jesus', 'died', 'for', 'the', 'sin', 'of', 'the', 'whole', 'world', ',', 'for', 'every', 'man', ',', 'woman', ',', 'boy', 'and', 'girl', 'that', 'has', 'ever', 'lived', 'or', 'ever', 'will', 'live', ',', 'he', 'died', 'for', 'the', 'sin', 'of', 'the', 'whole', 'world', ',', 'not', 'just', 'for', 'those', 'even', 'who', 'lived', 'after', 'his', 'death', ',', 'that', "'", 's', 'why', 'it', 'talks', 'about', 'in', 'the', 'old', 'testament', 'people', 'like', 'abraham', 'looking', 'for', 'that', 'day', ',', 'and', 'so', 'jesus', 'who', 'in', ',', 'when', 'he', 'died', ',', 'because', 'he', "'", 's', 'eternal', ',', 'so', 'we', "'", 've', 'got', 'the', 'problems', 'with', 'time', ',', 'god', 'ha