### What are the things we care about?

In [4]:
import datasets
import os
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import spearmanr
from scipy.spatial.distance import cosine

# 1) The words we want to collect data for
ws353 = datasets.get_ws353()
simlex999 = datasets.get_simlex999()

# Every word that occurs in a ws353 pair: all first members, then all second members
first_word, second_word = [], []
for pair in ws353:
    first_word.append(pair['word1'])
    second_word.append(pair['word2'])
ws353_wordlist = first_word + second_word

# Same thing for the simlex999 pairs
first_word, second_word = [], []
for pair in simlex999:
    first_word.append(pair['word1'])
    second_word.append(pair['word2'])
simlex999_wordlist = first_word + second_word

# Words shared by the two datasets (or repeated within one) are collapsed by the set
all_words = ws353_wordlist + simlex999_wordlist
print("Total words between 353 and simlex: %s" % len(all_words))
unique_words = set(all_words)
print("Unique words between 353 and simlex: %s" % len(unique_words))

# 2) The layers we want to analyze
layers = [0, 1, 5, 11]

# 3) The cluster sizes we want to analyze
cluster_sizes = [1, 3, 5, 7]

processed 203 word pairs from WordSim similarity dataset
processed 999 word pairs from simlex999 dataset
Total words between 353 and simlex: 2404
Unique words between 353 and simlex: 1224


### Sort dataset of BNC tokens into files for each word

In [7]:
import os
import csv

ALLWORDS_DIR = './data/word_data'

# The tokens for every word in simlex and wordsim have already been collected into
# one big tab-separated file. Here they are sorted into one file per word, which is
# easier to process one word at a time.

# Make sure every word we care about has its own directory. exist_ok makes the cell
# safe to re-run without deleting ./data/word_data by hand first.
for word in unique_words:
    word_dir = os.path.join(ALLWORDS_DIR, word)
    os.makedirs(word_dir, exist_ok=True)


# Read the big long file once and group its rows by word.
# newline="" is what the csv module asks for so that quoted fields containing line
# breaks are handled correctly.
rows_by_word = {}
with open('./data/bnc_tokens_353_and_simlex.csv', mode="r", newline="") as infile:
    fieldnames = ["word", "sentence", "POS", "id"]
    reader = csv.DictReader(infile, delimiter="\t", quoting=csv.QUOTE_NONNUMERIC, fieldnames=fieldnames)

    for row in reader:
        word = row["word"]
        text = row["sentence"]
        pos = row["POS"]
        # QUOTE_NONNUMERIC hands unquoted fields back as floats, hence the int() round trip
        uid = "BNC_" + str(int(row["id"]))
        rows_by_word.setdefault(word, []).append([word, text, pos, uid])


# Write each word's tokens in one go. Mode "w" (instead of appending row by row)
# means a re-run replaces the file rather than duplicating every token in it.
# A word in the input that is not in unique_words has no directory, so this still
# fails loudly for it, as before.
for word, token_rows in rows_by_word.items():
    token_file = os.path.join(ALLWORDS_DIR, word, "BNC_tokens.csv")
    with open(token_file, mode="w", newline="") as outfile:
        writer = csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)
        writer.writerows(token_rows)

In [None]:
### How many instances per word do you have?


In [40]:
import numpy as np
ALLWORDS_DIR = './data/word_data'

fieldnames = ["word", "sentence", "POS", "id"]

# One entry per word: the number of rows (tokens) in that word's BNC_tokens.csv
token_counts = []
for word in unique_words:
    wordfile = os.path.join(ALLWORDS_DIR, word, 'BNC_tokens.csv')
    with open(wordfile, mode="r") as infile:
        reader = csv.DictReader(infile, delimiter="\t", quoting=csv.QUOTE_NONNUMERIC, fieldnames=fieldnames)
        # exhaust the reader, counting the rows it yields
        count = sum(1 for _ in reader)
    token_counts.append(count)

# number of words whose token file was read
n = len(token_counts)

average = np.sum(token_counts) / n
print("number of unique words between sl999 and ws353 we've collected tokens for: %s" % n)
print("average number of tokens per word: %s" % average)


number of unique words between sl999 and ws353 we've collected tokens for: 1224
average number of tokens per word: 100.25490196078431


### Calculate the cluster centroids for each word, layer, and k

In [2]:
import os, shutil
import numpy as np
import bert_helper
import csv
from sklearn.cluster import KMeans


"""
for each word file we have, do the following:
    for each layer we care about, calculate the token embedding at that layer for each token
        for each number of clusters we care about, calculate the centroids of those clusters

store results in a file, one for each word+layer+cluster_number combo, resulting in a file structure like the following:

word_data/
  |-airplane/
  | |- BNC_tokens.csv
  | |- analysis_results/
  | | |- layer_0_clusters_k_equals_1.csv
  | | |   ...
  | | |- layer_0_clusters_k_equals_7.csv
  | | |   ...
  | | |- layer_11_clusters_k_equals_7.csv

each cluster file is a tab-separated csv without a header row; it has one row per cluster with the fields:
    word
    clusternumber
    centroid
    sentence_uids

"""

(model, tokenizer) = bert_helper.initialize()

data_dir = './data/word_data'
cluster_fieldnames = ['word', 'clusternumber', 'centroid', 'sentence_uids']

# KMeans starts from random centroids; pinning the seed makes a re-run of this cell
# produce the same clusters.
kmeans_seed = 0

for i, word in enumerate(unique_words, start=1):
    if i % 100 == 0:
        print("processed %s words" % i)
        print("calculating clusters for %s" % word)

    # create a fresh directory to store all our clustering results in,
    # throwing away whatever an earlier run left behind
    results_dir = os.path.join(data_dir, word, 'analysis_results')
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)
    os.makedirs(results_dir)

    # read in the tokens for this word
    pathname = os.path.join(data_dir, word, 'BNC_tokens.csv')
    with open(pathname, mode='r', newline='') as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=["word", "sentence", "tag", "uid"])
        data = list(reader)

    # Generate embeddings for each token. It's more efficient to collect the vectors
    # for all the layers at once, since the whole network is run once per token;
    # the result is indexed by layer below.
    for row in data:
        row["embedding"] = bert_helper.get_bert_vectors_for(word, row["sentence"], model, tokenizer)

    # If the token was too long we may not have succeeded in generating embeddings
    # for it (the helper gives back None), in which case we throw it out.
    # `is not None` rather than `!= None`: an embedding object may define its own
    # comparison operators.
    data = [row for row in data if row["embedding"] is not None]

    for layer in layers:
        layer_vectors = [row["embedding"][layer] for row in data]

        for k in cluster_sizes:
            if len(data) < k:
                print("not enough tokens to make %s clusters for word: %s" % (k, word))
                continue

            # calculate clusters
            kmeans_obj = KMeans(n_clusters=k, random_state=kmeans_seed)
            kmeans_obj.fit(layer_vectors)
            label_list = kmeans_obj.labels_
            cluster_centroids = kmeans_obj.cluster_centers_

            # store the cluster number with each token, and collect the uids of the
            # sentences in each cluster in a single pass (token order is preserved)
            uids_by_cluster = [[] for _ in range(k)]
            for datapoint, label in zip(data, label_list):
                datapoint['cluster_number'] = label
                uids_by_cluster[label].append(datapoint['uid'])

            # generate outfile name
            filename = "layer_" + str(layer) + "_clusters_k_equals_" + str(k) + ".csv"
            outpath = os.path.join(results_dir, filename)

            with open(outpath, mode='w', newline='') as disk:
                writer = csv.DictWriter(disk, delimiter='\t', fieldnames=cluster_fieldnames)

                # one row per cluster: its centroid and the uids of its sentences
                for clusternumber in range(k):
                    # NOTE(review): the centroid is written as str(ndarray), i.e. a
                    # bracketed, whitespace-separated, line-wrapped list rounded to
                    # numpy's print precision. The evaluation cells parse exactly this
                    # representation, so it is deliberately left unchanged here.
                    writer.writerow({'word': word,
                                     'clusternumber': clusternumber,
                                     'centroid': cluster_centroids[clusternumber],
                                     'sentence_uids': uids_by_cluster[clusternumber]})




INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/gabriellachronis/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO:pytorch_pretrained_bert.modeling:extracting archive file /Users/gabriellachronis/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/9m/vzvx58rs51v_x5nm620fz4xr0000gn/T/tmpv164mjyy
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 3

tokenized sequence too long
['[CLS]', 'example', '3', ':', '1', 'limitation', 'on', 'liability', 'of', 'original', 'tenant', '(', '1', ')', 'in', 'this', 'clause', '"', 'the', 'original', 'tenant', '"', 'means', 'the', 'said', '…', 'only', 'and', 'this', 'clause', 'applies', 'to', 'any', 'period', 'after', 'the', 'term', 'here', '##by', 'granted', 'cease', '##s', 'to', 'be', 'vested', 'in', 'the', 'original', 'tenant', '(', '2', ')', 'if', 'and', 'so', 'often', 'as', 'the', 'tenant', 'fails', 'to', 'pay', 'the', 'rent', 'or', 'any', 'other', 'sum', 'properly', 'due', 'under', 'this', 'lease', 'or', 'commits', 'any', 'breach', 'of', 'covenant', 'known', 'to', 'the', 'landlord', 'then', 'the', 'landlord', 'shall', 'forth', '##with', 'not', '##ify', 'the', 'original', 'tenant', 'of', 'that', 'fact', '(', '3', ')', 'the', 'landlord', 'shall', 'not', 'be', 'entitled', 'to', 'recover', 'from', 'the', 'original', 'tenant', 'any', 'ar', '##rea', '##rs', 'of', 'rent', 'or', 'other', 'sums', 'pa



tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo



tokenized sequence too long
['[CLS]', 'christ', 'god', 'dealt', 'with', 'the', 'problem', 'which', 'spoiled', 'his', 'image', 'in', 'us', 'and', 'he', 'has', 'to', 'do', 'it', 'because', 'of', 'fundamental', 'thing', ',', 'he', "'", 's', 'got', 'ta', 'do', 'it', 'from', 'the', 'centre', ',', 'you', 'know', 'you', 'can', 'get', 'an', 'apple', ',', 'an', 'ordinary', 'apple', 'and', 'you', 'can', 'polish', 'it', 'up', 'and', 'you', 'can', 'have', 'it', 'so', 'that', 'it', "'", 's', 'bright', 'and', 'glistening', 'and', 'the', 'red', 'is', 'almost', 'you', 'know', 'it', ',', 'it', ',', 'it', ',', 'it', 'almost', 'da', '##zzle', '##s', 'you', 'the', 'shining', 'on', 'it', ',', 'it', "'", 's', 'got', 'a', 'real', 'good', 'polish', 'on', 'the', 'skin', ',', 'but', 'inside', ',', 'there', "'", 's', 'a', 'gr', '##ub', ',', 'and', 'all', 'the', 'polish', '##ing', 'in', 'the', 'world', 'does', 'n', "'", 't', 'get', 'rid', 'of', 'the', 'gr', '##ub', ',', 'and', 'you', 'see', 'that', "'", 's', 'so'



tokenized sequence too long
['[CLS]', 'if', 'we', "'", 've', 'come', 'to', 'guard', 'and', 'received', 'forgiveness', 'of', 'sins', ',', 'if', 'we', 'have', 'become', 'good', 'followers', 'of', 'jesus', 'christ', 'and', 'we', 'are', 'not', 'amazed', 'then', 'there', "'", 's', 'something', 'wrong', 'with', 'what', 'we', "'", 've', 'received', 'that', 'god', 'should', 'so', 'love', ',', 'not', 'just', 'the', 'world', ',', 'but', 'should', 'so', 'love', 'me', ',', 'that', 'he', 'gave', 'his', 'son', 'to', 'die', 'for', 'me', 'and', 'that', 'was', 'the', 'sort', 'of', 'er', 'discovery', 'that', 'these', 'four', 'le', '##pers', 'made', 'they', "'", 've', 'come', 'down', 'there', ',', 'they', "'", 've', 'found', 'that', 'the', 'sight', 'before', 'them', 'was', 'amazing', ',', 'there', 'was', 'no', 'enemy', 'there', ',', 'the', 'enemy', 'had', 'disappeared', 'and', 'the', 'tents', 'with', 'all', 'their', 'contents', 'were', 'there', 'before', 'them', ',', 'they', 'were', 'amazed', 'with', 'wh

not enough tokens to make 5 clusters for word: disorganize
not enough tokens to make 7 clusters for word: disorganize
not enough tokens to make 5 clusters for word: disorganize
not enough tokens to make 7 clusters for word: disorganize
not enough tokens to make 5 clusters for word: disorganize
not enough tokens to make 7 clusters for word: disorganize
not enough tokens to make 5 clusters for word: disorganize
not enough tokens to make 7 clusters for word: disorganize
processed 300 words
calculating clusters for cabbage




tokenized sequence too long
['[CLS]', 'yeah', ',', 'well', 'let', 'me', 'just', 'read', 'you', 'two', 'or', 'three', 'verses', 'from', 'exodus', ',', 'chapter', 'forty', ',', 'this', 'is', 'what', 'it', 'says', 'then', 'the', 'cloud', 'covered', 'the', 'tent', 'of', 'meeting', 'and', 'the', 'glory', 'of', 'the', 'lord', 'filled', 'the', 'tab', '##ern', '##acle', 'and', 'moses', 'was', 'not', 'able', 'to', 'enter', 'the', 'tent', 'of', 'meeting', 'because', 'the', 'cloud', 'had', 'settled', 'on', 'it', 'and', 'the', 'glory', 'of', 'the', 'lord', 'filled', 'the', 'tab', '##ern', '##acle', 'and', 'throughout', 'all', 'their', 'journeys', 'whenever', 'the', 'cloud', 'was', 'taken', 'up', 'from', 'over', 'the', 'tab', '##ern', '##acle', 'the', 'sons', 'of', 'israel', 'would', 'set', 'out', ',', 'but', 'if', 'the', 'cloud', 'was', 'not', 'taken', 'up', 'then', 'they', 'did', 'not', 'set', 'out', 'until', 'the', 'day', 'that', 'it', 'was', 'taken', 'up', ',', 'for', 'throughout', 'all', 'thei



tokenized sequence too long
['[CLS]', 'example', '3', ':', '1', 'limitation', 'on', 'liability', 'of', 'original', 'tenant', '(', '1', ')', 'in', 'this', 'clause', '"', 'the', 'original', 'tenant', '"', 'means', 'the', 'said', '…', 'only', 'and', 'this', 'clause', 'applies', 'to', 'any', 'period', 'after', 'the', 'term', 'here', '##by', 'granted', 'cease', '##s', 'to', 'be', 'vested', 'in', 'the', 'original', 'tenant', '(', '2', ')', 'if', 'and', 'so', 'often', 'as', 'the', 'tenant', 'fails', 'to', 'pay', 'the', 'rent', 'or', 'any', 'other', 'sum', 'properly', 'due', 'under', 'this', 'lease', 'or', 'commits', 'any', 'breach', 'of', 'covenant', 'known', 'to', 'the', 'landlord', 'then', 'the', 'landlord', 'shall', 'forth', '##with', 'not', '##ify', 'the', 'original', 'tenant', 'of', 'that', 'fact', '(', '3', ')', 'the', 'landlord', 'shall', 'not', 'be', 'entitled', 'to', 'recover', 'from', 'the', 'original', 'tenant', 'any', 'ar', '##rea', '##rs', 'of', 'rent', 'or', 'other', 'sums', 'pa



tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo



tokenized sequence too long
['[CLS]', 'jesus', 'had', 'had', 'many', 'interviews', 'with', 'people', ',', 'we', "'", 've', 'looked', 'at', 'some', 'of', 'them', 'over', 'these', 'past', 'few', 'weeks', ',', 'the', 'time', 'when', 'he', 'met', 'with', 'nic', '##ade', '##mus', ',', 'the', 'religious', 'leader', ',', 'the', 'time', 'he', 'went', 'out', 'of', 'his', 'way', 'to', 'meet', 'with', 'a', 'woman', 'of', 'se', '##mar', '##ia', 'in', 'her', 'dyer', 'need', ',', 'the', 'other', 'occasion', 'that', 'we', 'looked', 'at', 'er', 'a', 'week', 'or', 'so', 'back', 'when', 'he', 'called', 'an', '##zaki', '##as', 'from', 'that', 'tree', 'of', 'which', 'he', 'was', 'hiding', ',', 'last', 'week', 'his', 'judge', ',', 'pilot', ',', 'but', 'of', 'all', 'those', 'interviews', 'and', 'as', 'many', 'others', 'that', 'we', 'have', 'n', "'", 't', 'looked', 'at', 'this', 'surely', 'must', 'be', 'one', 'of', 'the', 'strange', '##st', 'as', 'jesus', 'himself', 'is', 'in', 'the', 'process', 'of', 'dying



tokenized sequence too long
['[CLS]', 'over', 'these', 'er', '##m', 'past', 'couple', 'of', 'weeks', 'we', "'", 've', 'been', 'looking', 'at', 'er', 'some', 'of', 'the', 'questions', 'in', 'the', 'new', 'testament', ',', 'we', 'thought', 'a', 'couple', 'of', 'weeks', 'back', 'of', 'the', 'question', 'that', 'jesus', 'asked', 'his', 'disciples', ',', 'do', 'you', 'think', 'i', "'", 'm', 'able', 'to', 'do', 'this', 'and', 'then', 'last', 'week', 'we', 'looked', 'at', 'a', 'question', 'that', 'the', 'disciples', 'put', 'to', 'jesus', ',', 'that', 'time', 'when', 'they', 'came', 'down', 'from', 'the', 'mountain', 'and', 'they', 'found', 'the', 're', ',', 'three', 'of', 'them', 'came', 'down', 'with', 'jesus', 'from', 'the', 'mountain', 'of', 'trans', '##fi', '##gur', '##ation', 'and', 'they', 'found', 'the', 'other', 'disciples', 'with', 'a', 'man', 'who', 'and', 'a', ',', 'whose', 'son', 'was', 'demon', 'possessed', 'and', 'er', 'they', 'had', 'been', 'unable', 'to', 'help', 'him', 'and',



tokenized sequence too long
['[CLS]', 'as', 'i', 'mentioned', 'early', 'the', ',', 'the', 'city', 'of', 'ser', '##mar', '##ia', 'it', 'was', 'under', 'siege', 'and', 'the', 'army', 'of', 'ser', '##ia', 'was', 'en', '##camp', '##ed', 'all', 'around', 'it', ',', 'ben', 'had', '##ad', 'was', 'a', 'great', 'warrior', ',', 'he', 'would', 'of', 'been', 'the', ',', 'the', 'alexander', 'or', 'the', 'napoleon', 'of', 'his', 'day', 'and', 'he', 'had', 'set', 'up', 'this', 'en', '##camp', '##ment', 'around', 'the', 'city', 'of', 'ser', '##mar', '##ia', ',', 'nobody', 'could', 'get', 'in', ',', 'nobody', 'could', 'get', 'out', 'and', 'very', 'quickly', 'the', 'stocks', 'of', 'food', 'and', 'water', 'er', 'were', 'used', 'up', ',', 'ratio', '##ning', 'would', 'of', 'been', 'introduced', 'but', 'it', 'only', 'lasted', 'for', 'a', 'certain', 'period', ',', 'they', "'", 'd', 'got', 'to', 'the', 'stage', 'it', 'tells', 'us', 'in', 'the', 'previous', 'chapter', 'that', 'er', ',', 'that', 'a', 'donkey', 



tokenized sequence too long
['[CLS]', 'jesus', 'had', 'had', 'many', 'interviews', 'with', 'people', ',', 'we', "'", 've', 'looked', 'at', 'some', 'of', 'them', 'over', 'these', 'past', 'few', 'weeks', ',', 'the', 'time', 'when', 'he', 'met', 'with', 'nic', '##ade', '##mus', ',', 'the', 'religious', 'leader', ',', 'the', 'time', 'he', 'went', 'out', 'of', 'his', 'way', 'to', 'meet', 'with', 'a', 'woman', 'of', 'se', '##mar', '##ia', 'in', 'her', 'dyer', 'need', ',', 'the', 'other', 'occasion', 'that', 'we', 'looked', 'at', 'er', 'a', 'week', 'or', 'so', 'back', 'when', 'he', 'called', 'an', '##zaki', '##as', 'from', 'that', 'tree', 'of', 'which', 'he', 'was', 'hiding', ',', 'last', 'week', 'his', 'judge', ',', 'pilot', ',', 'but', 'of', 'all', 'those', 'interviews', 'and', 'as', 'many', 'others', 'that', 'we', 'have', 'n', "'", 't', 'looked', 'at', 'this', 'surely', 'must', 'be', 'one', 'of', 'the', 'strange', '##st', 'as', 'jesus', 'himself', 'is', 'in', 'the', 'process', 'of', 'dying



tokenized sequence too long
['[CLS]', 'as', 'i', 'mentioned', 'early', 'the', ',', 'the', 'city', 'of', 'ser', '##mar', '##ia', 'it', 'was', 'under', 'siege', 'and', 'the', 'army', 'of', 'ser', '##ia', 'was', 'en', '##camp', '##ed', 'all', 'around', 'it', ',', 'ben', 'had', '##ad', 'was', 'a', 'great', 'warrior', ',', 'he', 'would', 'of', 'been', 'the', ',', 'the', 'alexander', 'or', 'the', 'napoleon', 'of', 'his', 'day', 'and', 'he', 'had', 'set', 'up', 'this', 'en', '##camp', '##ment', 'around', 'the', 'city', 'of', 'ser', '##mar', '##ia', ',', 'nobody', 'could', 'get', 'in', ',', 'nobody', 'could', 'get', 'out', 'and', 'very', 'quickly', 'the', 'stocks', 'of', 'food', 'and', 'water', 'er', 'were', 'used', 'up', ',', 'ratio', '##ning', 'would', 'of', 'been', 'introduced', 'but', 'it', 'only', 'lasted', 'for', 'a', 'certain', 'period', ',', 'they', "'", 'd', 'got', 'to', 'the', 'stage', 'it', 'tells', 'us', 'in', 'the', 'previous', 'chapter', 'that', 'er', ',', 'that', 'a', 'donkey', 



tokenized sequence too long
['[CLS]', 'as', 'i', 'mentioned', 'early', 'the', ',', 'the', 'city', 'of', 'ser', '##mar', '##ia', 'it', 'was', 'under', 'siege', 'and', 'the', 'army', 'of', 'ser', '##ia', 'was', 'en', '##camp', '##ed', 'all', 'around', 'it', ',', 'ben', 'had', '##ad', 'was', 'a', 'great', 'warrior', ',', 'he', 'would', 'of', 'been', 'the', ',', 'the', 'alexander', 'or', 'the', 'napoleon', 'of', 'his', 'day', 'and', 'he', 'had', 'set', 'up', 'this', 'en', '##camp', '##ment', 'around', 'the', 'city', 'of', 'ser', '##mar', '##ia', ',', 'nobody', 'could', 'get', 'in', ',', 'nobody', 'could', 'get', 'out', 'and', 'very', 'quickly', 'the', 'stocks', 'of', 'food', 'and', 'water', 'er', 'were', 'used', 'up', ',', 'ratio', '##ning', 'would', 'of', 'been', 'introduced', 'but', 'it', 'only', 'lasted', 'for', 'a', 'certain', 'period', ',', 'they', "'", 'd', 'got', 'to', 'the', 'stage', 'it', 'tells', 'us', 'in', 'the', 'previous', 'chapter', 'that', 'er', ',', 'that', 'a', 'donkey', 

processed 900 words
calculating clusters for bizarre




tokenized sequence too long
['[CLS]', 'i', 'suppose', 'this', 'is', 'er', '##m', ',', 'a', 'car', '##ica', '##ture', ',', 'a', 'self', 'portrait', 'this', 'little', ',', 'i', ',', 'in', 'fact', 'i', 'was', ',', 'really', 'actually', 'hilarious', 'as', 'i', 'felt', 'that', ',', 'i', ',', 'i', ',', 'do', 'n', "'", 't', 'actually', 'know', 'an', 'awful', 'lot', 'about', 'ga', '##ug', '##in', ',', 'but', 'if', ',', 'if', 'i', 'knew', 'nothing', 'about', 'him', 'at', 'all', ',', 'i', 'would', 'of', 'thought', 'he', 'was', 'having', 'a', 'bit', 'of', 'joke', 'of', 'himself', 'with', 'this', ',', 'but', 'er', ',', 'being', 'the', 'person', 'that', 'he', 'was', 'i', 'ca', 'n', "'", 't', 'image', 'that', 'he', 'had', 'that', 'quality', ',', 'that', ',', 'i', 'do', 'n', "'", 't', 'believe', 'he', 'would', 'be', 'laughing', 'at', 'himself', ',', 'er', '##m', ',', 'er', '##m', ',', 'the', 'symbolism', 'er', '##m', 'and', 'conflict', 'of', 'this', 'painting', 'its', 'da', '##zzle', '##s', 'me', 'mo

### Now we will evaluate each combination of layer and num_clusters against similarity gold standards

In [13]:
"""
In the end we want two data structures that looks like this:

layer k_clusters ws353_pearson p ws353_spearman p ws353_n simlex_pearson p  simlex_spearman p  simlex_n
0     1          .77             .73              200     .54              .49                 988
....  ....       ....
0     7          .88             .80              180     .54              .65                 950
1     1          ....
...   ....       ....
11    7          ....



"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.spatial.distance import cosine



results_file = './data/bnc_cluster_analysis_ws353_similarity_results.csv'
fieldnames = ['layer', 'k_clusters', 'pearson', 'pearson_P', 'spearman', 'spearman_P', 'N']
# column layout of the per-word cluster files, which are read without a header row
cluster_fieldnames = ['word', 'clusternumber', 'centroid', 'sentence_uids']

# One result row is written per (layer, k) combination: the Pearson and Spearman
# correlation between the gold ws353 similarity and the maxsim prediction, i.e. the
# highest cosine similarity between any centroid of word1 and any centroid of word2.
with open(results_file, mode='w', newline='') as disk:
    writer = csv.DictWriter(disk, delimiter='\t', fieldnames=fieldnames)

    for layer_number in layers:
        for k in cluster_sizes:
            cluster_filename = "layer_" + str(layer_number) + "_clusters_k_equals_" + str(k) + ".csv"

            # Centroids already loaded for this (layer, k). Words occur in several
            # pairs, so each cluster file is read once instead of once per pair.
            # None marks a word that has no cluster file.
            centroids_by_word = {}

            # calc sim for all the word pairs
            for row in ws353:
                word1 = row['word1']
                word2 = row['word2']

                # get centroid data for these words at this layer and this k size
                for word in (word1, word2):
                    if word not in centroids_by_word:
                        cluster_path = os.path.join('./data/word_data/', word, 'analysis_results', cluster_filename)
                        try:
                            with open(cluster_path, mode='r', newline='') as csv_file:
                                reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=cluster_fieldnames)
                                # The centroid column holds the str() of a numpy vector:
                                # "[ 0.1 -0.2 ... ]", possibly wrapped over several lines.
                                # Strip the brackets and split on any whitespace. Slicing
                                # with [2:-2] would drop the sign of a negative first
                                # coordinate and the last digit of the final one.
                                # Builtin float: the np.float alias is gone in NumPy >= 1.24.
                                centroids_by_word[word] = [
                                    np.array(line['centroid'].strip().strip('[]').split(), dtype=float)
                                    for line in reader
                                ]
                        except FileNotFoundError:
                            centroids_by_word[word] = None

                    if not centroids_by_word[word]:
                        print("can't calculate predicted similarity for pair %s, %s" % (word1, word2))
                        print("   no cluster centroids for %s at layer %s with k=%s" % (word, layer_number, k))

                centroids1 = centroids_by_word[word1]
                centroids2 = centroids_by_word[word2]

                # calculate maxsim: the max over the similarities of each pair of
                # cluster centroids of both words, only if we have centroids for both
                if centroids1 and centroids2:
                    row['predicted_similarity'] = max(
                        1 - cosine(centroid1, centroid2)
                        for centroid1 in centroids1
                        for centroid2 in centroids2
                    )
                else:
                    row['predicted_similarity'] = None

            # pairs without a prediction cannot be correlated with the gold standard
            data = [row for row in ws353 if row['predicted_similarity'] is not None]

            # create data frame
            df = pd.DataFrame.from_records(data)
            X = df['predicted_similarity']
            y = df['similarity']

            # run pearson expected vs observed
            pearson_value = pearsonr(X, y)

            # run spearman expected vs observed
            spearman_value = spearmanr(X, y)

            # save results to file
            output = {'layer': layer_number,
                      'k_clusters': k,
                      'pearson': pearson_value[0],
                      'pearson_P': pearson_value[1],
                      'spearman': spearman_value[0],
                      'spearman_P': spearman_value[1],
                      'N': len(df)
                     }
            writer.writerow(output)
            
            
            



### Do the same for simlex999

In [19]:
results_file = './data/bnc_cluster_analysis_simlex999_similarity_results.csv'
fieldnames = ['layer', 'k_clusters', 'pearson', 'pearson_P', 'spearman', 'spearman_P', 'N']
# column layout of the per-word cluster files, which are read without a header row
cluster_fieldnames = ['word', 'clusternumber', 'centroid', 'sentence_uids']

# One result row is written per (layer, k) combination: the Pearson and Spearman
# correlation between the gold SimLex999 score and the maxsim prediction, i.e. the
# highest cosine similarity between any centroid of word1 and any centroid of word2.
with open(results_file, mode='w', newline='') as disk:
    writer = csv.DictWriter(disk, delimiter='\t', fieldnames=fieldnames)

    for layer_number in layers:
        for k in cluster_sizes:
            cluster_filename = "layer_" + str(layer_number) + "_clusters_k_equals_" + str(k) + ".csv"

            # Centroids already loaded for this (layer, k). Words occur in several
            # pairs, so each cluster file is read once instead of once per pair.
            # None marks a word that has no cluster file.
            centroids_by_word = {}

            # calc sim for all the word pairs
            for row in simlex999:
                word1 = row['word1']
                word2 = row['word2']

                # get centroid data for these words at this layer and this k size
                for word in (word1, word2):
                    if word not in centroids_by_word:
                        cluster_path = os.path.join('./data/word_data/', word, 'analysis_results', cluster_filename)
                        # Only a missing cluster file is an expected condition; a bare
                        # except here would also hide parse errors and Ctrl-C.
                        try:
                            with open(cluster_path, mode='r', newline='') as csv_file:
                                reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=cluster_fieldnames)
                                # The centroid column holds the str() of a numpy vector:
                                # "[ 0.1 -0.2 ... ]", possibly wrapped over several lines.
                                # Strip the brackets and split on any whitespace. Slicing
                                # with [2:-2] would drop the sign of a negative first
                                # coordinate and the last digit of the final one.
                                # Builtin float: the np.float alias is gone in NumPy >= 1.24.
                                centroids_by_word[word] = [
                                    np.array(line['centroid'].strip().strip('[]').split(), dtype=float)
                                    for line in reader
                                ]
                        except FileNotFoundError:
                            centroids_by_word[word] = None

                    if not centroids_by_word[word]:
                        # The file is also missing when the word has tokens, but fewer
                        # than k of them, so the message no longer claims "no tokens".
                        print("can't calculate predicted similarity for pair %s, %s" % (word1, word2))
                        print("   no cluster centroids for %s at layer %s with k=%s" % (word, layer_number, k))

                centroids1 = centroids_by_word[word1]
                centroids2 = centroids_by_word[word2]

                # calculate maxsim: the max over the similarities of each pair of
                # cluster centroids of both words, only if we have centroids for both
                if centroids1 and centroids2:
                    row['predicted_similarity'] = max(
                        1 - cosine(centroid1, centroid2)
                        for centroid1 in centroids1
                        for centroid2 in centroids2
                    )
                else:
                    row['predicted_similarity'] = None

            # remove pairs from consideration for which we have no predicted similarity to compare
            data = [row for row in simlex999 if row['predicted_similarity'] is not None]

            # create data frame
            df = pd.DataFrame.from_records(data)
            X = df['predicted_similarity']
            y = df['SimLex999']

            # run pearson expected vs observed
            pearson_value = pearsonr(X, y)

            # run spearman expected vs observed
            spearman_value = spearmanr(X, y)

            # save results to file
            output = {'layer': layer_number,
                      'k_clusters': k,
                      'pearson': pearson_value[0],
                      'pearson_P': pearson_value[1],
                      'spearman': spearman_value[0],
                      'spearman_P': spearman_value[1],
                      'N': len(df)
                     }
            writer.writerow(output)

can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similari

### Try avgsim

In [28]:
"""
In the end we want two data structures that looks like this:

layer k_clusters ws353_pearson p ws353_spearman p ws353_n simlex_pearson p  simlex_spearman p  simlex_n
0     1          .77             .73              200     .54              .49                 988
....  ....       ....
0     7          .88             .80              180     .54              .65                 950
1     1          ....
...   ....       ....
11    7          ....



"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import spearmanr
from scipy.spatial.distance import cosine



# AvgSim evaluation on the ws353 pairs.
# For every (layer, k) combination: score each word pair with the mean cosine
# similarity over all pairs of the two words' cluster centroids, correlate those
# scores with the ratings in row['similarity'], and write one tab-separated
# result row (no header row is written).
results_file = './data/bnc_cluster_analysis_ws353_avgsim_similarity_results.csv'
fieldnames = ['layer', 'k_clusters', 'pearson', 'pearson_P', 'spearman', 'spearman_P', 'N']
# newline='' is what the csv module asks for on files handed to a csv writer
with open(results_file, mode='w', newline='') as disk:
    writer = csv.DictWriter(disk, delimiter='\t', fieldnames=fieldnames)

    for layer_number in layers:
        for k in cluster_sizes:
            # both words of a pair read a file with this name from their own directory
            cluster_filename = "layer_" + str(layer_number) + "_clusters_k_equals_" + str(k) + ".csv"

            # calc sim for all the word pairs
            data = ws353
            for row in data:
                word1 = row['word1']
                word2 = row['word2']

                # get centroid data for these words at this layer and this k size
                # (a missing cluster file raises here and aborts the run)
                pairwise_centroids = {}
                for word in [word1, word2]:
                    cluster_path = os.path.join('./data/word_data/', word, 'analysis_results', cluster_filename)
                    with open(cluster_path, mode='r') as csv_file:
                        # separate name so the results writer's column list above is not shadowed
                        cluster_fieldnames = ['word', 'clusternumber', 'centroid', 'sentence_uids']
                        reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=cluster_fieldnames)

                        # [2:-2] drops two characters at each end of the stored text
                        # (presumably the brackets of a stringified array -- confirm against the writer).
                        # dtype is the builtin float: np.float was only an alias for it and has
                        # been removed from current NumPy releases.
                        pairwise_centroids[word] = [
                            np.fromstring(line['centroid'][2:-2], dtype=float, sep=' ')
                            for line in reader
                        ]

                # calculate avgsim
                # cosine() is a distance, so 1 - cosine() is the similarity of one centroid pair
                predicted_similarities = [
                    1 - cosine(centroid1, centroid2)
                    for centroid1 in pairwise_centroids[word1]
                    for centroid2 in pairwise_centroids[word2]
                ]
                # average over every centroid pair. The previous `np.sum(...) / k*k` parsed as
                # (sum / k) * k and therefore returned the plain sum; the mean equals
                # sum / (k * k) whenever both words have k centroids.
                avg_sim = np.mean(predicted_similarities)

                row['predicted_similarity'] = avg_sim

            # create data frame
            df = pd.DataFrame.from_records(data)
            X = df['predicted_similarity']
            y = df['similarity']

            # run pearson expected vs observed
            pearson_value = pearsonr(X, y)

            # run spearman expected vs observed
            spearman_value = spearmanr(X, y)

            # save results to file
            output = {'layer': layer_number,
                      'k_clusters': k,
                      'pearson': pearson_value[0],
                      'pearson_P': pearson_value[1],
                      'spearman': spearman_value[0],
                      'spearman_P': spearman_value[1],
                      'N': len(df)
                     }
            writer.writerow(output)
            
            
# AvgSim evaluation on the simlex999 pairs.
# For every (layer, k) combination: score each word pair with the mean cosine
# similarity over all pairs of the two words' cluster centroids, drop pairs for
# which a word has no cluster file, correlate the remaining scores with
# row['SimLex999'], and write one tab-separated result row (no header row is written).
results_file = './data/bnc_cluster_analysis_simlex999_avgsim_similarity_results.csv'
fieldnames = ['layer', 'k_clusters', 'pearson', 'pearson_P', 'spearman', 'spearman_P', 'N']
# newline='' is what the csv module asks for on files handed to a csv writer
with open(results_file, mode='w', newline='') as disk:
    writer = csv.DictWriter(disk, delimiter='\t', fieldnames=fieldnames)

    for layer_number in layers:
        for k in cluster_sizes:
            # both words of a pair read a file with this name from their own directory
            cluster_filename = "layer_" + str(layer_number) + "_clusters_k_equals_" + str(k) + ".csv"

            # calc sim for all the word pairs
            data = simlex999
            for row in data:
                word1 = row['word1']
                word2 = row['word2']

                # get centroid data for these words at this layer and this k size
                pairwise_centroids = {}
                for word in [word1, word2]:
                    cluster_path = os.path.join('./data/word_data/', word, 'analysis_results', cluster_filename)

                    try:
                        with open(cluster_path, mode='r') as csv_file:
                            # separate name so the results writer's column list above is not shadowed
                            cluster_fieldnames = ['word', 'clusternumber', 'centroid', 'sentence_uids']
                            reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=cluster_fieldnames)

                            # [2:-2] drops two characters at each end of the stored text
                            # (presumably the brackets of a stringified array -- confirm against the writer).
                            # dtype is the builtin float: np.float was only an alias for it and has
                            # been removed from current NumPy releases.
                            word_centroids = [
                                np.fromstring(line['centroid'][2:-2], dtype=float, sep=' ')
                                for line in reader
                            ]
                        pairwise_centroids[word] = word_centroids
                    except OSError:
                        # only an unreadable/missing cluster file means "no tokens"; any other
                        # error is a real problem and is allowed to surface instead of being hidden
                        print("can't calculate predicted similarity for pair %s, %s" %(word1, word2))
                        print("   no tokens collected for %s" % word)

                # calculate avgsim
                # only if we have centroids for both words
                if (word1 in pairwise_centroids) and (word2 in pairwise_centroids):
                    # cosine() is a distance, so 1 - cosine() is the similarity of one centroid pair
                    predicted_similarities = [
                        1 - cosine(centroid1, centroid2)
                        for centroid1 in pairwise_centroids[word1]
                        for centroid2 in pairwise_centroids[word2]
                    ]
                    # average over every centroid pair. The previous `np.sum(...) / k*k` parsed as
                    # (sum / k) * k and therefore returned the plain sum; the mean equals
                    # sum / (k * k) whenever both words have k centroids.
                    avg_sim = np.mean(predicted_similarities)

                    row['predicted_similarity'] = avg_sim
                else:
                    row['predicted_similarity'] = None

            # remove pairs from consideration for which we have no predicted similarity to compare
            data = [row for row in data if row['predicted_similarity'] is not None]

            # create data frame
            df = pd.DataFrame.from_records(data)
            X = df['predicted_similarity']
            y = df['SimLex999']

            # run pearson expected vs observed
            pearson_value = pearsonr(X, y)

            # run spearman expected vs observed
            spearman_value = spearmanr(X, y)

            # save results to file
            output = {'layer': layer_number,
                      'k_clusters': k,
                      'pearson': pearson_value[0],
                      'pearson_P': pearson_value[1],
                      'spearman': spearman_value[0],
                      'spearman_P': spearman_value[1],
                      'N': len(df)
                     }
            writer.writerow(output)



can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similari

In [5]:
# Sanity check: was the earlier analysis using cosine distance where it should have used similarity?
# NOTE: scipy.spatial.distance.cosine returns a *distance* (1 - cosine similarity), so the
# `1 - cosine(...)` used in the other cells is the similarity. This cell deliberately drops the
# `1 -` and therefore scores each pair by the LARGEST centroid-to-centroid distance; the value is
# still stored under 'predicted_similarity' and written to the "_cossim" results file.
results_file = './data/bnc_cluster_analysis_simlex999_similarity_results_cossim.csv'
fieldnames = ['layer', 'k_clusters', 'pearson', 'pearson_P', 'spearman', 'spearman_P', 'N']
# newline='' is what the csv module asks for on files handed to a csv writer
with open(results_file, mode='w', newline='') as disk:
    writer = csv.DictWriter(disk, delimiter='\t', fieldnames=fieldnames)

    for layer_number in layers:
        for k in cluster_sizes:
            # both words of a pair read a file with this name from their own directory
            cluster_filename = "layer_" + str(layer_number) + "_clusters_k_equals_" + str(k) + ".csv"

            # score all the word pairs
            data = simlex999
            for row in data:
                word1 = row['word1']
                word2 = row['word2']

                # get centroid data for these words at this layer and this k size
                pairwise_centroids = {}
                for word in [word1, word2]:
                    cluster_path = os.path.join('./data/word_data/', word, 'analysis_results', cluster_filename)

                    try:
                        with open(cluster_path, mode='r') as csv_file:
                            # separate name so the results writer's column list above is not shadowed
                            cluster_fieldnames = ['word', 'clusternumber', 'centroid', 'sentence_uids']
                            reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=cluster_fieldnames)

                            # [2:-2] drops two characters at each end of the stored text
                            # (presumably the brackets of a stringified array -- confirm against the writer).
                            # dtype is the builtin float: np.float was only an alias for it and has
                            # been removed from current NumPy releases.
                            word_centroids = [
                                np.fromstring(line['centroid'][2:-2], dtype=float, sep=' ')
                                for line in reader
                            ]
                        pairwise_centroids[word] = word_centroids
                    except OSError:
                        # only an unreadable/missing cluster file means "no tokens"; any other
                        # error is a real problem and is allowed to surface instead of being hidden
                        print("can't calculate predicted similarity for pair %s, %s" %(word1, word2))
                        print("   no tokens collected for %s" % word)

                # score the pair from every combination of the two words' cluster centroids
                # only if we have centroids for both words
                if (word1 in pairwise_centroids) and (word2 in pairwise_centroids):
                    # raw cosine() output, i.e. a distance (see the note at the top of this cell)
                    predicted_similarities = [
                        cosine(centroid1, centroid2)
                        for centroid1 in pairwise_centroids[word1]
                        for centroid2 in pairwise_centroids[word2]
                    ]
                    # largest of the pairwise values
                    max_sim = max(predicted_similarities)

                    row['predicted_similarity'] = max_sim
                else:
                    row['predicted_similarity'] = None

            # remove pairs from consideration for which we have no predicted value to compare
            data = [row for row in data if row['predicted_similarity'] is not None]

            # create data frame
            df = pd.DataFrame.from_records(data)
            X = df['predicted_similarity']
            y = df['SimLex999']

            # run pearson expected vs observed
            pearson_value = pearsonr(X, y)

            # run spearman expected vs observed
            spearman_value = spearmanr(X, y)

            # save results to file
            output = {'layer': layer_number,
                      'k_clusters': k,
                      'pearson': pearson_value[0],
                      'pearson_P': pearson_value[1],
                      'spearman': spearman_value[0],
                      'spearman_P': spearman_value[1],
                      'N': len(df)
                     }
            writer.writerow(output)

can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similarity for pair orthodontist, dentist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair doctor, orthodontist
   no tokens collected for orthodontist
can't calculate predicted similarity for pair disorganize, organize
   no tokens collected for disorganize
can't calculate predicted similari