### Load BNC

In [1]:
import datasets

# Obtain the BNC corpus reader from the project's `datasets` helper module.
b = datasets.get_bnc()
# Preview the first two sentences; each is a list of (token, POS-tag) pairs.
b.tagged_sents()[:2]


[[('FACTSHEET', 'SUBST'),
  ('WHAT', 'PRON'),
  ('IS', 'VERB'),
  ('AIDS', 'SUBST'),
  ('?', 'PUN')],
 [('AIDS', 'SUBST'),
  ('(', 'PUL'),
  ('Acquired', 'VERB'),
  ('Immune', 'ADJ'),
  ('Deficiency', 'SUBST'),
  ('Syndrome', 'SUBST'),
  (')', 'PUR'),
  ('is', 'VERB'),
  ('a', 'ART'),
  ('condition', 'SUBST'),
  ('caused', 'VERB'),
  ('by', 'PREP'),
  ('a', 'ART'),
  ('virus', 'SUBST'),
  ('called', 'VERB'),
  ('HIV', 'SUBST'),
  ('(', 'PUL'),
  ('Human', 'ADJ'),
  ('Immuno', 'SUBST'),
  ('Deficiency', 'SUBST'),
  ('Virus', 'SUBST'),
  (')', 'PUR'),
  ('.', 'PUN')]]

### Load WS353

In [2]:
# Load the WordSim-353 similarity pairs; each entry exposes 'word1', 'word2'
# and a numeric 'similarity' rating (see the sample entry displayed below).
ws353 = datasets.get_ws353()

# Spot-check a single entry.
ws353[100]

processed 203 word pairs from WordSim similarity dataset


OrderedDict([('word1', 'monk'), ('word2', 'oracle'), ('similarity', 5.0)])

### Load Simlex-999

In [3]:
# Load the SimLex-999 pairs; each entry exposes 'word1', 'word2', the
# 'SimLex999' rating and several auxiliary columns (see the sample below).
simlex999 = datasets.get_simlex999()

# Spot-check a single entry.
simlex999[100]

processed 999 word pairs from simlex999 dataset


OrderedDict([('word1', 'bold'),
             ('word2', 'proud'),
             ('POS', 'A'),
             ('SimLex999', 3.97),
             ('conc_w1', '2.43'),
             ('conc_w2', '2.07'),
             ('concQ', '1'),
             ('assoc_USF', '0.12'),
             ('sim_assoc333', '0'),
             ('SD_simlex', '1.36')])

In [4]:
# Flatten each benchmark into one list: every 'word1' value first, then every
# 'word2' value, in dataset order.
ws353_wordlist = [row[column] for column in ('word1', 'word2') for row in ws353]
simlex999_wordlist = [row[column] for column in ('word1', 'word2') for row in simlex999]

# Combined vocabulary, duplicates included.
all_words = ws353_wordlist + simlex999_wordlist
print("Num words in datasets: %s" % len(all_words))

# De-duplicated vocabulary used as the collection target by the later cells.
unique_words = set(all_words)
print("Number of unique words in simlex + wordsim: %s" % len(unique_words))
unique_words

Num words in datasets: 2404
Number of unique words in simlex + wordsim: 1224


{'legion',
 'modern',
 'fence',
 'foot',
 'evening',
 'attempt',
 'decide',
 'might',
 'eye',
 'flour',
 'big',
 'intuition',
 'impression',
 'denial',
 'breakfast',
 'minute',
 'key',
 'student',
 'army',
 'jaguar',
 'mustard',
 'bold',
 'swamp',
 'intelligent',
 'profession',
 'succeed',
 'cheap',
 'navy',
 'bathroom',
 'new',
 'creator',
 'island',
 'monster',
 'stream',
 'live',
 'large',
 'bar',
 'image',
 'disc',
 'lad',
 'cloth',
 'wheat',
 'bishop',
 'drizzle',
 'ceremony',
 'pursue',
 'rail',
 'boundary',
 'reef',
 'course',
 'please',
 'sentry',
 'gain',
 'ox',
 'collection',
 'basketball',
 'storm',
 'purse',
 'sign',
 'fabric',
 'serial',
 'egg',
 'trick',
 'dad',
 'food',
 'style',
 'game',
 'summer',
 'vessel',
 'honey',
 'chance',
 'assume',
 'week',
 'wonderful',
 'odd',
 'delay',
 'create',
 'forest',
 'kind',
 'buck',
 'fashion',
 'sunshine',
 'reality',
 'violin',
 'emergency',
 'possess',
 'scheme',
 'contemplate',
 'whiskey',
 'dawn',
 'sunset',
 'comprehend',
 'pr

### Get token contexts for BNC words

In [5]:
def randomly(seq, pseudo=True):
    """Return a shuffled copy of ``seq`` as a list; ``seq`` itself is not modified.

    Args:
        seq: any iterable of items to shuffle.
        pseudo: when True (default) the shuffle is reproducible: the same
            input always yields the same order. When False the module-level
            random generator is used, so the order differs between runs, and
            progress messages are printed.

    Returns:
        A new list containing the items of ``seq`` in shuffled order.
    """
    import random
    shuffled = list(seq)
    if pseudo:
        # A dedicated, fixed-seed generator gives a reproducible permutation.
        # The two-argument form random.shuffle(x, random) is not used because
        # it was removed in Python 3.11.
        random.Random(0).shuffle(shuffled)
    else:
        print("shuffling indexes")
        random.shuffle(shuffled)
        print("done shuffling")
    return shuffled

def bnc_length(pathname='./data/count_of_bnc_sentences.txt'):
    """Return the number of sentences in the BNC, cached in a small text file.

    If ``pathname`` exists and contains an integer, that value is returned
    without touching the corpus. Otherwise the corpus is loaded, its
    sentences are counted, the count is written to ``pathname`` and returned.

    Args:
        pathname: location of the cache file holding the sentence count.

    Returns:
        The sentence count as an int.
    """
    try:
        with open(pathname, 'r') as fh:
            return int(fh.read())
    except (OSError, ValueError):
        # Cache file missing/unreadable, or its contents are not an integer:
        # fall through and recompute. Other exceptions are left to propagate.
        print("BNC not yet indexed. Calculating length and writing to '%s'" % pathname)

    # Use the same loader as the rest of the notebook.
    bnc_reader = datasets.get_bnc()
    corpus = bnc_reader.tagged_sents(strip_space=True)
    length = len(corpus)
    with open(pathname, 'w') as disk:
        disk.write(str(length))
    return length

    
"""
TODO problem: all of the common words are going to be super similar 
    to each other because they are collected early on in the same sentences
"""

def collect_bnc_examples(words, max_num_examples=100, override=False):
    """Sample BNC sentences containing the target words and write them to a TSV file.

    Sentences are visited in a random order. Each time a sentence contains a
    target word that still needs examples, one row
    ``[word, sentence text, POS tag, corpus index]`` is written to
    ``./data/bnc_tokens_353_and_simlex.csv``. A word contributes at most one
    row per sentence and at most ``max_num_examples`` rows overall. The
    traversal stops once every word has its quota or the corpus is exhausted.

    Args:
        words: iterable of target words. Matching is case-insensitive: both
            the targets and the corpus tokens are lowercased, and the
            lowercased form is what gets written.
        max_num_examples: maximum number of example sentences per word.
        override: when False and the output file already exists, nothing is
            done; when True the file is rebuilt from scratch.

    Returns:
        None. The result is the file written to disk.
    """
    import os.path
    import csv

    filename = 'bnc_tokens_353_and_simlex.csv'
    parent_dir = './data'
    pathname = os.path.join(parent_dir, filename)

    # do we already have the data collected?
    if os.path.isfile(pathname) and not override:
        print("data already exist at %s" % pathname)
        return

    # remaining[word] = number of examples still wanted for that word.
    # Keys are lowercased because corpus tokens are lowercased before lookup;
    # without this, capitalised targets could never match and the loop below
    # could never terminate early.
    if max_num_examples > 0:
        remaining = {word.lower(): max_num_examples for word in words}
    else:
        remaining = {}

    bnc_reader = datasets.get_bnc()
    corpus = bnc_reader.tagged_sents(strip_space=True)
    corpus_length = bnc_length()
    print("# Sentences in BNC corpus: %s" % corpus_length)

    # come up with a random order in which to traverse the BNC
    randomized_indexes = randomly(range(corpus_length), pseudo=False)
    print(randomized_indexes[:50])

    os.makedirs(parent_dir, exist_ok=True)

    # newline='' is what the csv module asks for on files it writes to.
    with open(pathname, mode='w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)

        sentences_seen = 0

        # keep going while some word still needs examples and sentences remain
        while remaining and randomized_indexes:
            # track progress
            sentences_seen += 1
            report = (sentences_seen % 100000 == 0)
            if report:
                print("Processed %s sentences" % sentences_seen)

            # fetch the next random sentence
            corpus_index = randomized_indexes.pop()
            sentence = corpus[corpus_index]

            # Built lazily: most sentences contain no target word at all.
            sentence_text = None

            # words already collected from this sentence, so a word that
            # occurs twice in it is only recorded once
            seen_words = set()

            for word_tuple in sentence:
                word = word_tuple[0].lower()
                tag = word_tuple[1]

                if word in seen_words or word not in remaining:
                    continue

                if sentence_text is None:
                    sentence_text = ' '.join(w[0] for w in sentence)

                if report:
                    print(word)
                    print(tag)
                    print(sentence_text)
                    print(corpus_index)

                # collect this sentence as a token of the word
                writer.writerow([word, sentence_text, tag, corpus_index])
                seen_words.add(word)

                # Decrement first, then drop the word once its quota is met,
                # so exactly max_num_examples rows are written per word.
                remaining[word] -= 1
                if remaining[word] <= 0:
                    del remaining[word]



# Ensure the sentence-count cache exists (bnc_length computes and writes it on
# first use); in a notebook this mid-cell return value is not displayed.
bnc_length()

# override=True bypasses the "file already exists" check, so this rebuilds the
# example file from scratch on every run.
collect_bnc_examples(unique_words, override = True)

# Sentences in BNC corpus: 6026276
shuffling indexes
done shuffling
[3507019, 581929, 4058133, 3148070, 4115605, 1325049, 4213575, 5166636, 2626864, 387970, 3512371, 5673613, 3848123, 5112398, 2338855, 3423827, 2387587, 5129218, 2255525, 4459182, 3709436, 43254, 861312, 868118, 5760421, 4277481, 4776066, 5102187, 450959, 4285010, 3088179, 5261823, 2510741, 5033797, 4949842, 3459945, 3511474, 3439804, 2985856, 5928711, 4895678, 321603, 2953841, 4542068, 4664671, 1516488, 248790, 5671886, 5477120, 2531623]


KeyboardInterrupt: 

### Get some stats on the data

In [None]:
# What's the average number of examples for each word?
#os.system("awk 'BEGIN {FS=OFS="\t"} NR > 0 {a[$1]+=1} END {for (i in a) {print i, a[i]}}' bnc_words_with_context_tokens.txt | sort -n >> bnc_counts.txt")

In [None]:
import csv
import matplotlib
import seaborn as sns, pandas as pd
import matplotlib.pyplot as plt

#############
# How many words do we keep if we want 20 examples minimum? 10?
##############
with open('./data/similarity_dataset/words_with_BNC_tokens.csv', mode="r") as infile:
    reader = csv.DictReader(infile, delimiter="\t", quoting=csv.QUOTE_NONNUMERIC, fieldnames=["word", "sentence", "POS", "id"])
    data = [row for row in reader]

# Count the example rows per word. The first occurrence must count as 1;
# starting at 0 would under-report every word by one.
token_counts = {}
for row in data:
    word = row["word"]
    token_counts[word] = token_counts.get(word, 0) + 1

fifty = [(x, y) for (x, y) in token_counts.items() if y >= 50]
over_twenty = {x: y for (x, y) in token_counts.items() if y >= 20}
over_ten = [[x, y] for (x, y) in token_counts.items() if y >= 10]
print("50 tokens for %s words" %len(fifty))
print("Over 20 tokens for %s words" %len(over_twenty))
print("Over 10 tokens for %s words" %len(over_ten))

################
# What's the relationship between abstractness and frequency in the corpus? We want this to be relatively even, I think....
################

df = pd.DataFrame.from_records(fifty, columns=["word", "token_count"])
print(df)
print(df.columns)

# Cut the window in 2 parts: a thin box plot above a histogram, sharing the x axis
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

# Add a graph in each part. `x=` is passed by keyword so the box plot is
# horizontal regardless of how the installed seaborn treats positional data.
sns.boxplot(x=df["token_count"], ax=ax_box)
sns.distplot(df["token_count"], ax=ax_hist)

# Remove x axis name for the boxplot
ax_box.set(xlabel='')

### Load pretrained BERT model 

In [None]:
#!pip install pytorch-pretrained-bert

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
#% matplotlib inline


# Load the pre-trained 'bert-base-uncased' weights into a BertModel.
model = BertModel.from_pretrained('bert-base-uncased')

# Switch the model to evaluation mode: it is only used for forward passes in
# this notebook, never trained.
model.eval()

# Load the matching tokenizer (vocabulary) for the same checkpoint; it is
# used later to split words and sentences into WordPiece tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


#### Save token vectors

In [None]:
# bert base uncased
import numpy as np
import os.path

# function to calculate token vector for a word in a context
# in case the word parses into multiple word pieces, take the average of the context vectors for each piece

# calculate token vectors for each example of each word in dataset


def find_piece_span(haystack, needle):
    """Return the index at which `needle` first occurs as a contiguous run
    inside `haystack` (both lists), or None when it does not occur."""
    width = len(needle)
    if width == 0:
        return None
    for start in range(len(haystack) - width + 1):
        if haystack[start:start + width] == needle:
            return start
    return None


data_dir = "./data/similarity_dataset"
fieldnames = ["word", "sentence", "POS", "id"]

with open('./data/similarity_dataset/words_with_BNC_tokens.csv', mode="r") as infile:
    reader = csv.DictReader(infile, delimiter="\t", quoting=csv.QUOTE_NONNUMERIC, fieldnames=fieldnames)

    for row in reader:
        word = row["word"]
        text = row["sentence"]
        pos = row["POS"]
        uid = "BNC" + str(int(row["id"]))

        # WordPiece tokens of the target word on its own.
        tokenized_word = tokenizer.tokenize(word)

        # Add the special tokens and split the sentence into WordPiece tokens.
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)

        # Locate the word as one contiguous run of pieces, so a piece that
        # also occurs elsewhere in the sentence cannot be picked up by
        # mistake, and a word that is absent is skipped instead of raising.
        start = find_piece_span(tokenized_text, tokenized_word)
        if start is None:
            print("word pieces for %s not found in sentence %s" % (word, uid))
            continue

        # Map the token strings to their vocabulary indices.
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Mark each of the tokens as belonging to sentence "1".
        segments_ids = [1] * len(tokenized_text)

        # Convert inputs to PyTorch tensors (batch of one sentence).
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        try:
            # Predict hidden states features for each layer
            with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors)
        except Exception:
            print("tokenized sequence too long")
            print(tokenized_text)
            # Skip this row: carrying on would reuse `encoded_layers` left
            # over from the previous sentence and write a wrong vector.
            continue

        # `encoded_layers` holds one tensor per layer; take the last layer and
        # drop the batch dimension, leaving one vector per token
        # (presumably [num_tokens x 768] for bert-base; confirm).
        last_layer = encoded_layers[-1][0]
        piece_vectors = last_layer[start:start + len(tokenized_word)].numpy()

        # Use the mean of all of the word pieces. The result is kept 2-D
        # (1 x hidden) so that it is written as "[[ ... ]]", the format the
        # later cells parse.
        word_vector = np.average(piece_vectors, axis=0).reshape(1, -1)

        # ensure directory exists for this word
        word_dir = os.path.join(data_dir, word)
        if not os.path.isdir(word_dir):
            os.makedirs(word_dir)
            print("making directory for word at %s" % word_dir)

        # finally, append all of the info with the vector to this word's file
        vector_file = os.path.join(word_dir, "BNC_tokens_with_last_layer_vectors.csv")
        with open(vector_file, mode="a") as outfile:
            writer = csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow([word, text, pos, uid, word_vector])


### Calculate centroid for each word

In [None]:
# we didnt collect tokens for uppercase words by mistake. so, dont try to calculate for uppercase words
lowercase_unique_words = [w for w in unique_words if w[0].islower()]

data_dir = './data/similarity_dataset'

for word in lowercase_unique_words:
    word_dir = os.path.join(data_dir, word)
    path = os.path.join(word_dir, 'BNC_tokens_with_last_layer_vectors.csv')

    if not os.path.isfile(path):
        # guessss this word wasn't in the bnc for some reason?
        print("no file for %s" % word)
        continue

    with open(path, mode='r') as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=["word", "sentence", "tag", "uid", "vector"])
        # Each vector was stored as the text of a numpy array; blank out the
        # brackets and parse the whitespace-separated numbers.
        vectors = [
            np.array(row["vector"].replace('[', ' ').replace(']', ' ').split(), dtype=float)
            for row in reader
        ]

    if not vectors:
        print("no vectors for %s" % word)
        continue

    # The centroid is the arithmetic mean of all token vectors. Repeatedly
    # halving a running sum, (centroid + vector) / 2, would instead weight
    # the most recently read vectors exponentially more than the earlier ones.
    centroid = np.mean(vectors, axis=0)

    outpath = os.path.join(word_dir, "BNC_layer_11_centroid.csv")
    with open(outpath, mode='w') as disk:
        writer = csv.DictWriter(disk, delimiter='\t', fieldnames=["word", "centroid"])
        # Written as a 1 x N array so the text starts with "[[" and ends with
        # "]]", like the token vector files; cells that strip two characters
        # from each end then recover every number intact.
        writer.writerow({"word": word, "centroid": centroid.reshape(1, -1)})

        
        

### Evaluate Centroids on 353 similarity

In [None]:
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Compare the cosine similarity of per-word centroids with the human ratings
# in WordSim-353 (`ws353` is loaded in an earlier cell).
data_dir = './data/similarity_dataset'
centroid_filename = 'BNC_layer_11_centroid.csv'

# one record per word pair: predicted and observed similarity
similarities = []

for entry in ws353:

    word1 = entry['word1'].lower()
    word2 = entry['word2'].lower()
    observed_similarity = entry['similarity']

    # Require the centroid files themselves: a word directory can exist
    # without a centroid having been written into it.
    centroid_paths = [os.path.join(data_dir, w, centroid_filename) for w in (word1, word2)]
    if not all(os.path.isfile(p) for p in centroid_paths):
        print("no data collected for %s or %s" % (word1, word2))
        continue

    centroids = []
    for path in centroid_paths:
        with open(path, mode='r') as infile:
            reader = csv.DictReader(infile, delimiter='\t', fieldnames=["word", "vector"])
            row = next(reader)
        # Blank out every bracket rather than slicing a fixed number of
        # characters, so both "[ ... ]" and "[[ ... ]]" parse without losing
        # a sign or a digit.
        centroids.append(np.array(row["vector"].replace('[', ' ').replace(']', ' ').split(), dtype=float))
    word1_centroid, word2_centroid = centroids

    # calculate predicted similarity from centroids
    predicted_similarity = 1 - cosine(word1_centroid, word2_centroid)

    similarities.append({"word1": word1, "word2": word2, "predicted": predicted_similarity, "observed": observed_similarity})

# calculate pearsons correlation coefficient.
df = pd.DataFrame.from_records(similarities)
print(df)

X = df["predicted"]
y = df["observed"]

pearson_value = pearsonr(X,y)

print("Pearson's correlation coefficient between observed similarity and centroid cosine similarity is {}".format(pearson_value))
print('N = {}'.format(len(df)))

### Do the same for SimLex Similarity

In [None]:
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Compare the cosine similarity of per-word centroids with the human ratings
# in SimLex-999 (`simlex999` is loaded in an earlier cell).
data_dir = './data/similarity_dataset'
centroid_filename = 'BNC_layer_11_centroid.csv'

# one record per word pair: predicted and observed similarity
similarities = []

for entry in simlex999:

    word1 = entry['word1'].lower()
    word2 = entry['word2'].lower()
    observed_similarity = entry['SimLex999']

    # Require the centroid files themselves: a word directory can exist
    # without a centroid having been written into it.
    centroid_paths = [os.path.join(data_dir, w, centroid_filename) for w in (word1, word2)]
    if not all(os.path.isfile(p) for p in centroid_paths):
        print("no data collected for %s or %s" % (word1, word2))
        continue

    centroids = []
    for path in centroid_paths:
        with open(path, mode='r') as infile:
            reader = csv.DictReader(infile, delimiter='\t', fieldnames=["word", "vector"])
            row = next(reader)
        # Blank out every bracket rather than slicing a fixed number of
        # characters, so both "[ ... ]" and "[[ ... ]]" parse without losing
        # a sign or a digit.
        centroids.append(np.array(row["vector"].replace('[', ' ').replace(']', ' ').split(), dtype=float))
    word1_centroid, word2_centroid = centroids

    # calculate predicted similarity from centroids
    predicted_similarity = 1 - cosine(word1_centroid, word2_centroid)

    similarities.append({"word1": word1, "word2": word2, "predicted": predicted_similarity, "observed": observed_similarity})

# calculate pearsons correlation coefficient.
df = pd.DataFrame.from_records(similarities)
print(df)

X = df["predicted"]
y = df["observed"]

pearson_value = pearsonr(X,y)

print("Pearson's correlation coefficient between observed similarity and centroid cosine similarity is {}".format(pearson_value))
print('N = {}'.format(len(df)))

### Calculate K-means clusters for a single word, and print them out

In [None]:



from sklearn.cluster import KMeans

# Cluster the contextual vectors collected for one word ("mountain") and show
# which sentences end up together.
context_vectors = []
with open('data/similarity_dataset/mountain/BNC_tokens_with_last_layer_vectors.csv', mode='r') as csv_file:
    csv_reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=["word", "sentence", "tag", "uid", "vector"])
    data = [row for row in csv_reader]

# Parse each stored vector. The array brackets are blanked out exactly once;
# slicing [2:-2] twice would also cut into the first and last numbers.
context_vectors = [
    np.array(row["vector"].replace('[', ' ').replace(']', ' ').split(), dtype=float)
    for row in data
]

numclusters = 4
# Fixed random_state so the clustering is the same on every run.
kmeans_obj = KMeans(n_clusters=numclusters, random_state=0)
kmeans_obj.fit(context_vectors)
label_list = kmeans_obj.labels_
cluster_centroids = kmeans_obj.cluster_centers_

print(label_list)
print(len(label_list))
print(cluster_centroids)

# Let's print the sentences that got clustered together.
for clusternumber in range(numclusters):
    print("\n\n")
    print("Sentences in cluster", clusternumber)
    for index, datapoint in enumerate(data):
        if label_list[index] == clusternumber:
            print(datapoint["sentence"])
            print("\n")

    


### Calculate k=4 and store clusters for all the words we have. 

In [None]:
from sklearn.cluster import KMeans

# we didnt collect tokens for uppercase words by mistake. so, dont try to calculate for uppercase words
lowercase_unique_words = [w for w in unique_words if w[0].islower()]

data_dir = './data/similarity_dataset'
numclusters = 4

for word in lowercase_unique_words:
    word_dir = os.path.join(data_dir, word)
    path = os.path.join(word_dir, 'BNC_tokens_with_last_layer_vectors.csv')

    if not os.path.isfile(path):
        # guessss this word wasn't in the bnc for some reason?
        print("no file for %s" % word)
        continue

    with open(path, mode='r') as csv_file:
        reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=["word", "sentence", "tag", "uid", "vector"])
        data = [row for row in reader]

    # k-means cannot form more clusters than there are samples
    if len(data) < numclusters:
        print("not enough tokens to make four clusters for word: %s" % word)
        continue

    # Parse each stored vector. The array brackets are blanked out exactly
    # once; slicing [2:-2] twice would also cut into the first and last numbers.
    context_vectors = [
        np.array(row["vector"].replace('[', ' ').replace(']', ' ').split(), dtype=float)
        for row in data
    ]

    # calculate kmeans clusters (fixed random_state for reproducible output)
    kmeans_obj = KMeans(n_clusters=numclusters, random_state=0)
    kmeans_obj.fit(context_vectors)
    label_list = kmeans_obj.labels_
    cluster_centroids = kmeans_obj.cluster_centers_

    # save clusters to file: one row per cluster
    outpath = os.path.join(word_dir, "BNC_layer_11_kmeans_4_clusters.csv")
    with open(outpath, mode='w') as disk:
        writer = csv.DictWriter(disk, delimiter='\t', fieldnames=['word', 'clusternumber', 'centroid', 'sentence_uids'])

        for clusternumber in range(numclusters):
            # uids of the sentences assigned to this cluster
            sentence_uids = [
                datapoint['uid']
                for datapoint, label in zip(data, label_list)
                if label == clusternumber
            ]
            writer.writerow({
                'word': word,
                'clusternumber': clusternumber,
                # Written as a 1 x N array so the text is "[[ ... ]]"; cells
                # that strip two characters from each end then recover every
                # number intact.
                'centroid': cluster_centroids[clusternumber].reshape(1, -1),
                'sentence_uids': sentence_uids,
            })

        
        

### Calculate ws353 similarity score for sense clusters using maxsim

In [None]:
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# load ws353 word pairs (same loader as the top of the notebook)
ws353 = datasets.get_ws353()

data_dir = './data/similarity_dataset'
cluster_filename = 'BNC_layer_11_kmeans_4_clusters.csv'
cluster_fields = ['word', 'clusternumber', 'centroid', 'sentence_uids']

# one record per word pair: predicted (maxsim) and observed similarity
similarities = []

for entry in ws353:

    word1 = entry['word1'].lower()
    word2 = entry['word2'].lower()
    observed_similarity = entry['similarity']

    # retrieve cluster centroids for both words
    word1_path = os.path.join(data_dir, word1, cluster_filename)
    word2_path = os.path.join(data_dir, word2, cluster_filename)

    if not (os.path.isfile(word1_path) and os.path.isfile(word2_path)):
        print("no data collected for %s or %s" % (word1, word2))
        continue

    centroid_sets = []
    for cluster_path in (word1_path, word2_path):
        with open(cluster_path, mode='r') as infile:
            reader = csv.DictReader(infile, delimiter='\t', fieldnames=cluster_fields)
            # Blank out every bracket rather than slicing a fixed number of
            # characters, so both "[ ... ]" and "[[ ... ]]" parse without
            # losing a sign or a digit.
            centroid_sets.append([
                np.array(row['centroid'].replace('[', ' ').replace(']', ' ').split(), dtype=float)
                for row in reader
            ])
    word1_centroids, word2_centroids = centroid_sets

    # similarity of every cluster centroid of word1 with every one of word2
    predicted_similarities = [
        1 - cosine(centroid1, centroid2)
        for centroid1 in word1_centroids
        for centroid2 in word2_centroids
    ]
    if not predicted_similarities:
        # an empty cluster file would make max() raise
        print("no data collected for %s or %s" % (word1, word2))
        continue

    # maxsim: the best-matching pair of sense clusters
    max_sim = max(predicted_similarities)

    similarities.append({"word1": word1, "word2": word2, "predicted": max_sim, "observed": observed_similarity})

# calculate pearsons and spearmans correlation coefficients.
df = pd.DataFrame.from_records(similarities)
print(df)

X = df["predicted"]
y = df["observed"]

pearson_value = pearsonr(X,y)

print("Pearson's correlation coefficient between observed similarity and k=4 maxsim sense cluster similarity is {}".format(pearson_value))
print('N = {}'.format(len(df)))


spearman_value = spearmanr(X,y)

print("Spearmans's correlation coefficient between observed similarity and k=4 maxsim sense cluster similarity is {}".format(spearman_value))
print('N = {}'.format(len(df)))

### Calculate SimLex999 similarity score for sense clusters using maxsim

In [None]:
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# load simlex999 word pairs (same loader as the top of the notebook)
simlex999 = datasets.get_simlex999()

data_dir = './data/similarity_dataset'
cluster_filename = 'BNC_layer_11_kmeans_4_clusters.csv'
cluster_fields = ['word', 'clusternumber', 'centroid', 'sentence_uids']

# one record per word pair: predicted (maxsim) and observed similarity
similarities = []

for entry in simlex999:

    word1 = entry['word1'].lower()
    word2 = entry['word2'].lower()
    observed_similarity = entry['SimLex999']

    # retrieve cluster centroids for both words
    word1_path = os.path.join(data_dir, word1, cluster_filename)
    word2_path = os.path.join(data_dir, word2, cluster_filename)

    if not (os.path.isfile(word1_path) and os.path.isfile(word2_path)):
        print("no data collected for %s or %s" % (word1, word2))
        continue

    centroid_sets = []
    for cluster_path in (word1_path, word2_path):
        with open(cluster_path, mode='r') as infile:
            reader = csv.DictReader(infile, delimiter='\t', fieldnames=cluster_fields)
            # Blank out every bracket rather than slicing a fixed number of
            # characters, so both "[ ... ]" and "[[ ... ]]" parse without
            # losing a sign or a digit.
            centroid_sets.append([
                np.array(row['centroid'].replace('[', ' ').replace(']', ' ').split(), dtype=float)
                for row in reader
            ])
    word1_centroids, word2_centroids = centroid_sets

    # similarity of every cluster centroid of word1 with every one of word2
    predicted_similarities = [
        1 - cosine(centroid1, centroid2)
        for centroid1 in word1_centroids
        for centroid2 in word2_centroids
    ]
    if not predicted_similarities:
        # an empty cluster file would make max() raise
        print("no data collected for %s or %s" % (word1, word2))
        continue

    # maxsim: the best-matching pair of sense clusters
    max_sim = max(predicted_similarities)

    similarities.append({"word1": word1, "word2": word2, "predicted": max_sim, "observed": observed_similarity})

# calculate pearsons and spearmans correlation coefficients.
df = pd.DataFrame.from_records(similarities)
print(df)

X = df["predicted"]
y = df["observed"]

pearson_value = pearsonr(X,y)

print("Pearson's correlation coefficient between observed similarity and k=4 maxsim sense cluster similarity is {}".format(pearson_value))
print('N = {}'.format(len(df)))

spearman_value = spearmanr(X,y)

print("Spearmans's correlation coefficient between observed similarity and k=4 maxsim sense cluster similarity is {}".format(spearman_value))
print('N = {}'.format(len(df)))