### Now we will evaluate each combination of layer and num_clusters against similarity gold standards

In [8]:
"""
In the end we want two data structures that look like this:

layer k_clusters ws353_pearson p ws353_spearman p ws353_n simlex_pearson p  simlex_spearman p  simlex_n
0     1          .77             .73              200     .54              .49                 988
....  ....       ....
0     7          .88             .80              180     .54              .65                 950
1     1          ....
...   ....       ....
11    7          ....



"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.spatial.distance import cosine
import datasets
import csv
import os, shutil



def _load_centroids(word, layer_number, k):
    """Read the stored cluster centroids for ``word`` at one layer / cluster count.

    The file is looked up at
    ``./data/word_data/<word>/analysis_results/layer_<layer>_clusters_k_equals_<k>.csv``
    and parsed as tab-delimited rows with the columns
    ``word, clusternumber, centroid, sentence_uids`` (no header row).

    Returns:
        A list with one 1-D float array per row of the file, or ``None`` when
        the file does not exist.
    """
    cluster_filename = "layer_" + str(layer_number) + "_clusters_k_equals_" + str(k) + ".csv"
    cluster_path = os.path.join('./data/word_data/', word, 'analysis_results', cluster_filename)
    cluster_fields = ['word', 'clusternumber', 'centroid', 'sentence_uids']

    # Only a missing file means "no data for this word"; any other failure
    # (e.g. a malformed centroid) is a real error and must not be hidden.
    try:
        with open(cluster_path, mode='r') as csv_file:
            reader = csv.DictReader(csv_file, delimiter='\t', fieldnames=cluster_fields)
            # The first and last two characters of the stored text are dropped
            # before parsing (presumably a "[[ ... ]]" wrapper -- confirm
            # against the code that writes these files).
            return [
                np.fromstring(line['centroid'][2:-2], dtype=float, sep=' ')
                for line in reader
            ]
    except FileNotFoundError:
        return None


def _combine_similarities(centroids1, centroids2, similarity_metric):
    """Collapse all pairwise centroid cosine similarities into a single score.

    ``'max_sim'`` returns the largest pairwise similarity, ``'avg_sim'`` their
    arithmetic mean. Both centroid lists must be non-empty.
    """
    pairwise = [
        1 - cosine(centroid1, centroid2)
        for centroid1 in centroids1
        for centroid2 in centroids2
    ]
    if similarity_metric == 'max_sim':
        return max(pairwise)
    # Divide by the number of pairs actually compared rather than by k*k, so
    # the mean stays correct even if a word has fewer than k centroids.
    return np.sum(pairwise) / len(pairwise)


def run_analysis(dataset, similarity_metric, cluster_sizes, layers):
    """Correlate cluster-centroid similarities with a gold-standard dataset.

    For every combination of layer and cluster count, a predicted similarity is
    computed for each word pair from the words' stored cluster centroids, and
    the predictions are compared with the gold ``'relatedness'`` scores using
    Pearson and Spearman correlation. One tab-delimited row per combination
    (columns: layer, k_clusters, pearson, pearson_P, spearman, spearman_P, N;
    no header row) is written to
    ``./data/bnc_cluster_analysis_<name>_<metric>_similarity_results.csv``.

    Args:
        dataset: ``(name, rows)`` where ``rows`` is a sequence of dicts with
            the keys ``'word1'``, ``'word2'`` and ``'relatedness'``.
        similarity_metric: ``'max_sim'`` or ``'avg_sim'``.
        cluster_sizes: iterable of cluster counts ``k`` to evaluate.
        layers: iterable of layer numbers to evaluate.

    Raises:
        ValueError: if ``similarity_metric`` is not a supported metric.

    Side effects:
        Each row dict gets a ``'predicted_similarity'`` key, overwritten on
        every layer/k combination (``None`` when a word has no centroids).
        A message is printed for every pair that cannot be scored.
    """
    if similarity_metric not in ('max_sim', 'avg_sim'):
        raise ValueError(
            "similarity_metric must be 'max_sim' or 'avg_sim', got %r" % (similarity_metric,)
        )

    results_file = './data/bnc_cluster_analysis_' + dataset[0] + '_' + similarity_metric + '_similarity_results.csv'
    fieldnames = ['layer', 'k_clusters', 'pearson', 'pearson_P', 'spearman', 'spearman_P', 'N']
    data = dataset[1]

    os.makedirs(os.path.dirname(results_file), exist_ok=True)
    # newline='' is what the csv module expects for files it writes.
    with open(results_file, mode='w', newline='') as disk:
        writer = csv.DictWriter(disk, delimiter='\t', fieldnames=fieldnames)

        for layer_number in layers:
            for k in cluster_sizes:
                # A word usually occurs in many pairs; read its file only once
                # per (layer, k) combination.
                centroid_cache = {}

                for row in data:
                    word1 = row['word1']
                    word2 = row['word2']

                    # get centroid data for these words at this layer and this k size
                    pairwise_centroids = {}
                    for word in [word1, word2]:
                        if word not in centroid_cache:
                            centroid_cache[word] = _load_centroids(word, layer_number, k)
                        if centroid_cache[word]:
                            pairwise_centroids[word] = centroid_cache[word]
                        else:
                            print("can't calculate predicted similarity for pair %s, %s" %(word1, word2))
                            print("   no tokens collected for %s" % word)

                    if (word1 in pairwise_centroids) and (word2 in pairwise_centroids):
                        row['predicted_similarity'] = _combine_similarities(
                            pairwise_centroids[word1],
                            pairwise_centroids[word2],
                            similarity_metric,
                        )
                    else:
                        row['predicted_similarity'] = None

                # remove pairs from consideration for which we have no predicted similarity to compare
                scored = [row for row in data if row['predicted_similarity'] is not None]

                if len(scored) >= 2:
                    df = pd.DataFrame.from_records(scored)
                    X = df['predicted_similarity']
                    y = df['relatedness']
                    pearson_value = pearsonr(X, y)
                    spearman_value = spearmanr(X, y)
                else:
                    # A correlation needs at least two observations; record
                    # NaN instead of crashing on an (almost) empty sample.
                    pearson_value = spearman_value = (float('nan'), float('nan'))

                # save results to file
                writer.writerow({
                    'layer': layer_number,
                    'k_clusters': k,
                    'pearson': pearson_value[0],
                    'pearson_P': pearson_value[1],
                    'spearman': spearman_value[0],
                    'spearman_P': spearman_value[1],
                    'N': len(scored),
                })



# 1) The datasets we want to analyze, loaded in a fixed order.
men = datasets.get_men()
verbsim = datasets.get_verbsim()
ws353_rel = datasets.get_ws353_rel()

# Pair every dataset with the short name used in its results-file name.
# NOTE(review): this rebinds `datasets`, hiding the imported `datasets` module
# from here on; the name is kept as-is because other code may rely on it.
datasets = list(zip(('men', 'verbsim', 'ws353_rel'), (men, verbsim, ws353_rel)))

# 2) The layers we want to analyze.
layers = [0, 1, 5, 11]

# 3) The cluster sizes we want to analyze.
cluster_sizes = [1, 3, 5, 7]

# Run every dataset under both similarity metrics.
for dataset in datasets:
    for similarity_metric in ('max_sim', 'avg_sim'):
        run_analysis(dataset, similarity_metric, cluster_sizes, layers)


processed 3000 word pairs from MEN relatedness dataset
processed 130 word pairs from VerbSim dataset
processed 252 word pairs from WordSim relatedness dataset
can't calculate predicted similarity for pair chair, ipod
   no tokens collected for ipod
can't calculate predicted similarity for pair ipod, rope
   no tokens collected for ipod
can't calculate predicted similarity for pair chair, ipod
   no tokens collected for ipod
can't calculate predicted similarity for pair ipod, rope
   no tokens collected for ipod
can't calculate predicted similarity for pair cafe, donut
   no tokens collected for donut
can't calculate predicted similarity for pair chair, ipod
   no tokens collected for ipod
can't calculate predicted similarity for pair ipod, rope
   no tokens collected for ipod
can't calculate predicted similarity for pair donut, panda
   no tokens collected for donut
can't calculate predicted similarity for pair colorful, outfit
   no tokens collected for colorful
can't calculate predic

can't calculate predicted similarity for pair chair, ipod
   no tokens collected for ipod
can't calculate predicted similarity for pair ipod, rope
   no tokens collected for ipod
can't calculate predicted similarity for pair donut, panda
   no tokens collected for donut
can't calculate predicted similarity for pair chair, ipod
   no tokens collected for ipod
can't calculate predicted similarity for pair ipod, rope
   no tokens collected for ipod
can't calculate predicted similarity for pair chair, ipod
   no tokens collected for ipod
can't calculate predicted similarity for pair ipod, rope
   no tokens collected for ipod
can't calculate predicted similarity for pair cafe, donut
   no tokens collected for donut
can't calculate predicted similarity for pair chair, ipod
   no tokens collected for ipod
can't calculate predicted similarity for pair ipod, rope
   no tokens collected for ipod
can't calculate predicted similarity for pair donut, panda
   no tokens collected for donut
can't cal

can't calculate predicted similarity for pair distribute, commercialize
   no tokens collected for commercialize
can't calculate predicted similarity for pair divide, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair express, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair list, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair solve, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair distribute, commercialize
   no tokens collected for commercialize
can't calculate predicted similarity for pair divide, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair express, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair list, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair solve, fig

can't calculate predicted similarity for pair solve, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair distribute, commercialize
   no tokens collected for commercialize
can't calculate predicted similarity for pair divide, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair express, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair list, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair solve, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair divide, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair express, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair list, figure out
   no tokens collected for figure out
can't calculate predicted similarity for pair solve, figure out
   