In [1]:
from helpers import bert_helper, datasets, grinders, helpers

import os, shutil
import numpy as np
import csv
import pickle
import pandas as pd
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [27]:
# load datasets

"""
1) the datatsets
"""
men = datasets.get_men()
verbsim = datasets.get_verbsim()
ws353 = datasets.get_ws353()
ws353_rel = datasets.get_ws353_rel()
ws353_sim = datasets.get_ws353_sim()
simlex = datasets.get_simlex999()
simverb_3500 = datasets.get_simverb3500()


datasets = {'WordSim353_sim': ws353_sim, 
            'WordSim353_rel': ws353_rel,  
            'WordSim353': ws353,
            'SimLex999': simlex, 
            'YP-130': verbsim, 
            'MEN': men,
            'SimVerb3500': simverb_3500 
             }



AttributeError: 'dict' object has no attribute 'get_men'

In [19]:
# fetch the number of words collected for each dataset, and the words (and comparisons) that had to be omitted

def unique_words_in_dataset(data):
    all_words = []
    for row in data:
        w1 = row['word1']
        w2 = row['word2']
        all_words.append(w1)
        all_words.append(w2)
    unique = set(all_words)
    return unique

def words_not_collected_for(dataset):
    not_collected = []
    for word in dataset:
        tokens = grinders.read_tokens_for(word)
        if (tokens is None) or len(tokens) == 0 :
            not_collected.append(word)
    return(not_collected)


def words_with_fewer_than_x_tokens_for(dataset, x):
    words_with_not_enough_tokens = []
    for word in dataset:
        tokens = grinders.read_tokens_for(word)
        if (tokens is None) or (len(tokens) < x) :
            words_with_not_enough_tokens.append(word)
    return(words_with_not_enough_tokens)   

def print_words(words):
    for word in words:
        print("\t%s" % word)
        
        
def comparisons_not_performed_for(dataset, words_with_fewer_than_x_tokens):
    scores_not_calculated = []
    for row in dataset:
        if comparison_contains_one_of(words_with_fewer_than_x_tokens, row):
            scores_not_calculated.append(row)
    return scores_not_calculated

def word_pairs(data):
    pairs = []
    for row in data:
        w1 = row["word1"]
        w2 = row["word2"]
        pair = w1+"-"+w2
        pairs.append(pair)
    return pairs

def comparison_contains_one_of(words, comparison):
    # comparison is a row of one of the datasets
    if comparison['word1'] in words_with_fewer_than_x_tokens:
        return True
    elif comparison['word2'] in words_with_fewer_than_x_tokens:
        return True
    else:
        return False


cluster_sizes = [1,2,3,4,5,6,7,8,9,10,50]

for dataset_name, dataset in datasets.items():
    print(dataset_name)
    print("number of comparisons in %s: %s" % (dataset_name, len(dataset))) 

        
    unique_words = unique_words_in_dataset(dataset)
    print("number of unique words in %s: %s" % (dataset_name, len(unique_words)))
    
    # see how many of the words we have data for
#     words_not_collected = words_not_collected_for(unique_words)
#     print("number of words not collected: %s" % len(words_not_collected))
#     print("list of words not collected: ")
#     print_words(words_not_collected)
        
    for cluster_size in cluster_sizes:
        
        # did we collect enough tokens to make clusters for all words
        words_with_fewer_than_x_tokens = words_with_fewer_than_x_tokens_for(unique_words, cluster_size)
        

        
        if len(words_with_fewer_than_x_tokens) > 0:
        
            print("%s clusters" % cluster_size)

            num_words_with_x_tokens = len(unique_words) - len(words_with_fewer_than_x_tokens)
            print("# words with enough tokens: %s" % num_words_with_x_tokens)
            print("not enough tokens to make %s clusters for %s words:" % (cluster_size, len(words_with_fewer_than_x_tokens)))
            print_words(words_with_fewer_than_x_tokens)
                  
        comparisons_we_couldnt_perform = comparisons_not_performed_for(dataset, words_with_fewer_than_x_tokens)
        if len(comparisons_we_couldnt_perform) > 0:
            print("not enough tokens to calculate %s cluster scores for %s pairs:" % (cluster_size, len(words_with_fewer_than_x_tokens)))
            for comparison in comparisons_we_couldnt_perform:
                  w1 = comparison['word1']
                  w2 = comparison['word2']
                  print("%s - %s" % (w1,w2))
            print()
    print()
    
    

ws_353_sim
number of comparisons in ws_353_sim: 203
number of unique words in ws_353_sim: 277
8 clusters
# words with enough tokens: 276
not enough tokens to make 8 clusters for 1 words:
	aluminum
not enough tokens to calculate 8 cluster scores for 1 pairs:
aluminum - metal

9 clusters
# words with enough tokens: 275
not enough tokens to make 9 clusters for 2 words:
	aluminum
	kilometer
not enough tokens to calculate 9 cluster scores for 2 pairs:
mile - kilometer
aluminum - metal

10 clusters
# words with enough tokens: 275
not enough tokens to make 10 clusters for 2 words:
	aluminum
	kilometer
not enough tokens to calculate 10 cluster scores for 2 pairs:
mile - kilometer
aluminum - metal

50 clusters
# words with enough tokens: 270
not enough tokens to make 50 clusters for 7 words:
	madhouse
	aluminum
	kilometer
	theater
	rooster
	carnivore
	artifact
not enough tokens to calculate 50 cluster scores for 7 pairs:
asylum - madhouse
food - rooster
tiger - carnivore
cup - artifact
mile - k

5 clusters
# words with enough tokens: 749
not enough tokens to make 5 clusters for 2 words:
	donut
	ipod
not enough tokens to calculate 5 cluster scores for 2 pairs:
cafe - donut
chair - ipod
ipod - rope
donut - panda

6 clusters
# words with enough tokens: 749
not enough tokens to make 6 clusters for 2 words:
	donut
	ipod
not enough tokens to calculate 6 cluster scores for 2 pairs:
cafe - donut
chair - ipod
ipod - rope
donut - panda

7 clusters
# words with enough tokens: 748
not enough tokens to make 7 clusters for 3 words:
	donut
	colorful
	ipod
not enough tokens to calculate 7 cluster scores for 3 pairs:
colorful - outfit
cafe - donut
colorful - toy
colorful - frame
colorful - duck
colorful - wood
colorful - lab
chair - ipod
ipod - rope
donut - panda

8 clusters
# words with enough tokens: 748
not enough tokens to make 8 clusters for 3 words:
	donut
	colorful
	ipod
not enough tokens to calculate 8 cluster scores for 3 pairs:
colorful - outfit
cafe - donut
colorful - toy
colorful -

In [31]:
# percentage of comparisons made

results_file = '../data/uncollected_pairs.csv'
with open(results_file, mode='w') as disk:
    writer = csv.writer(disk)

    for dataset_name, dataset in datasets.items():
        writer.writerow([dataset_name])
        
        print(dataset_name)
        unique_words = unique_words_in_dataset(dataset)

        words_seen = []
        pairs_seen = []

        for cluster_size in cluster_sizes:
            words_with_fewer_than_x_tokens = words_with_fewer_than_x_tokens_for(unique_words, cluster_size)
            comparisons_we_couldnt_perform = comparisons_not_performed_for(dataset, words_with_fewer_than_x_tokens)



            total_comparisons = len(dataset)
            performed_comparisons = total_comparisons - len(comparisons_we_couldnt_perform)
            percentage_performed = (performed_comparisons / total_comparisons) * 100
            rounded_percentage = round(percentage_performed, 2)

            #print("Clusters: %s\tPercentage of Comparisons Performed: %s" %(cluster_size, rounded_percentage))
            if percentage_performed < 100 and cluster_size < 50:
                pairs = word_pairs(comparisons_we_couldnt_perform)
                word_list = "{" + ", ".join(words_with_fewer_than_x_tokens) + "}"
                pair_list = "{" + ", ".join(pairs) + "}"
                res = "%s\t%s\t%s\t%s" % (cluster_size, rounded_percentage, word_list, pair_list)
                print(res)
                writer.writerow([cluster_size, rounded_percentage, word_list, pair_list])

ws_353_sim
8	99.51	{aluminum}	{aluminum-metal}
9	99.01	{aluminum, kilometer}	{mile-kilometer, aluminum-metal}
10	99.01	{aluminum, kilometer}	{mile-kilometer, aluminum-metal}
ws353_rel
9	99.6	{kilometer}	{territory-kilometer}
10	99.6	{kilometer}	{territory-kilometer}
ws353
8	99.72	{aluminum}	{aluminum-metal}
9	99.15	{kilometer, aluminum}	{mile-kilometer, territory-kilometer, aluminum-metal}
10	99.15	{kilometer, aluminum}	{mile-kilometer, territory-kilometer, aluminum-metal}
simlex
4	99.8	{orthodontist}	{orthodontist-dentist, doctor-orthodontist}
5	99.7	{disorganize, orthodontist}	{orthodontist-dentist, doctor-orthodontist, disorganize-organize}
6	99.7	{disorganize, orthodontist}	{orthodontist-dentist, doctor-orthodontist, disorganize-organize}
7	99.7	{disorganize, orthodontist}	{orthodontist-dentist, doctor-orthodontist, disorganize-organize}
8	99.5	{aluminum, disorganize, orthodontist}	{metal-aluminum, tin-aluminum, orthodontist-dentist, doctor-orthodontist, disorganize-organize}
9	99.