# Analyze stratified sampling


## Aspects considered

* wikipedia frequency
* Concreteness (mrc)
* familiarity (mrc)
* age of acquisition (mrc)
* polysemy
* similarity to centroid --> in sampling script


In [75]:
# Original bin function (use it for cosine similarity to centroid)
import numpy as np
import math
from collections import defaultdict
import json
import random

def bins_to_dict(name, frequencies, bin_intervals, mapping, restriction, bin_type):

    bin_dict = dict()
    bin_dict[name] =  {
    'type' : bin_type,
    'mapping' : mapping,
    'bins' : [],
    'frequencies' : [int(f) for f in list(frequencies)],
    'restriction' : restriction
    }


    for n, i in enumerate(bin_intervals):
        if n != len(bin_intervals) - 1:
            bin_dict[name]['bins'].append((i, bin_intervals[n+1]))
        else:
            break
    return bin_dict

def get_bins_from_distribution(concept_dict_list, name, n_bins, \
                            mapping = False, restriction = None):

    if mapping == False:
        if restriction == None:
            values = [float(d[name]) for d in concept_dict_list if d[name] != '']
        else:
            print('restricting data')
            values = [float(d[name]) for d in concept_dict_list if \
                    (d[name] != '') and (float(d[name]) != restriction)]
    elif mapping == 'log':
        print('taking the log')
        if restriction == None:
            values = [math.log(float(d[name])) for d in concept_dict_list if d[name] != '']
        else:
            print('taking log and restricting data')
            values = [math.log(float(d[name])) for d in concept_dict_list if \
                    (d[feature] != '') and (float(d[name]) != restriction)]

    frequencies, bin_intervals = np.histogram(values, bins = n_bins)
    print(bin_intervals)
    print(frequencies)
    bin_dict = bins_to_dict(name, frequencies, bin_intervals, mapping,\
                            restriction, 'distribution')
    #plt.hist(values, bins = n_bins)
    #plt.gca().set(title='Frequency Histogram', ylabel='frequency', xlabel=feature)
    #plt.show()
    return bin_dict


def assign_to_bin(concept_dict, bin_dict, name):

    #get_polysemy_info(concept_dict)

    if name == 'polysemy':
        concept_value = get_polysemy_info(concept_dict)
        bin_assigned = concept_value
    else:
        if concept_dict[name] != '':
            concept_value = float(concept_dict[name])
            if bin_dict[name]['mapping'] == 'log':
                concept_value = math.log(concept_value)
            for n, interval in enumerate(bin_dict[name]['bins']):
                start, end = interval
                if start <= concept_value < end:
                    bin_assigned = n
                    break
                else:
                    bin_assigned = None
        else:
            bin_assigned = None
    return bin_assigned


def get_polysemy_info(concept_dict):

    word = concept_dict['word']
    mipvu_met = concept_dict['mipvu']
    polysemy_type = concept_dict['polysemy_type']

    if polysemy_type == 'mon':
        poly = 'mon'
    elif polysemy_type == 'homonyms_also_same_pos':
        poly = 'homonym'
    elif mipvu_met == 'True':
        poly = 'met'
    # Possibly metonymy if not metaphor and not homonym
    # caveat: the metaphor annotations are not exhaustive
    elif polysemy_type == 'poly':
        poly = 'poly_metonymy'
    else:
        poly = None
    return poly

def get_bin_distribution(bin_dict, concept_dicts, concept_info_dict, sampling_name):
    bin_concept_dict = defaultdict(list)
    #if sampling_name == 'polysemy':
     #   sampling_name = sampling_name

    for d in concept_dicts:
        concept = d['lemma']
        info = concept_info_dict[concept]
        bin_assigned = assign_to_bin(info, bin_dict, sampling_name)
        bin_concept_dict[bin_assigned].append(concept)

    return bin_concept_dict


def draw_random_sample_from_list(concept_dict_list):

    # chose a random integer (lenth is out of list index)
    selected_int = random.randint(0, len(concept_dict_list)-1)
    # get data item from list and remove selected item from list
    selected_item = concept_dict_list.pop(selected_int)

    return selected_item, selected_int


def load_lexical_data():
    # this is what we originally sampled from
    path = '../../vocabulary_data/all_lodce_mrc.csv'
    
    with open(path) as infile:
        dicts = list(csv.DictReader(infile))
    word_info_dict = defaultdict(list)
    for d in dicts:
        word = d['word']
        word_info_dict[word].append(d)
    return word_info_dict

def get_concepts_set(p, coll):
    concept_info_dict = dict()

    p = 'used_in_cooking'
    col = props_collection_dict[p]
    path = f'../../data_lexical_info/concepts_additional_info/{col}/{p}.csv'

    with open(path) as infile:
        reader = csv.DictReader(infile)
        dicts = list(reader)

    for d in dicts:
        concept = d['lemma']
        concept_info_dict[concept] = d
    return concept_info_dict


In [45]:
# load general bins

with open('../vocabulary_data/bins.json') as infile:
    bin_dict_general = json.load(infile)
    
for k, v in bin_dict_general.items():
    print(k, v)

wiki_frequency {'type': 'distribution', 'mapping': 'log', 'bins': [[4.605170185988092, 9.300147501609214], [9.300147501609214, 13.995124817230336], [13.995124817230336, 18.690102132851457]], 'frequencies': [18678, 6823, 111], 'restriction': None}
conc {'type': 'distribution', 'mapping': False, 'bins': [[158.0, 328.66666666666663], [328.66666666666663, 499.3333333333333], [499.3333333333333, 670.0]], 'frequencies': [771, 1476, 1641], 'restriction': None}
fam {'type': 'distribution', 'mapping': False, 'bins': [[74.0, 268.33333333333337], [268.33333333333337, 462.6666666666667], [462.6666666666667, 657.0]], 'frequencies': [177, 1321, 2724], 'restriction': None}
aoa {'type': 'distribution', 'mapping': False, 'bins': [[125.0, 315.66666666666663], [315.66666666666663, 506.3333333333333], [506.3333333333333, 697.0]], 'frequencies': [416, 964, 488], 'restriction': None}
polysemy {'type': 'categories', 'mapping': None, 'bins': ['mon', 'met', 'poly_metonymy', 'homonym'], 'frequencies': [21844, 2

In [65]:
# load cosine bins of a property
import csv

props_collection_dict = {'used_in_cooking': 'complex', 'warm': 'perceptual'}



In [80]:
# check if words we do not have ratings for are 'weird'


# used_in_cooking
p = 'used_in_cooking'
coll = props_collection_dict[p]
set_info_dict = get_concepts_set(p, coll)
lexical_data_dict = load_lexical_data()

words_with_rating = set()
words_without_rating = set()

for w, info in set_info_dict.items():
    lexical_info = lexical_data_dict[w]
    fam = lexical_info[0]['fam']
    if fam != '':
        words_with_rating.add(w)
    else:
        words_without_rating.add(w)
        
print(words_with_rating)
print()
print(words_without_rating)

{'spade', 'cake', 'gravy', 'jack', 'sauce', 'pestle', 'cleaver', 'canary', 'file', 'tablespoon', 'hammer', 'dry', 'beef', 'tool', 'gavel', 'spatula', 'cabbage', 'shrimp', 'mustard', 'radish', 'knife', 'butter', 'garlic', 'lobster', 'slice', 'soup', 'bake', 'tap', 'carrot', 'plane', 'tomato', 'rice', 'spinach', 'bean', 'screwdriver', 'upset', 'pineapple', 'pie', 'see', 'corn', 'bill', 'cauliflower', 'cream', 'violin', 'eat', 'mallet', 'toaster', 'style', 'dough', 'axe', 'die', 'hatchet', 'buffer', 'square', 'pudding', 'lettuce', 'crank', 'pork', 'grasshopper', 'cook', 'round', 'lime', 'smooth', 'candy', 'cucumber', 'corkscrew', 'gang', 'spoon', 'salad', 'leek', 'ram', 'gutter', 'shovel', 'potato', 'mussel', 'cheese', 'awl', 'chisel', 'bass', 'flannel', 'bread', 'sparrow', 'punch', 'chicken', 'float', 'fork', 'noodle', 'beetle', 'rake', 'oven', 'pea', 'chop', 'vinegar', 'ham', 'hoe', 'apron', 'adze', 'mince', 'steak', 'bit', 'stew', 'pick', 'shear', 'meat', 'stove', 'pickle', 'broil', 'b

In [81]:
# used_in_cooking
p = 'warm'
coll = props_collection_dict[p]
set_info_dict = get_concepts_set(p, coll)
lexical_data_dict = load_lexical_data()

words_with_rating = set()
words_without_rating = set()

for w, info in set_info_dict.items():
    lexical_info = lexical_data_dict[w]
    fam = lexical_info[0]['fam']
    if fam != '':
        words_with_rating.add(w)
    else:
        words_without_rating.add(w)
        
print(words_with_rating)
print()
print(words_without_rating)

{'spade', 'cake', 'gravy', 'jack', 'sauce', 'pestle', 'cleaver', 'canary', 'file', 'tablespoon', 'hammer', 'dry', 'beef', 'tool', 'gavel', 'spatula', 'cabbage', 'shrimp', 'mustard', 'radish', 'knife', 'butter', 'garlic', 'lobster', 'slice', 'soup', 'bake', 'tap', 'carrot', 'plane', 'tomato', 'rice', 'spinach', 'bean', 'screwdriver', 'upset', 'pineapple', 'pie', 'see', 'corn', 'bill', 'cauliflower', 'cream', 'violin', 'eat', 'mallet', 'toaster', 'style', 'dough', 'axe', 'die', 'hatchet', 'buffer', 'square', 'pudding', 'lettuce', 'crank', 'pork', 'grasshopper', 'cook', 'round', 'lime', 'smooth', 'candy', 'cucumber', 'corkscrew', 'gang', 'spoon', 'salad', 'leek', 'ram', 'gutter', 'shovel', 'potato', 'mussel', 'cheese', 'awl', 'chisel', 'bass', 'flannel', 'bread', 'sparrow', 'punch', 'chicken', 'float', 'fork', 'noodle', 'beetle', 'rake', 'oven', 'pea', 'chop', 'vinegar', 'ham', 'hoe', 'apron', 'adze', 'mince', 'steak', 'bit', 'stew', 'pick', 'shear', 'meat', 'stove', 'pickle', 'broil', 'b

In [50]:
# load annotated data of a property



path = f'../data_pair_filtering/aggregated/experiment3/{p}.csv'

with open(path) as infile:
    concept_dicts_total = list(csv.DictReader(infile, delimiter = '\t'))

    
concept_dicts_include = [d for d in concept_dicts_total if d['decision'] == 'include']
concept_dicts_exclude = [d for d in concept_dicts_total\
                         if d['decision'].startswith('exclude')]

concepts_selected = set([d['lemma'] for d in concept_dicts_total])
total_concepts = set(concept_info_dict.keys())
concepts_not_selected = total_concepts.difference(concepts_selected)
concept_dicts_not_selected = [d for c, d in concept_info_dict.items()\
                              if c in concepts_not_selected]

# sanity check:
# should print empty set
print(concepts_selected.intersection(concepts_not_selected))
print(f'Concepts still available for sampling: {len(concept_dicts_not_selected)}')

set()
Concepts still available for sampling: 181


In [51]:
# get cosine similarity bins



name = 'cosine_centroid'
n_bins = 3
bin_dict_cosine = get_bins_from_distribution(dicts, name, n_bins, \
                                mapping = False, restriction = None)

[0.09478792 0.25340994 0.41203196 0.57065398]
[ 66 103 214]


In [82]:
# get bin per word
# cosine 


# inspect cosine bins
sampling_name = 'cosine_centroid' 
print(f'Distribution of {sampling_name}')
print()
print('Bins:')
for n, bin_interval in enumerate(bin_dict_cosine[sampling_name]['bins']):
    print(n, bin_interval)
print()


print(f'Total concepts:')
      
bin_concept_dict = get_bin_distribution(bin_dict_cosine,\
                                        concept_dicts_total, \
                                        concept_info_dict,\
                                        sampling_name)
       
for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    
print()
print(f'Concepts included:')     

bin_concept_dict = get_bin_distribution(bin_dict_cosine,\
                                        concept_dicts_include, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    

print()
print(f'Concepts excluded:')     

bin_concept_dict = get_bin_distribution(bin_dict_cosine,\
                                        concept_dicts_exclude, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    


Distribution of cosine_centroid

Bins:
0 (0.0947879187, 0.2534099397666667)
1 (0.2534099397666667, 0.4120319608333334)
2 (0.4120319608333334, 0.5706539819)

Total concepts:
0 31
1 67
2 81

Concepts included:
0 26
1 60
2 71

Concepts excluded:
0 5
1 7
2 10


In [95]:
# inspect fam
sampling_name = 'wiki_frequency' 

print(f'Distribution of {sampling_name}')
print()
print('Bins:')
for n, bin_interval in enumerate(bin_dict[sampling_name]['bins']):
    print(n, bin_interval)
print()


print(f'Total concepts:')
      
bin_concept_dict_total = get_bin_distribution(bin_dict,\
                                        concept_dicts_total, \
                                        concept_info_dict,\
                                        sampling_name)
       
for b, concepts in bin_concept_dict_total.items():
    print(b, len(concepts))
    
print()
print(f'Concepts included:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_include, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    


print()
print(f'Concepts excluded:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_exclude, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print('bin', b, ':', len(concepts), 'percentage of total in the bin:', round(len(concepts)/len(bin_concept_dict_total[b]), 2))

Distribution of wiki_frequency

Bins:
0 [4.605170185988092, 9.300147501609214]
1 [9.300147501609214, 13.995124817230336]
2 [13.995124817230336, 18.690102132851457]

Total concepts:
0 143
1 36

Concepts included:
0 128
1 29

Concepts excluded:
bin 1 : 7 percentage of total in the bin: 0.19
bin 0 : 15 percentage of total in the bin: 0.1


In [96]:
# inspect fam
sampling_name = 'fam' 

print(f'Distribution of {sampling_name}')
print()
print('Bins:')
for n, bin_interval in enumerate(bin_dict[sampling_name]['bins']):
    print(n, bin_interval)
print()


print(f'Total concepts:')
      
bin_concept_dict_total = get_bin_distribution(bin_dict,\
                                        concept_dicts_total, \
                                        concept_info_dict,\
                                        sampling_name)
       
for b, concepts in bin_concept_dict_total.items():
    print(b, len(concepts))
    
print()
print(f'Concepts included:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_include, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    


print()
print(f'Concepts excluded:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_exclude, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print('bin', b, ':', len(concepts), 'percentage of total in the bin:', round(len(concepts)/len(bin_concept_dict_total[b]), 2))

Distribution of fam

Bins:
0 [74.0, 268.33333333333337]
1 [268.33333333333337, 462.6666666666667]
2 [462.6666666666667, 657.0]

Total concepts:
2 56
1 12
None 109
0 2

Concepts included:
2 50
1 11
None 95
0 1

Concepts excluded:
bin 2 : 6 percentage of total in the bin: 0.11
bin 1 : 1 percentage of total in the bin: 0.08
bin None : 14 percentage of total in the bin: 0.13
bin 0 : 1 percentage of total in the bin: 0.5


In [97]:
# inspect fam
sampling_name = 'aoa' 

print(f'Distribution of {sampling_name}')
print()
print('Bins:')
for n, bin_interval in enumerate(bin_dict[sampling_name]['bins']):
    print(n, bin_interval)
print()


print(f'Total concepts:')
      
bin_concept_dict_total = get_bin_distribution(bin_dict,\
                                        concept_dicts_total, \
                                        concept_info_dict,\
                                        sampling_name)
       
for b, concepts in bin_concept_dict_total.items():
    print(b, len(concepts))
    
print()
print(f'Concepts included:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_include, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    


print()
print(f'Concepts excluded:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_exclude, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print('bin', b, ':', len(concepts), 'percentage of total in the bin:', round(len(concepts)/len(bin_concept_dict_total[b]), 2))

Distribution of aoa

Bins:
0 [125.0, 315.66666666666663]
1 [315.66666666666663, 506.3333333333333]
2 [506.3333333333333, 697.0]

Total concepts:
None 151
0 16
1 11
2 1

Concepts included:
None 134
0 14
1 8
2 1

Concepts excluded:
bin 1 : 3 percentage of total in the bin: 0.27
bin None : 17 percentage of total in the bin: 0.11
bin 0 : 2 percentage of total in the bin: 0.12


In [98]:
# inspect fam
sampling_name = 'conc' 

print(f'Distribution of {sampling_name}')
print()
print('Bins:')
for n, bin_interval in enumerate(bin_dict[sampling_name]['bins']):
    print(n, bin_interval)
print()


print(f'Total concepts:')
      
bin_concept_dict_total = get_bin_distribution(bin_dict,\
                                        concept_dicts_total, \
                                        concept_info_dict,\
                                        sampling_name)
       
for b, concepts in bin_concept_dict_total.items():
    print(b, len(concepts))
    
print()
print(f'Concepts included:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_include, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    


print()
print(f'Concepts excluded:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_exclude, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print('bin', b, ':', len(concepts), 'percentage of total in the bin:', round(len(concepts)/len(bin_concept_dict_total[b]), 2))

Distribution of conc

Bins:
0 [158.0, 328.66666666666663]
1 [328.66666666666663, 499.3333333333333]
2 [499.3333333333333, 670.0]

Total concepts:
2 61
None 111
1 6
0 1

Concepts included:
2 56
None 97
1 4

Concepts excluded:
bin 2 : 5 percentage of total in the bin: 0.08
bin 1 : 2 percentage of total in the bin: 0.33
bin None : 14 percentage of total in the bin: 0.13
bin 0 : 1 percentage of total in the bin: 1.0


In [118]:
# inspect polysemy
sampling_name = 'polysemy' 

print(f'Distribution of {sampling_name}')
print()
print('Bins:')
for n, bin_interval in enumerate(bin_dict[sampling_name]['bins']):
    print(n, bin_interval)
print()


print(f'Total concepts:')
      
bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_total, \
                                        concept_info_dict,\
                                        sampling_name)
       
for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    
print()
print(f'Concepts included:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_include, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))
    

print()
print(f'Concepts excluded:')     

bin_concept_dict = get_bin_distribution(bin_dict,\
                                        concept_dicts_exclude, \
                                        concept_info_dict,\
                                        sampling_name)

for b, concepts in bin_concept_dict.items():
    print(b, len(concepts))

Distribution of polysemy

Bins:
0 mon
1 met
2 poly_metonymy
3 homonym

Total concepts:
mon 43
None 64
poly_metonymy 59
homonym 13

Concepts included:
mon 37
None 54
poly_metonymy 55
homonym 11

Concepts excluded:
poly_metonymy 4
homonym 2
None 10
mon 6
