# Replace excluded concepts


## Basis: stratified sampling

## Steps:
* vocabulary information has been updated with mrc norms for all concepts (also for the ones extracted from the space) :check: 
* recreated bins on updated data --> copy to the this repo :check:
* rerun concept dataset lexical information script :check:
* move new data to current repo - check this!
* analyze dataset in terms of bins :check:
* draw from remaining candidates in underrepresented bins  :check:
* make random aspect replicable


Move to py scripts and:
* restructure data respository and make sure scripts align
* write new set to files in data repository




In [1]:
import json
import csv
import numpy as np
import math
from collections import Counter
import random

In [12]:
def load_lexical_data():
    # this is what we originally sampled from
    path = '../../data_lexical_info/all_lodce_mrc.csv'
    
    with open(path) as infile:
        dicts = list(csv.DictReader(infile))
    word_info_dict = defaultdict(list)
    for d in dicts:
        word = d['word']
        word_info_dict[word].append(d)
    return word_info_dict


def get_concepts_set(p, col):
    concept_info_dict = dict()

    path = f'../../data_all_candidates/concepts_additional_info/{col}/{p}.csv'

    with open(path) as infile:
        reader = csv.DictReader(infile)
        dicts = list(reader)

    for d in dicts:
        concept = d['lemma']
        filter_dec = d['filter']
        if filter_dec == 'True':
            concept_info_dict[concept] = d
    return concept_info_dict



def load_general_bins():
    with open('../../vocabulary_data/bins_updated.json') as infile:
        bin_dict_general = json.load(infile)
    return bin_dict_general

def load_cosine_bins_prop(set_info_dict):
    #set_info_dict = get_concepts_set(p, col)
    cosines = [float(d['cosine_centroid']) for c, d in set_info_dict.items()]
    values, bin_intervals = np.histogram(cosines, bins = 3)
    bin_dict_cos = bins_to_dict('cosine_centroid', values, bin_intervals)
    return bin_dict_cos
 
def bins_to_dict(name, values, bin_intervals, 
                 mapping=None, restriction=None, 
                 bin_type='distribution'):

    bin_dict = dict()
    bin_dict[name] =  {
    'type' : bin_type,
    'mapping' : mapping,
    'bins' : [],
    'frequencies' : [int(f) for f in list(values)],
    'restriction' : restriction
    }


    for n, i in enumerate(bin_intervals):
        if n != len(bin_intervals) - 1:
            bin_dict[name]['bins'].append((i, bin_intervals[n+1]))
        else:
            break
    return bin_dict


def assign_to_bin(concept_dict, bin_dict, name):

    #get_polysemy_info(concept_dict)

    if name == 'polysemy':
        concept_value = get_polysemy_info(concept_dict)
        target_bin = concept_value
    else:
        if concept_dict[name] != '':
            concept_value = float(concept_dict[name])
            if bin_dict[name]['mapping'] == 'log':
                concept_value = math.log(concept_value)
            n_bins = len(bin_dict[name]['bins'])
            for n, interval in enumerate(bin_dict[name]['bins']):
                start, end = interval
                if n < (n_bins-1):
                    if start <= concept_value < end:
                        target_bin = n
                        break
                    else:
                        target_bin = None
                else:
                    if start <= concept_value <= end:
                        target_bin = n
                        break
                    else:
                        target_bin = None
                    
        else:
            target_bin = None
    return target_bin


def get_polysemy_info(concept_dict):

    word = concept_dict['word']
    mipvu_met = concept_dict['mipvu']
    polysemy_type = concept_dict['polysemy_type']

    if polysemy_type == 'mon':
        poly = 'mon'
    elif polysemy_type == 'homonyms_also_same_pos':
        poly = 'homonym'
    elif mipvu_met == 'True':
        poly = 'met'
    # Possibly metonymy if not metaphor and not homonym
    # caveat: the metaphor annotations are not exhaustive
    elif polysemy_type == 'poly':
        poly = 'poly_metonymy'
    else:
        poly = None
    return poly


def get_bin_feature_dict(general_bin_dict, concept_dicts):
    concept_features_dict = dict()
    for concept_dict in concept_dicts:
        features_dict = dict()
        concept = concept_dict['lemma']
        for name in general_bin_dict.keys():
            target_bin = assign_to_bin(concept_dict, general_bin_dict, name)
            features_dict[name] = target_bin
        features_dict['label']  = concept_dict['label']
        concept_features_dict[concept] = features_dict
    return concept_features_dict


def get_ranked_bin_imbalances(general_bin_dict, set_bin_features, concepts_selected):
    name_diff_dict = dict()
    n_concepts = len(concepts_selected)
    
    bin_diff_tuples = []
    for name in general_bin_dict:
        bin_concept_cnt = Counter()
        n_bins = len(general_bin_dict[name]['bins'])
        n_equal_distribution = n_concepts/n_bins
        for concept in concepts_selected:
            f = set_bin_features[concept][name]
            bin_concept_cnt[f] += 1
        
        for bin_name, cnt in bin_concept_cnt.items():
            diff_to_equal = n_equal_distribution - cnt
            diff_to_equal_percent = diff_to_equal/n_concepts
            # only include if there are fewer concepts than expected:
            if diff_to_equal > 0:
                bin_diff_tuples.append((diff_to_equal_percent, name, bin_name))
    
    # sort from biggest to smallest:
    sorted_diff_name_tuples = sorted(bin_diff_tuples, reverse=True)
    return sorted_diff_name_tuples


def find_equivalents(set_bin_features, concepts_not_selected, concept_dicts_exclude):
    
    replacement_concepts = set()
    #no_replacement_found_concepts = set()
    
    features_not_selected = dict()
    for concept in concepts_not_selected:
        features_not_selected[concept] = set_bin_features[concept]

    for d in concept_dicts_exclude:
        concept = d['lemma']
        features = set_bin_features[concept]
        if features in features_not_selected.values():
            for concept_available, feats_available in features_not_selected.items():
                if features == feats_available:
                    replacement_concepts.add(concept_available)
                    break

        #else:
         #   no_replacement_found_concepts.add(concept)
    return replacement_concepts

In [13]:
# Create mapping between prop and collection name - load entire set
props_collection_dict = {'used_in_cooking': 'complex', 'warm': 'perceptual', 'black': 'perceptual'}
# used_in_cooking
p = 'used_in_cooking'
col = props_collection_dict[p]
set_info_dict = get_concepts_set(p, col)
for c, info_dict in set_info_dict.items():
    print(c, info_dict.keys())
    break
print(len(set_info_dict))

opener dict_keys(['label', 'categories_str', 'sources_str', 'certainty', 'cosine_centroid', 'manual_coarse_grained', 'space_selection', 'qumcrae_label', 'word', 'lemma', 'wiki_frequency', 'word_in_wn?', 'word_noun_in_wn?', 'word_noun_spacy?', 'n_navigli_clusters', 'n_onto_senses_n_v', 'n_wn_senses', 'min_wn_sim_wup', 'av_sim_wup', 'polysemy_type', 'mipvu', 'wn_abs_conc', 'filter', 'conc', 'fam', 'aoa'])
250


In [14]:
# get bins
general_bin_dict = load_general_bins()
bin_dict_cosine = load_cosine_bins_prop(set_info_dict)
general_bin_dict.update(bin_dict_cosine)

#general_bin_dict

In [15]:
test_concept, test_concept_dict = list(set_info_dict.items())[10]
print(test_concept)
print(test_concept_dict['cosine_centroid'])
target_bin = assign_to_bin(test_concept_dict, general_bin_dict, 'cosine_centroid')
target_bin

gavel
0.2067427267


0

In [16]:
# load_concept_dicts include and excluse
concept_dicts_exclude, concept_dicts_include =  get_excluded_included_concepts(p)
print(len(concept_dicts_include), len(concept_dicts_exclude))

concept_dicts_total = concept_dicts_include + concept_dicts_exclude
concepts_selected = set([d['lemma'] for d in concept_dicts_total])

total_concepts = set(set_info_dict.keys())
concepts_not_selected = total_concepts.difference(concepts_selected)
concept_dicts_not_selected = [d for c, d in set_info_dict.items()\
                              if c in concepts_not_selected]


# sanity check:
# should print empty set
print(concepts_selected.intersection(concepts_not_selected))
print(f'Concepts still available for sampling: {len(concept_dicts_not_selected)}')

157 22
set()
Concepts still available for sampling: 71


In [160]:
# sort concepts into bins:

# sort excluded concepts into bins
# sort remaining dataset into bins

# For each excluded concept, draw a new one from the same bin 
# If the same bin is empty, draw from another, smaller bin

In [17]:
# all bin names
print(len(concepts_selected))
set_bin_features = get_bin_feature_dict(general_bin_dict, set_info_dict.values())
direct_replacements = find_equivalents(set_bin_features, concepts_not_selected, concept_dicts_exclude)
print(direct_replacements)
concepts_selected.update(direct_replacements)
print(len(concepts_selected))
n_to_replace = len(concept_dicts_exclude) - len(direct_replacements)
print('Still to replace: ', n_to_replace)

179
{'puree', 'spreader', 'straightedge', 'vermicelli', 'bolo', 'pineapple'}
185
Still to replace:  16


In [18]:
# print imablance before sampling
# add labels info
general_bin_dict['label'] = {'bins': ['pos', 'neg', 'pos/neg', 'neg/pos']}
bins_sorted_original = get_ranked_bin_imbalances(general_bin_dict, set_bin_features, concepts_selected)

for b in bins_sorted_original:
    print(b)

(0.3279279279279279, 'conc', 0)
(0.3279279279279279, 'aoa', 2)
(0.3225225225225225, 'fam', 0)
(0.3009009009009009, 'conc', 1)
(0.27387387387387385, 'aoa', 1)
(0.26846846846846845, 'fam', 1)
(0.24684684684684682, 'aoa', 0)
(0.2445945945945946, 'label', 'pos/neg')
(0.17972972972972973, 'polysemy', 'homonym')
(0.13333333333333333, 'wiki_frequency', 1)
(0.057657657657657645, 'cosine_centroid', 1)
(0.04144144144144143, 'cosine_centroid', 0)
(0.019819819819819808, 'fam', 2)
(0.006756756756756757, 'polysemy', 'mon')


In [27]:
# try to balance bins by selecting a word from the one that is least balanced 

def resample_missing_concepts(bins_sorted_original, 
                              n_to_replace, 
                              concepts_not_selected,
                              concepts_selected,
                              set_bin_features):
    replacement_concepts = set()
    bins_sorted = bins_sorted_original
    while len(replacement_concepts) < n_to_replace:
        # get bin overview
        for bin_tuple in bins_sorted:
            if len(replacement_concepts) == n_to_replace:
                print('found enough!')
                break
            name = bin_tuple[1]
            bin_name = bin_tuple[2]
            concepts_not_selected_shuff = list(concepts_not_selected)
            random.shuffle(concepts_not_selected_shuff)
            # shuffle original concept list so it's not sorted by cosine distance
            for c in concepts_not_selected_shuff:
                features = set_bin_features[c]
                if len(replacement_concepts) == n_to_replace:
                    print('found enough inside!')
                    break
                if features[name] == bin_name:
                    print('replacement found in ', name, bin_name)
                    replacement_concepts.add(c)        
        concepts_selected.update(replacement_concepts)
        bins_sorted = get_ranked_bin_imbalances(general_bin_dict, set_bin_features, concepts_selected)
    return replacement_concepts

In [28]:
resample_missing_concepts(bins_sorted_original, 
                              n_to_replace, 
                              concepts_not_selected,
                              concepts_selected,
                              set_bin_features)

replacement found in  fam 1
replacement found in  aoa 0
replacement found in  wiki_frequency 1
replacement found in  wiki_frequency 1
replacement found in  wiki_frequency 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
replacement found in  cosine_centroid 1
found enough inside!
found enough!


{'bean',
 'bolo',
 'chopper',
 'crampon',
 'flour',
 'ham',
 'hoe',
 'plunger',
 'scallop',
 'scythe',
 'sledgehammer',
 'spinach',
 'spoon',
 'spreader',
 'strainer',
 'teaspoon'}

In [23]:
for b in bins_sorted:
    print(b)   

(0.3284552845528455, 'conc', 0)
(0.3284552845528455, 'aoa', 2)
(0.3235772357723577, 'fam', 0)
(0.30406504065040646, 'conc', 1)
(0.27967479674796747, 'aoa', 1)
(0.26991869918699185, 'fam', 1)
(0.2504065040650406, 'aoa', 0)
(0.2451219512195122, 'label', 'pos/neg')
(0.18658536585365854, 'polysemy', 'homonym')
(0.1382113821138211, 'wiki_frequency', 1)
(0.060162601626016235, 'cosine_centroid', 0)
(0.0260162601626016, 'fam', 2)
(0.016260162601625994, 'cosine_centroid', 1)
(0.0016260162601625784, 'conc', 2)
(0.0012195121951219512, 'polysemy', 'mon')


In [24]:
for b1, b2 in zip(bins_sorted_original, bins_sorted):
    print(b1, b2)

(0.3279279279279279, 'conc', 0) (0.3284552845528455, 'conc', 0)
(0.3279279279279279, 'aoa', 2) (0.3284552845528455, 'aoa', 2)
(0.3225225225225225, 'fam', 0) (0.3235772357723577, 'fam', 0)
(0.3009009009009009, 'conc', 1) (0.30406504065040646, 'conc', 1)
(0.27387387387387385, 'aoa', 1) (0.27967479674796747, 'aoa', 1)
(0.26846846846846845, 'fam', 1) (0.26991869918699185, 'fam', 1)
(0.24684684684684682, 'aoa', 0) (0.2504065040650406, 'aoa', 0)
(0.2445945945945946, 'label', 'pos/neg') (0.2451219512195122, 'label', 'pos/neg')
(0.17972972972972973, 'polysemy', 'homonym') (0.18658536585365854, 'polysemy', 'homonym')
(0.13333333333333333, 'wiki_frequency', 1) (0.1382113821138211, 'wiki_frequency', 1)
(0.057657657657657645, 'cosine_centroid', 1) (0.060162601626016235, 'cosine_centroid', 0)
(0.04144144144144143, 'cosine_centroid', 0) (0.0260162601626016, 'fam', 2)
(0.019819819819819808, 'fam', 2) (0.016260162601625994, 'cosine_centroid', 1)
(0.006756756756756757, 'polysemy', 'mon') (0.00162601626

In [38]:
for c in replacement_concepts:
    print(c)

flour
scallop
rhubarb
ham
edger
strainer
hoe
toaster
bolo
bean
spreader
chopper
grasshopper
teaspoon
razor
sledgehammer
crampon
spinach
scythe
parer
plunger
spoon
