In [None]:
import os.path as osp
import pandas as pd
from collections import Counter
import sys
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob

sys.path.append('../analyzing_annotations')
from analysis_utils import read_ann_df, clean_wl, naming_div, display_img

# Directories used by this notebook, each resolved to an absolute path.
data_dir = osp.abspath('../collected_data/processed')
kilogram_dir = osp.abspath('../kilogram')
IMG_LOCATION = osp.abspath('../generated_items/')

# Display settings: show up to 200 rows and do not truncate column contents.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

In [None]:
def make_synset(s):
    """Coerce a WordNet synset name into a synset object.

    Parameters
    ----------
    s : str or Synset
        A synset identifier such as ``'dog.n.01'``, or a value that is
        already a synset.

    Returns
    -------
    Synset
        ``wn.synset(s)`` when ``s`` is a string; any other value is returned
        unchanged.

    Raises
    ------
    Whatever ``wn.synset`` raises for a name it cannot resolve.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_; with an exact comparison those would be
    # passed through as plain text and only fail later, when synset methods
    # are called on them.
    if isinstance(s, str):
        return wn.synset(s)
    return s

def get_hypernyms(synset, include_self=True):
    """Collect all direct and inherited hypernyms of a synset.

    Parameters
    ----------
    synset : str or Synset
        Starting point; strings are resolved through ``make_synset``.
    include_self : bool, default True
        Whether the starting synset itself is part of the result. Ancestors
        are always included regardless of this flag.

    Returns
    -------
    set
        Every synset reachable by repeatedly following ``.hypernyms()``,
        plus the starting synset when ``include_self`` is true.
    """
    root = make_synset(synset)
    found = set()
    pending = list(root.hypernyms())
    while pending:
        current = pending.pop()
        if current in found:
            # Already expanded through another path. Skipping it avoids
            # walking shared ancestors repeatedly and guarantees termination
            # should the relation ever contain a loop.
            continue
        found.add(current)
        pending.extend(current.hypernyms())
    if include_self:
        found.add(root)
    return found

def get_hyponyms(synset, include_self=True):
    """Collect all direct and inherited hyponyms of a synset.

    Parameters
    ----------
    synset : str or Synset
        Starting point; strings are resolved through ``make_synset``.
    include_self : bool, default True
        Whether the starting synset itself is part of the result.
        Descendants are always included regardless of this flag.

    Returns
    -------
    set
        Every synset reachable by repeatedly following ``.hyponyms()``,
        plus the starting synset when ``include_self`` is true.
    """
    root = make_synset(synset)
    found = set()
    pending = list(root.hyponyms())
    while pending:
        current = pending.pop()
        if current in found:
            # A synset can sit below several parents. Expanding it only once
            # keeps the traversal linear in the size of the subtree and
            # guarantees termination should the relation ever contain a loop.
            continue
        found.add(current)
        pending.extend(current.hyponyms())
    if include_self:
        found.add(root)
    return found

def is_hypernym_of(synset, *reference_synsets, include_self=True):
    """Tell whether ``synset`` is a hypernym of at least one reference synset.

    Parameters
    ----------
    synset : str or Synset
        Candidate hypernym.
    *reference_synsets : str or Synset
        Synsets whose hypernyms are searched. With no references the result
        is always ``False``.
    include_self : bool, default True
        Passed on to ``get_hypernyms``: whether each reference counts as one
        of its own hypernyms.

    Returns
    -------
    bool
    """
    candidate = make_synset(synset)
    # Gather the hypernym set of every reference before testing membership.
    ancestor_sets = [
        get_hypernyms(make_synset(reference), include_self=include_self)
        for reference in reference_synsets
    ]
    return candidate in set().union(*ancestor_sets)

def is_hyponym_of(synset, *reference_synsets, include_self=True):
    """Tell whether ``synset`` is a hyponym of at least one reference synset.

    Parameters
    ----------
    synset : str or Synset
        Candidate hyponym.
    *reference_synsets : str or Synset
        Synsets whose hyponyms are searched. With no references the result
        is always ``False``.
    include_self : bool, default True
        Passed on to ``get_hyponyms``: whether each reference counts as one
        of its own hyponyms.

    Returns
    -------
    bool
    """
    candidate = make_synset(synset)
    # Gather the hyponym set of every reference before testing membership.
    descendant_sets = [
        get_hyponyms(make_synset(reference), include_self=include_self)
        for reference in reference_synsets
    ]
    return candidate in set().union(*descendant_sets)

def get_first_lemma(synset):
    """Return the first entry of a synset's ``lemma_names()`` list.

    ``synset`` may be a synset object or a synset name; names are resolved
    through ``make_synset``.
    """
    resolved = make_synset(synset)
    lemma_names = resolved.lemma_names()
    return lemma_names[0]

In [None]:
# Read the validated annotations. For head nouns written with a slash, keep
# only the part before the first '/', with surrounding whitespace removed.
input_file = osp.join(data_dir, 'valid_processed_collected_data.csv')
ann_df = read_ann_df(input_file)
ann_df['head_noun'] = ann_df['head_noun'].apply(lambda noun: noun.split('/')[0].strip())

# Index entries are unpacked as (tangram, scene) pairs; keep the sorted
# distinct values of each component.
tangrams, scenes = (sorted(set(values)) for values in zip(*ann_df.index))

# Lookup tables between tangram names and their position in the sorted list.
tangram2idx = {tangram: position for position, tangram in enumerate(tangrams)}
idx2tangram = dict(enumerate(tangrams))

# Rename the generic 'comments' column to mark it as annotation-side comments.
ann_df = ann_df.rename(columns={'comments': 'ann_comments'})

display(ann_df.head())

In [None]:
# Load the human-checked WordNet annotation files. glob() returns matches in
# arbitrary order, so sort them to get the same row order on every run.
input_files = sorted(glob(osp.join(data_dir, 'valid_processed_synsets_*_human.csv')))
wn_dfs = [pd.read_csv(input_file, index_col=0) for input_file in input_files]
wn_anns = pd.concat(wn_dfs).rename(columns={'comments': 'wn_comments'})

# both annotation tables must describe exactly the same set of items
assert set(wn_anns.item_identifyer.unique()) == set(ann_df.item_identifyer.unique())

# replace empty synsets with entity.n.01
wn_anns['selected_synset'] = wn_anns['selected_synset'].fillna('entity.n.01')
# map synset strings to synsets
wn_anns['selected_synset_obj'] = wn_anns['selected_synset'].map(wn.synset)
# update definitions
wn_anns['synset_definition'] = wn_anns['selected_synset_obj'].map(lambda x: x.definition())
# normalize head noun using WordNet
wn_anns['wn_lemma'] = wn_anns['selected_synset_obj'].map(get_first_lemma)
# replace head_noun entries with corrected versions
# Assigning through wn_anns[mask].head_noun would only modify a temporary
# copy; a single .loc call writes into wn_anns itself. The right-hand side is
# passed as a plain array so values are matched by position: the concatenated
# frame may carry repeated index labels, which label alignment cannot handle.
corrected_mask = wn_anns['corrected_head_noun'].notna()
wn_anns.loc[corrected_mask, 'head_noun'] = (
    wn_anns.loc[corrected_mask, 'corrected_head_noun'].to_numpy()
)

wn_anns.head()

In [None]:
# Columns taken from the annotation table (after its index is turned back
# into regular columns).
ref_columns = [
    'item_identifyer', 'tangram', 'scene', 'raw_annotation', 'ann_comments',
    'tangram_id', 'kilogram_snd', 'item_id', 'workspace_name',
    'partition_name', 'dataset_name', 'tangram_pos', 'image_url',
    'meta_record', 'user_name', 'status', 'time', 'valid', 'order_idx',
]

# Columns taken from the WordNet annotation table.
wn_columns = [
    'item_identifyer', 'clean_annotation', 'head_noun', 'wn_lemma',
    'selected_synset', 'synset_definition', 'wn_comments',
]

# Column layout of the exported table.
col_order = [
    'item_identifyer', 'tangram', 'scene', 'raw_annotation',
    'clean_annotation', 'head_noun', 'wn_lemma', 'selected_synset',
    'synset_definition', 'tangram_id', 'item_id', 'image_url',
    'ann_comments', 'wn_comments', 'kilogram_snd', 'workspace_name',
    'partition_name', 'dataset_name', 'tangram_pos', 'user_name',
    'meta_record', 'time', 'order_idx', 'status', 'valid',
]

# Join both tables on the shared item identifier (default inner join) and
# bring the columns into the export order.
ann_part = ann_df.reset_index()[ref_columns]
wn_part = wn_anns[wn_columns]
merged_df = ann_part.merge(wn_part, on='item_identifyer')[col_order]

# Write the combined table next to the other processed files.
out_path = osp.join(data_dir, 'final_processed_data.csv')
merged_df.to_csv(out_path)

In [None]:
# Preview the first rows of the merged table.
merged_df.head()

In [None]:
# Rows whose synset is 'entity.n.01', the value substituted earlier for
# annotations without a selected synset.
merged_df[merged_df['selected_synset'] == 'entity.n.01']

In [None]:
# Horizontal bar chart of the 30 most frequent WordNet lemmas; the order is
# reversed so the most frequent lemma ends up at the top of the chart.
top_lemma_counts = wn_anns['wn_lemma'].value_counts().head(30)
top_lemma_counts.iloc[::-1].plot.barh()
plt.grid()

## \# Entries per Semantic Group

In [None]:
# Top-level synsets for which annotation entries are counted.
semantic_groups = ('person.n.01', 'animal.n.01', 'artifact.n.01')

# Maps each group synset to the number of annotations whose selected synset
# is that synset or one of its hyponyms.
entries_per_synset = {}
for ref_synset in semantic_groups:
    ref_hyponyms = get_hyponyms(ref_synset)
    hyponym_mask = wn_anns['selected_synset_obj'].isin(ref_hyponyms)
    hyponym_entries = wn_anns.loc[hyponym_mask]
    entries_per_synset[ref_synset] = len(hyponym_entries)

In [None]:
# Show the number of annotation entries found for each semantic group.
entries_per_synset

## \# Hyponyms per Synset

In [None]:
# Noun synsets that still have to be examined. Asking WordNet for nouns
# directly avoids iterating over every part of speech and filtering by hand.
synset_query = set(wn.all_synsets('n'))
wn_analysis_results = list()

# Number of annotations per selected synset, computed once so the loop does
# not rescan the whole annotation table for every candidate synset.
annotation_counts = Counter(wn_anns.selected_synset_obj)
n_annotations = len(wn_anns)

# The bar counts synsets that are resolved (examined or pruned), so it reaches
# its total exactly when the query set is empty. The context manager closes it
# even if the loop is interrupted.
with tqdm(total=len(synset_query)) as pbar:
    while synset_query:
        n_remaining = len(synset_query)

        # pop() returns an arbitrary element and removes it in one step
        ref_synset = synset_query.pop()
        ref_hyponyms = get_hyponyms(ref_synset)

        # annotations whose synset is ref_synset itself or lies below it
        n_hyponyms = sum(
            count for synset, count in annotation_counts.items()
            if synset in ref_hyponyms
        )

        if n_hyponyms > 0:
            # n_annotations cannot be zero here, because at least one
            # annotation was counted
            wn_analysis_results.append({
                'synset': ref_synset.name(),
                'n_hyponyms': n_hyponyms,
                'hyponym_ratio': n_hyponyms / n_annotations
            })
        else:
            # No annotation uses this synset or anything below it, so no
            # synset in that subtree can match either; drop them all at once.
            synset_query -= ref_hyponyms

        pbar.update(n_remaining - len(synset_query))

In [None]:
# One row per synset that covers at least one annotation, ordered from the
# largest share of annotations to the smallest; show the top 25.
results_df = (
    pd.DataFrame(wn_analysis_results)
    .set_index('synset')
    .sort_values(by='hyponym_ratio', ascending=False)
)
results_df.head(25)