In [None]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from glob import glob
from tqdm.auto import tqdm
import re

import gensim.downloader as api

def replace_invalid_with_nan(vector):
    """Return a new array in which every entry of `vector` equal to 0.0 is NaN.

    Entries that are not exactly zero are carried over unchanged.
    """
    is_zero = vector == 0.0
    return np.where(is_zero, np.nan, vector)

def glove_vector(words, wv):
    """Return `wv.get_mean_vector(words)` for the given list of tokens.

    The tokens are passed to the model unchanged (no key prefix is added).
    """
    mean_vector = wv.get_mean_vector(words)
    return mean_vector

def add_cn_prefix(word):
    """Prepend the '/c/en/' key prefix to `word` and return the result."""
    prefixed = f'/c/en/{word}'
    return prefixed

def cn_vector(words, wv):
    """Look up `words` in a model whose keys carry the '/c/en/' prefix.

    Parameters
    ----------
    words : list of str
        Tokens of the expression to embed.
    wv : object
        Model supporting membership tests and `get_mean_vector`.

    Returns
    -------
    The result of `wv.get_mean_vector`. For multi-token input the tokens are
    first joined with '_' and, if that single compound key is in `wv`, only
    the compound is used; otherwise every token is prefixed and looked up
    individually.
    """
    if len(words) > 1:
        compound = add_cn_prefix('_'.join(words))
        # Membership operator rather than calling the dunder method directly.
        if compound in wv:
            return wv.get_mean_vector([compound])
    return wv.get_mean_vector([add_cn_prefix(word) for word in words])
    
def get_static_embed(word, wv, mode):
    """Embed a (possibly multi-word) string with a static word-vector model.

    Parameters
    ----------
    word : str
        Text to embed; it is split on whitespace into tokens.
    wv : object
        Word-vector model handed on to `glove_vector` / `cn_vector`.
    mode : str
        'glove' to use `glove_vector`, 'cn' to use `cn_vector`.

    Returns
    -------
    The vector produced by the selected helper. If every component of that
    vector equals 0.0 (presumably: no token was found in the model -- confirm
    against the model's `get_mean_vector` behaviour), all components are
    replaced by NaN so the item can be recognised as invalid downstream.

    Raises
    ------
    ValueError
        If `mode` is neither 'glove' nor 'cn'.
    """
    # Validate the mode up front: previously an unknown mode left `vector`
    # unbound and failed later with an unrelated-looking UnboundLocalError.
    if mode == 'glove':
        embed = glove_vector
    elif mode == 'cn':
        embed = cn_vector
    else:
        raise ValueError(f"unknown mode {mode!r}; expected 'glove' or 'cn'")

    vector = embed(word.split(), wv)

    # np.all evaluates the comparison in one vectorised step instead of
    # iterating the array element by element with the builtin all().
    if np.all(vector == 0.0):
        return replace_invalid_with_nan(vector)
    return vector

def unpack_anns(list_of_annotations):
    """Pair each annotation's position with its whole-annotation value.

    Returns a list of `(index, annotation['whole']['wholeAnnotation'])`
    tuples in input order.
    """
    pairs = []
    for position, annotation in enumerate(list_of_annotations):
        whole_text = annotation['whole']['wholeAnnotation']
        pairs.append((position, whole_text))
    return pairs

# Register tqdm with pandas so that Series.progress_map (used when computing
# the embeddings) is available.
tqdm.pandas()

In [None]:
# Source locations, resolved against the current working directory.
input_dir = osp.abspath('../collected_data/')
kilogram_dir = osp.abspath('../kilogram')

# Derived locations below the collected data.
processed_dir = osp.join(input_dir, 'processed')
output_dir = osp.join(processed_dir, 'static_encodings')

# Create the output directory (including missing parents) if it is absent.
if not osp.isdir(output_dir):
    print(f'make directory {output_dir}')
    os.makedirs(output_dir)

In [None]:
def clean_str(s):
    """Strip surrounding whitespace from `s` and lower-case the result."""
    trimmed = s.strip()
    return trimmed.lower()

def clean_raw_ann(s):
    """Normalise a raw annotation string.

    Every character that is not an ASCII letter, a digit or a space is
    deleted (not replaced), then the result is stripped and lower-cased.
    """
    kept_chars = re.sub(r'[^a-zA-Z0-9 ]', '', s)
    trimmed = kept_chars.strip()
    return trimmed.lower()

# Read the processed annotation table (first CSV column is the index) and
# normalise its three text columns.
input_file = osp.join(processed_dir, 'final_processed_data.csv')

ann_df = pd.read_csv(input_file, index_col=0).assign(
    raw_annotation=lambda df: df['raw_annotation'].map(clean_raw_ann),
    clean_annotation=lambda df: df['clean_annotation'].map(clean_str),
    head_noun=lambda df: df['head_noun'].map(clean_str),
)

# Sorted lists of the distinct tangram and scene values in the table.
tangrams = sorted(ann_df['tangram'].unique())
scenes = sorted(ann_df['scene'].unique())

In [None]:
# Embed the annotation head nouns and WordNet lemmas, the KiloGram annotations
# and the scene labels with each static embedding model, writing one .npz
# archive per output. Raw and clean annotations are left out.

# Index columns stored next to the annotation embeddings; they do not depend
# on the embedding model, so they are extracted once.
# NOTE(review): if these index arrays end up object-dtype, np.savez pickles
# them and np.load will need allow_pickle=True -- confirm with the consumers.
ann_idx_array = ann_df[['item_identifyer', 'tangram', 'scene']].values

# The KiloGram tables depend only on the split, not on the embedding model, so
# each split is read and exploded once here instead of once per model.
exploded_kilogram_dfs = {}
for split in 'dense', 'dense10':
    kilogram_path = osp.join(kilogram_dir, 'dataset', f'{split}.json')
    kilogram_df = pd.read_json(kilogram_path).T

    # keep only the rows labelled with a tangram that occurs in ann_df
    kilogram_df = kilogram_df.loc[tangrams]
    kilogram_df['annotation_tuples'] = kilogram_df.annotations.map(unpack_anns)

    # one row per (annotation index, whole annotation) tuple
    exploded_kilogram_df = kilogram_df.explode('annotation_tuples')
    exploded_kilogram_df['ann_idx'] = exploded_kilogram_df.annotation_tuples.map(lambda x: x[0])
    exploded_kilogram_df['ann'] = exploded_kilogram_df.annotation_tuples.map(lambda x: x[1])

    # identifier = '<row index label>_<annotation index>'; the index is then
    # moved into a column that is renamed to 'tangram'
    exploded_kilogram_df['item_identifyer'] = exploded_kilogram_df.apply(lambda x: f'{x.name}_{x.ann_idx}', axis=1)
    exploded_kilogram_df = exploded_kilogram_df.reset_index().rename(columns={'index': 'tangram'})

    exploded_kilogram_dfs[split] = exploded_kilogram_df

for model, model_name in [
        ('glove-wiki-gigaword-300', 'glove'),
        ('conceptnet-numberbatch-17-06-300', 'cn')
    ]:

    print(f'load embeddings for {model_name} / {model}')
    word_vectors = api.load(model)

    def get_embeddings(word):
        """Embed `word` with the model loaded for the current iteration."""
        return get_static_embed(word, wv=word_vectors, mode=model_name)

    print('calculate ann embeddings')

    head_embeds_series = ann_df['head_noun'].progress_map(get_embeddings)
    head_embeds = np.stack(head_embeds_series.to_list())

    wn_embeds_series = ann_df['wn_lemma'].progress_map(get_embeddings)
    wn_embeds = np.stack(wn_embeds_series.to_list())

    outfile = osp.join(output_dir, f'./ann_{model_name}_embeddings.npz')
    print(f'write to file: {outfile}')
    np.savez(outfile, text_idx=ann_idx_array, text_head_emb=head_embeds, text_wn_emb=wn_embeds)

    print('calculate tangram embeddings')
    for split in 'dense', 'dense10':
        print(f'extracting features for {split} split')
        exploded_kilogram_df = exploded_kilogram_dfs[split]

        # extract features
        kg_idx_array = exploded_kilogram_df[['item_identifyer', 'tangram', 'ann_idx']].values

        embeds_series = exploded_kilogram_df['ann'].progress_map(get_embeddings)
        embeds = np.stack(embeds_series.to_list())

        outfile = osp.join(output_dir, f'./{split}_{model_name}_embeddings.npz')
        print(f'write to file: {outfile}')
        np.savez(outfile, text_idx=kg_idx_array, text_emb=embeds)

    print('calculate scene category embeddings')
    sc_idx_array = np.array(scenes)
    # underscores in scene names are turned into spaces so that each part is
    # treated as a separate token by get_static_embed
    embeds_list = [get_embeddings(s.replace('_', ' ')) for s in sc_idx_array]
    embeds = np.stack(embeds_list)

    outfile = osp.join(output_dir, f'./scene_{model_name}_embeddings.npz')
    print(f'write to file: {outfile}')
    np.savez(outfile, img_idx=sc_idx_array, img_emb=embeds)