In [None]:
import os
import os.path as osp
from itertools import product
import pandas as pd
import numpy as np
from glob import glob
from tqdm.auto import tqdm
import re

import gensim.downloader as api

def replace_invalid_with_nan(vector):
    """Return ``vector`` with every entry equal to 0.0 replaced by NaN.

    The result is built with ``np.where``; all other entries are taken from
    ``vector`` unchanged.
    """
    zero_mask = vector == 0.0
    return np.where(zero_mask, np.nan, vector)

def glove_vector(words, wv):
    """Return the mean vector of ``words`` as computed by ``wv.get_mean_vector``.

    ``words`` is passed through unchanged; ``wv`` is the embedding model
    providing ``get_mean_vector``.
    """
    mean_vec = wv.get_mean_vector(words)
    return mean_vec

def add_cn_prefix(word):
    """Prepend the ``/c/en/`` prefix to ``word`` and return the resulting key."""
    prefix = '/c/en/'
    return f'{prefix}{word}'

def cn_vector(words, wv):
    """Return a vector for ``words`` using ``/c/en/``-prefixed keys.

    For more than one word, the words are first joined with underscores and
    looked up as a single prefixed key; if ``wv`` contains that key, its
    vector (via ``get_mean_vector`` on the single key) is returned.
    Otherwise every word is prefixed individually and the mean over those
    keys is returned.
    """
    is_multiword = len(words) > 1
    if is_multiword:
        joined_key = add_cn_prefix('_'.join(words))
        if joined_key in wv:
            return wv.get_mean_vector([joined_key])

    prefixed_keys = list(map(add_cn_prefix, words))
    return wv.get_mean_vector(prefixed_keys)
    
def get_static_embed(word, wv, mode):
    """Embed a (possibly multi-word) string with a static embedding model.

    Parameters
    ----------
    word : str or any
        Text to embed; it is split on whitespace into tokens. Non-string
        values (e.g. ``None``) and strings without any token yield an
        all-NaN vector.
    wv : embedding model
        Object providing ``vector_size`` and ``get_mean_vector``.
    mode : {'glove', 'cn'}
        Selects ``glove_vector`` or ``cn_vector`` for the lookup.

    Returns
    -------
    numpy.ndarray
        Vector of length ``wv.vector_size``. If the looked-up vector is all
        zeros it is returned as all-NaN instead (presumably this happens
        when none of the tokens is in the vocabulary -- TODO confirm).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'glove'`` nor ``'cn'`` (only checked for
        string input, so non-string input keeps returning the NaN fallback).
    """
    if not isinstance(word, str):
        # fallback for e.g. None inputs
        return np.full([wv.vector_size], np.nan)

    if mode not in ('glove', 'cn'):
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(f"unknown mode {mode!r}; expected 'glove' or 'cn'")

    words = word.split()
    if not words:
        # Empty / whitespace-only text has no tokens to average; treat it
        # like other invalid input instead of passing an empty key list on.
        return np.full([wv.vector_size], np.nan)

    if mode == 'glove':
        vector = glove_vector(words, wv)
    else:
        vector = cn_vector(words, wv)

    # np.all reduces in NumPy rather than iterating element-wise in Python.
    if np.all(vector == 0.0):
        return replace_invalid_with_nan(vector)
    return vector

def unpack_anns(list_of_annotations):
    """Pair each annotation's position with its ``['whole']['wholeAnnotation']`` value.

    Returns a list of ``(index, whole_annotation)`` tuples in input order.
    """
    pairs = []
    for position, annotation in enumerate(list_of_annotations):
        pairs.append((position, annotation['whole']['wholeAnnotation']))
    return pairs

# Register tqdm's progress-bar variants of pandas methods (e.g. `progress_apply`).
tqdm.pandas()

In [None]:
# Switches selecting which variant of the processed predictions file is read.
twostep_predictions = True
fewshot_predictions = False

processed_dir = osp.abspath('../predicted_data/processed')

# File name: base name plus optional suffixes, always in this order.
input_file = osp.join(
    processed_dir,
    'processed_predictions'
    + ('_twostep' if twostep_predictions else '')
    + ('_fewshot' if fewshot_predictions else '')
    + '.csv',
)

# Directory for the embedding files; created on first use.
output_dir = osp.join(processed_dir, 'static_encodings')
if not osp.isdir(output_dir):
    print(f'make directory {output_dir}')
    os.makedirs(output_dir)

pred_df = pd.read_csv(input_file, index_col=0)

# Every `response_<model>` column identifies one model.
response_cols = list(filter(lambda name: name.startswith('response_'), pred_df.columns))
models = [name.replace('response_', '') for name in response_cols]

# Sorted distinct values of the `tangram` and `scene` columns.
tangrams = sorted(pd.unique(pred_df['tangram']))
scenes = sorted(pd.unique(pred_df['scene']))

In [None]:
# leave out raw and clean anns

# Identifier column, saved as `text_idx` in every output file.
idx_array = pred_df.item_identifyer.values
cols = ['response', 'label']

# Stem of the input file name without its directory. `input_file` is an
# absolute path, and osp.join() discards all earlier components once a later
# one is absolute -- joining the full stem would silently ignore `output_dir`
# and write next to the input CSV instead.
output_stem = osp.splitext(osp.basename(input_file))[0]

for emb_model, emb_model_name in [
        ('glove-wiki-gigaword-300', 'glove'),
        ('conceptnet-numberbatch-17-06-300', 'cn')
    ]:

    print(f'load embeddings for {emb_model_name} / {emb_model}')
    word_vectors = api.load(emb_model)

    def get_embeddings(word):
        # Bound to the embedding model loaded in the current iteration.
        return get_static_embed(word, wv=word_vectors, mode=emb_model_name)

    # Maps '<response|label>_<model>' to an array stacking one vector per row.
    results = dict()

    for model, rl in tqdm(list(product(models, cols))):
        col = f'{rl}_{model}'
        print(f'calculate embeddings for {col}')

        embeds = pred_df[col].map(get_embeddings)
        results[col] = np.stack(embeds.to_list())

    outfile = osp.join(output_dir, f'{output_stem}_{emb_model_name}_embeddings.npz')
    print(outfile)
    np.savez(outfile, text_idx=idx_array, **results)