In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from glob import glob
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from tqdm.auto import tqdm
import re

# Register tqdm's progress_* methods on pandas objects; the cells below rely on
# Series.progress_map to show a progress bar while embedding each column.
tqdm.pandas()

In [2]:
# Directory handed to from_pretrained as cache_dir for the CLIP checkpoint files.
clip_path = osp.abspath('../analyzing_annotations/clip_model')

# Model and processor are loaded from the same checkpoint id, so name it once.
clip_checkpoint = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(clip_checkpoint, cache_dir=clip_path)
print('model: success')

processor = CLIPProcessor.from_pretrained(clip_checkpoint, cache_dir=clip_path)
print('processor: success')

model: success
processor: success


In [3]:
def get_text_features(s, clip_model=model, clip_processor=processor, return_numpy=True):
    """Embed text with the CLIP text encoder.

    Args:
        s: A string, or a list of strings, to embed.
        clip_model: Model exposing ``get_text_features``; defaults to the
            notebook-level ``model``.
        clip_processor: Processor that tokenizes ``s``; defaults to the
            notebook-level ``processor``.
        return_numpy: If True, return a detached NumPy array; otherwise return
            the tensor produced by the model.

    Returns:
        The text features with singleton dimensions removed by ``squeeze()``,
        so a single string yields a vector rather than a one-row matrix.
    """
    # padding=True allows a list of texts with different token counts to be
    # packed into one tensor; truncation=True clips over-long texts to the
    # tokenizer's maximum length instead of letting the model fail on them.
    inputs = clip_processor(text=s, return_tensors='pt', padding=True, truncation=True)
    outputs = clip_model.get_text_features(**inputs).squeeze()
    return outputs.detach().numpy() if return_numpy else outputs


def get_image_features(img, clip_model=model, clip_processor=processor, return_numpy=True):
    """Embed an image with the CLIP image encoder.

    Args:
        img: Image (or images) accepted by ``clip_processor``.
        clip_model: Model exposing ``get_image_features``.
        clip_processor: Processor that turns ``img`` into model inputs.
        return_numpy: If True, return a detached NumPy array; otherwise return
            the tensor produced by the model.

    Returns:
        The image features with singleton dimensions removed by ``squeeze()``.
    """
    pixel_inputs = clip_processor(images=img, return_tensors='pt')
    features = clip_model.get_image_features(**pixel_inputs).squeeze()
    if return_numpy:
        return features.detach().numpy()
    return features


def get_features_for_img_path(img_path, clip_model=model, clip_processor=processor, return_numpy=True):
    """Open the image at ``img_path`` and return its CLIP image features.

    Args:
        img_path: Path of the image file to embed.
        clip_model: Forwarded to ``get_image_features``.
        clip_processor: Forwarded to ``get_image_features``.
        return_numpy: Forwarded to ``get_image_features``.

    Returns:
        Whatever ``get_image_features`` returns for the opened image.
    """
    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases the handle once the features are computed,
    # so embedding many files in a loop does not accumulate open descriptors.
    with Image.open(img_path) as img:
        return get_image_features(img, clip_model, clip_processor, return_numpy)

In [4]:
# Input locations, resolved relative to the notebook's working directory.
input_dir = osp.abspath('../collected_data/')
processed_dir = osp.join(input_dir, 'processed')
kilogram_dir = osp.abspath('../kilogram')

# All embedding archives written by this notebook go here.
output_dir = osp.join(processed_dir, 'clip_encodings')
if not osp.isdir(output_dir):
    print(f'make directory {output_dir}')
# exist_ok=True makes the creation idempotent: it does not fail if the
# directory appears between the check above and this call.
os.makedirs(output_dir, exist_ok=True)

In [5]:
def clean_str(s):
    """Return ``s`` with surrounding whitespace removed and lower-cased."""
    trimmed = s.strip()
    return trimmed.lower()

def clean_raw_ann(s):
    """Normalise a raw annotation string.

    Every character other than ASCII letters, digits and the space character is
    deleted (not replaced), then the result is stripped and lower-cased.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9 ]', '', s)
    trimmed = alnum_only.strip()
    return trimmed.lower()

input_file = osp.join(processed_dir, 'final_processed_data.csv')

# The first CSV column is used as the row index.
ann_df = pd.read_csv(input_file, index_col=0)

# Normalise the annotation text columns in place: raw annotations additionally
# lose every non-alphanumeric character, the others are only stripped/lower-cased.
ann_df['raw_annotation'] = ann_df['raw_annotation'].map(clean_raw_ann)
for text_column in ('clean_annotation', 'head_noun'):
    ann_df[text_column] = ann_df[text_column].map(clean_str)

# Sorted distinct values, reused by later cells.
tangrams = sorted(ann_df['tangram'].unique())
scenes = sorted(ann_df['scene'].unique())

In [6]:
def _embed_annotation_column(column):
    """Embed every value of ``ann_df[column]``; return (per-row Series, stacked array)."""
    per_row = ann_df[column].progress_map(get_text_features)
    return per_row, np.stack(per_row.to_list())


# Row identifiers saved alongside the embeddings so rows can be matched back to ann_df.
idx_array = ann_df[['item_identifyer', 'tangram', 'scene']].values

# One embedding matrix per text column, computed in the same order as before.
raw_embeds_series, raw_embeds = _embed_annotation_column('raw_annotation')
clean_embeds_series, clean_embeds = _embed_annotation_column('clean_annotation')
head_embeds_series, head_embeds = _embed_annotation_column('head_noun')
wn_embeds_series, wn_embeds = _embed_annotation_column('wn_lemma')

outfile = osp.join(output_dir, './ann_clip_embeddings.npz')
np.savez(
    outfile,
    text_idx=idx_array,
    text_raw_emb=raw_embeds,
    text_clean_emb=clean_embeds,
    text_head_emb=head_embeds,
    text_wn_emb=wn_embeds,
)

  0%|          | 0/4070 [00:00<?, ?it/s]

  0%|          | 0/4070 [00:00<?, ?it/s]

  0%|          | 0/4070 [00:00<?, ?it/s]

  0%|          | 0/4070 [00:00<?, ?it/s]

In [7]:
def unpack_anns(list_of_annotations):
    """Pair each annotation's position with its ``['whole']['wholeAnnotation']`` value.

    Returns a list of ``(index, whole_annotation)`` tuples in input order.
    """
    indexed = []
    for position, annotation in enumerate(list_of_annotations):
        indexed.append((position, annotation['whole']['wholeAnnotation']))
    return indexed

# For each split file under <kilogram_dir>/dataset, embed the whole-shape
# annotation text of every tangram listed in `tangrams` and save the result
# as <split>_clip_embeddings.npz in output_dir.
for split in 'dense', 'dense10':
    print(f'extracting features for {split} split')

    kilogram_path = osp.join(kilogram_dir, 'dataset', f'{split}.json')
    # Transpose so the JSON's top-level keys become the row index
    # (presumably tangram ids, since they are looked up with `tangrams` below — TODO confirm).
    kilogram_df = pd.read_json(kilogram_path).T

    # Keep only the rows labelled by the tangrams found in ann_df; .loc raises
    # KeyError if a label is missing from this split.
    kilogram_df = kilogram_df.loc[tangrams]
    kilogram_df['annotation_tuples'] = kilogram_df.annotations.map(unpack_anns)

    # One row per (annotation index, annotation text) tuple; the index label is
    # repeated for every annotation of the same row.
    exploded_kilogram_df = kilogram_df.explode('annotation_tuples')
    exploded_kilogram_df['ann_idx'] = exploded_kilogram_df.annotation_tuples.map(lambda x: x[0])
    exploded_kilogram_df['ann'] = exploded_kilogram_df.annotation_tuples.map(lambda x: x[1])

    # x.name is the row's index label, so identifiers look like '<label>_<ann_idx>'.
    # This must run before reset_index, which replaces the labels with integers.
    exploded_kilogram_df['item_identifyer'] = exploded_kilogram_df.apply(lambda x: f'{x.name}_{x.ann_idx}', axis=1)
    # Move the index labels into a regular column and call it 'tangram'.
    exploded_kilogram_df = exploded_kilogram_df.reset_index().rename(columns={'index': 'tangram'})

    # extract features: one text embedding per exploded annotation row

    idx_array = exploded_kilogram_df[['item_identifyer', 'tangram', 'ann_idx']].values

    embeds_series = exploded_kilogram_df['ann'].progress_map(get_text_features)
    embeds = np.stack(embeds_series.to_list())

    # Saved arrays: text_idx (row identifiers) and text_emb (stacked embeddings).
    outfile = osp.join(output_dir, f'./{split}_clip_embeddings.npz')
    np.savez(outfile, text_idx=idx_array, text_emb=embeds)

extracting features for dense split


  0%|          | 0/1978 [00:00<?, ?it/s]

extracting features for dense10 split


  0%|          | 0/370 [00:00<?, ?it/s]

In [8]:
# Every entry in the generated-scenes directory, in sorted path order.
images = sorted(glob('../generated_scenes/*'))

# File names without their directory identify the rows of the embedding matrix.
idx_array = np.array([osp.basename(path) for path in images])

embeds_list = []
for path in tqdm(images):
    embeds_list.append(get_features_for_img_path(path))
embeds = np.stack(embeds_list)

outfile = osp.join(output_dir, './scene_clip_embeddings.npz')
np.savez(outfile, img_idx=idx_array, img_emb=embeds)

  0%|          | 0/44 [00:00<?, ?it/s]