# Export Data for CLOOB Ablation Study
### This notebook exports data for the CLOOB ablation analysis done after the interactive article was accepted by VISxAI. 

In [None]:
! pip install git+https://github.com/ginihumer/Amumo.git

In [3]:
import amumo
from amumo import data as am_data
from amumo import utils as am_utils
from amumo import model as am_model

In [4]:
import os
def create_dir_if_not_exists(dir):
    if not os.path.exists(dir):
        os.mkdir(dir)
    return dir

In [5]:
export_directory = './exported_data_checkpoints/'
create_dir_if_not_exists(export_directory)

'./exported_data_checkpoints/'

### Text-Image

In [34]:


def export_data(dataset_name, images, prompts, models):

    # create folder structure
    dataset_directory = create_dir_if_not_exists(export_directory + dataset_name)
    similarities_dir = create_dir_if_not_exists(dataset_directory + '/similarities')

    # export projections and similarities
    import torch
    from sklearn.decomposition import PCA
    from openTSNE import TSNE
    from umap import UMAP
    import numpy as np
    import pandas as pd
    import json

    # if there already exists a dataset with projections from prior exports, load it
    if not os.path.exists(dataset_directory + '/projections.csv'):
        projections_df = pd.DataFrame({'emb_id': list(np.arange(0,len(images),1))+list(np.arange(0,len(prompts),1)), 'data_type':['image']*len(images)+['text']*len(prompts)})
    else:
        projections_df = pd.read_csv(dataset_directory + '/projections.csv')
    

    for model in models:
        # compute embeddings
        image_embedding_gap, text_embedding_gap, logit_scale = am_utils.get_embedding(model, dataset_name, images, prompts)
        image_embedding_nogap, text_embedding_nogap = am_utils.get_closed_modality_gap(image_embedding_gap, text_embedding_gap)
        
        for image_embedding, text_embedding, mode in [(image_embedding_gap, text_embedding_gap, ''), (image_embedding_nogap, text_embedding_nogap, '_nogap')]:
            
            # compute similarities
            similarity_image_text, similarity = am_utils.get_similarity(image_embedding, text_embedding)
            np.savetxt('%s/%s%s.csv'%(similarities_dir,model.model_name,mode), similarity, delimiter=',')
            
            # compute meta information and similarity clustering
            meta_info = {}
            meta_info['gap_distance'] = float(am_utils.get_modality_distance(image_embedding, text_embedding))
            meta_info['loss'] = float(am_utils.calculate_val_loss(image_embedding, text_embedding, logit_scale.exp()))

            idcs, clusters, clusters_unsorted = am_utils.get_cluster_sorting(similarity_image_text)
            cluster_labels = []
            cluster_sizes = []
            for c in set(clusters):
                cluster_size = int(np.count_nonzero(clusters==c))
                cluster_label = am_utils.get_textual_label_for_cluster(np.where(clusters_unsorted==c)[0], prompts)
                cluster_labels.append(cluster_label)
                cluster_sizes.append(cluster_size)

            idcs_reverse = np.argsort(idcs)
            meta_info['cluster_sort_idcs'] = idcs.tolist()
            meta_info['cluster_sort_idcs_reverse'] = idcs_reverse.tolist()
            meta_info['cluster_sizes'] = cluster_sizes
            meta_info['cluster_labels'] = cluster_labels
            # print(meta_info)

            with open("%s/%s%s_meta_info.json"%(similarities_dir, model.model_name, mode), "w") as file:
                json.dump(meta_info, file)

            # compute projections
            embedding = np.array(torch.concatenate([image_embedding, text_embedding]))

            projection_methods = {
                'PCA': PCA,
                'UMAP': UMAP,
                'TSNE': TSNE
            }
            for method in projection_methods.keys():
                if method == 'PCA':
                    proj = projection_methods[method](n_components=2)
                else:
                    proj = projection_methods[method](n_components=2, metric='cosine', random_state=31415)
                
                if method == 'TSNE':
                    low_dim_data = proj.fit(embedding)
                else:
                    low_dim_data = proj.fit_transform(embedding)
                
                projections_df['%s%s_%s_x'%(model.model_name, mode, method)] = low_dim_data[:,0]
                projections_df['%s%s_%s_y'%(model.model_name, mode, method)] = low_dim_data[:,1]


    projections_df.to_csv(dataset_directory + '/projections.csv')

In [35]:

# reuse mscoco subset from previous analysis
from PIL import Image
import numpy as np

class Custom_Dataset(am_data.DatasetInterface):
    name = 'MSCOCO-Val'

    def __init__(self, path, seed=54, batch_size=None):
        # create triplet dataset if it does not exist
        super().__init__(path, seed, batch_size)
        # path: path to the triplet dataset
        image_paths = [path + "images/%i.jpg"%i for i in range(100)]

        all_images = []
        for image_path in image_paths:
            with open(image_path, "rb") as fopen:
                image = Image.open(fopen).convert("RGB")
                all_images.append(image)

        self.all_images = np.array(all_images)
        
        with open(path + "/prompts.txt", "r") as file:
            self.all_prompts = file.read().splitlines()

mscoco_val_dataset_name = "MSCOCO-Val_size-100"
dataset_mscoco_val = Custom_Dataset(export_directory + mscoco_val_dataset_name + '/')
mscoco_val_images, mscoco_val_prompts = dataset_mscoco_val.get_data()

  self.all_images = np.array(all_images)
  self.all_images = np.array(all_images)


In [37]:
# TODO: export data for the models from the ablation study
export_data(mscoco_val_dataset_name, mscoco_val_images, mscoco_val_prompts, [am_model.ImageBind_Model()])

found cached embeddings for MSCOCO-Val_size-100_ImageBind_huge
