In [1]:
import clipexplorer
from clipexplorer import data as ce_data
from clipexplorer import utils as ce_utils

In [2]:
# Data Helpers
def get_data_helper(dataset, filters=None, method=any):
    """Fetch (optionally filtered) data from a dataset and derive an export name.

    Args:
        dataset: object exposing a ``name`` attribute and a
            ``get_filtered_data(filters, method=...)`` method that returns an
            ``(images, prompts)`` pair.
        filters: iterable of filter strings; ``None`` or empty means "no filter".
        method: callable forwarded to ``get_filtered_data`` (``any`` by default);
            its ``__name__`` becomes part of the dataset name when filters are set.

    Returns:
        Tuple ``(all_images, all_prompts, dataset_name)`` where ``dataset_name`` is
        ``<name>_filter-<method>_<f1>-<f2>...`` when filters are given and
        ``<name>_size-<number of images>`` otherwise.
    """
    # Snapshot the filters: avoids a shared mutable default and protects the
    # caller's list (and the generated name) from in-place changes downstream.
    filter_terms = [] if filters is None else list(filters)

    all_images, all_prompts = dataset.get_filtered_data(list(filter_terms), method=method)
    print(len(all_images))

    if filter_terms:
        suffix = '_filter-' + method.__name__ + '_' + '-'.join(filter_terms)
    else:
        suffix = '_size-%i' % len(all_images)

    return all_images, all_prompts, dataset.name + suffix

In [3]:
import os
def create_dir_if_not_exists(dir):
    """Ensure that directory ``dir`` exists and return the same path.

    Missing parent directories are created as well, so nested export paths
    work on a fresh checkout.

    Args:
        dir: path of the directory to create.

    Returns:
        The ``dir`` argument, unchanged, so calls can be used inline.
    """
    if not os.path.exists(dir):
        # makedirs creates intermediate folders; exist_ok tolerates another
        # process creating the directory between the check and this call.
        os.makedirs(dir, exist_ok=True)
    return dir

In [4]:
export_directory = './exported_data_checkpoints/'

def export_data(dataset_name, images, prompts):
    """Export one dataset (images, prompts, similarities, meta info, projections).

    Layout written below ``export_directory + dataset_name``:
        images/<i>.jpg                      resized copy of every image
        prompts.txt                         one prompt per line
        similarities/<model><mode>.csv      similarity matrix
        similarities/<model><mode>_meta_info.json
        projections.csv                     2D PCA / UMAP / TSNE coordinates

    ``<mode>`` is ``''`` for the original embeddings and ``'_nogap'`` for the
    embeddings returned by ``ce_utils.get_closed_modality_gap``.

    Args:
        dataset_name: name used for the output folder and forwarded to
            ``ce_utils.get_embedding``.
        images: sequence of image objects offering ``resize`` and ``save``
            (presumably PIL images -- confirm against the dataset classes).
        prompts: sequence of text prompts, one per image.
    """
    # heavy dependencies are only needed for exporting, so import them lazily
    import torch
    from sklearn.decomposition import PCA
    from openTSNE import TSNE
    from umap import UMAP
    import numpy as np
    import pandas as pd
    import json

    # create folder structure (the export root first, so that a fresh checkout works)
    create_dir_if_not_exists(export_directory)
    dataset_directory = create_dir_if_not_exists(export_directory + dataset_name)
    images_dir = create_dir_if_not_exists(dataset_directory + '/images')
    similarities_dir = create_dir_if_not_exists(dataset_directory + '/similarities')

    # save images
    for i, im in enumerate(images):
        # resize() returns a new image instead of modifying in place; keep the
        # result so the exported file really is 400x400. The caller's image is
        # left untouched and is still used at full size for the embeddings.
        im = im.resize((400,400))
        im.save('%s/%i.jpg'%(images_dir,i))

    # save texts (explicit encoding so non-ASCII prompts export identically on every platform)
    with open(dataset_directory + "/prompts.txt", "w", encoding="utf-8") as file:
        for prompt in prompts:
            file.write(prompt + "\n")

    # export projections and similarities
    # one row per embedding: all images first, then all texts
    projections_df = pd.DataFrame({'emb_id': list(np.arange(0,len(images),1))+list(np.arange(0,len(prompts),1)), 'data_type':['image']*len(images)+['text']*len(prompts)})

    projection_methods = {
        'PCA': PCA,
        'UMAP': UMAP,
        'TSNE': TSNE
    }

    for model in ['CLIP', 'CyCLIP', 'CLOOB', 'CLOOB_LAION400M']:
        # compute embeddings
        image_embedding_gap, text_embedding_gap, logit_scale = ce_utils.get_embedding(model, dataset_name, images, prompts)
        image_embedding_nogap, text_embedding_nogap = ce_utils.get_closed_modality_gap(image_embedding_gap, text_embedding_gap)

        for image_embedding, text_embedding, mode in [(image_embedding_gap, text_embedding_gap, ''), (image_embedding_nogap, text_embedding_nogap, '_nogap')]:

            # compute similarities
            similarity_image_text, similarity = ce_utils.get_similarity(image_embedding, text_embedding)
            np.savetxt('%s/%s%s.csv'%(similarities_dir,model,mode), similarity, delimiter=',')

            # compute meta information and similarity clustering
            meta_info = {}
            # float(...) keeps the values JSON-serializable
            meta_info['gap_distance'] = float(ce_utils.get_modality_distance(image_embedding, text_embedding))
            meta_info['loss'] = float(ce_utils.calculate_val_loss(image_embedding, text_embedding, logit_scale.exp()))

            idcs, clusters, clusters_unsorted = ce_utils.get_cluster_sorting(similarity_image_text)
            cluster_labels = []
            cluster_sizes = []
            for c in set(clusters):
                # size is counted on `clusters`, the label is derived from the
                # positions of the same cluster id in `clusters_unsorted`
                cluster_size = int(np.count_nonzero(clusters==c))
                cluster_label = ce_utils.get_textual_label_for_cluster(np.where(clusters_unsorted==c)[0], prompts)
                cluster_labels.append(cluster_label)
                cluster_sizes.append(cluster_size)

            # inverse permutation of the cluster sort order
            idcs_reverse = np.argsort(idcs)
            meta_info['cluster_sort_idcs'] = idcs.tolist()
            meta_info['cluster_sort_idcs_reverse'] = idcs_reverse.tolist()
            meta_info['cluster_sizes'] = cluster_sizes
            meta_info['cluster_labels'] = cluster_labels

            with open("%s/%s%s_meta_info.json"%(similarities_dir, model, mode), "w") as file:
                json.dump(meta_info, file)

            # compute projections on the stacked image + text embeddings
            # (same row order as projections_df)
            embedding = np.array(torch.concatenate([image_embedding, text_embedding]))

            for method, projection_cls in projection_methods.items():
                if method == 'PCA':
                    proj = projection_cls(n_components=2)
                else:
                    proj = projection_cls(n_components=2, metric='cosine', random_state=31415)

                if method == 'TSNE':
                    # openTSNE has no fit_transform; fit() returns the embedding
                    low_dim_data = proj.fit(embedding)
                else:
                    low_dim_data = proj.fit_transform(embedding)

                projections_df['%s%s_%s_x'%(model, mode, method)] = low_dim_data[:,0]
                projections_df['%s%s_%s_y'%(model, mode, method)] = low_dim_data[:,1]


    projections_df.to_csv(dataset_directory + '/projections.csv')

In [5]:
# Location of the MSCOCO validation data, defined once for this cell.
# TODO: update to a relative path
MSCOCO_VAL_PATH = '/Users/christina/Data/mscoco/validation/'
# Number of DiffusionDB samples for which rotated/noisy variants are exported.
N_EXAMPLE_IMAGES = 10

# subset of mscoco val dataset
dataset_mscoco_val = ce_data.MSCOCO_Val_Dataset(path=MSCOCO_VAL_PATH, batch_size=100)
mscoco_val_images, mscoco_val_prompts, mscoco_val_dataset_name = get_data_helper(dataset_mscoco_val, filters=[], method=any)
export_data(mscoco_val_dataset_name, mscoco_val_images, mscoco_val_prompts)

# subset of diffusionDB data
dataset_diffusiondb = ce_data.DiffusionDB_Dataset(path="2m_first_1k", batch_size=100)
diffusiondb_images, diffusiondb_prompts, diffusiondb_dataset_name = get_data_helper(dataset_diffusiondb)
export_data(diffusiondb_dataset_name, diffusiondb_images, diffusiondb_prompts)

# Analyse filtered subset
# (a fresh dataset object is created on purpose, exactly as before, in case
# the dataset keeps state between calls)
dataset_mscoco_val = ce_data.MSCOCO_Val_Dataset(path=MSCOCO_VAL_PATH, batch_size=100)
mscoco_val_images_dogs, mscoco_val_prompts_dogs, mscoco_val_dataset_dogs_name = get_data_helper(dataset_mscoco_val, filters=['dog'], method=any)
export_data(mscoco_val_dataset_dogs_name, mscoco_val_images_dogs, mscoco_val_prompts_dogs)

example_image_dir = create_dir_if_not_exists(export_directory + 'example_images/')
for img_id in range(N_EXAMPLE_IMAGES):
    # thumbnail export (currently disabled):
    # thumb = diffusiondb_images[img_id].copy()
    # thumb.thumbnail((100,100))
    # thumb.save(example_image_dir+str(img_id)+'.jpg')

    # Analyse rotated image
    dataset_rotated = ce_data.Rotate_Dataset(diffusiondb_images[img_id], diffusiondb_prompts[img_id], id=img_id)
    rotated_images, rotated_prompts, rotated_dataset_name = get_data_helper(dataset_rotated)
    export_data(rotated_dataset_name, rotated_images, rotated_prompts)

    # Analyze noisy image
    dataset_noise = ce_data.Noise_Dataset(diffusiondb_images[img_id], diffusiondb_prompts[img_id], id=img_id)
    noise_images, noise_prompts, noise_dataset_name = get_data_helper(dataset_noise)
    export_data(noise_dataset_name, noise_images, noise_prompts)


loading annotations into memory...
Done (t=0.07s)
creating index...
index created!
100


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
2023-06-26 13:51:31.152567: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


found cached embeddings for MSCOCO-Val_size-100_CLIP_RN50
found cached embeddings for MSCOCO-Val_size-100_CyCLIP_RN50
Loading model from /Users/christina/Workspace/CLIP-explorer/clipexplorer/CLOOB_local/training/model_configs/RN50.json
found cached embeddings for MSCOCO-Val_size-100_CLOOB_RN50
found cached embeddings for MSCOCO-Val_size-100_CLOOB-LAION400M_ViT-B-16


Found cached dataset diffusiondb (/Users/christina/.cache/huggingface/datasets/poloclub___diffusiondb/2m_first_1k/0.9.1/b3bc1e64570dc7149af62c4bac49ecfbce16b683dd4fee083292fae1afa95f7c)


  0%|          | 0/1 [00:00<?, ?it/s]

100
found cached embeddings for DiffusionDB_size-100_CLIP_RN50
found cached embeddings for DiffusionDB_size-100_CyCLIP_RN50
Loading model from /Users/christina/Workspace/CLIP-explorer/clipexplorer/CLOOB_local/training/model_configs/RN50.json
found cached embeddings for DiffusionDB_size-100_CLOOB_RN50
found cached embeddings for DiffusionDB_size-100_CLOOB-LAION400M_ViT-B-16
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
100
found cached embeddings for MSCOCO-Val_filter-any_dog_CLIP_RN50
found cached embeddings for MSCOCO-Val_filter-any_dog_CyCLIP_RN50
Loading model from /Users/christina/Workspace/CLIP-explorer/clipexplorer/CLOOB_local/training/model_configs/RN50.json
found cached embeddings for MSCOCO-Val_filter-any_dog_CLOOB_RN50
found cached embeddings for MSCOCO-Val_filter-any_dog_CLOOB-LAION400M_ViT-B-16


  self.all_images = np.array(images)
  self.all_images = np.array(images)


100
found cached embeddings for Rotated-0_size-100_CLIP_RN50
found cached embeddings for Rotated-0_size-100_CyCLIP_RN50
Loading model from /Users/christina/Workspace/CLIP-explorer/clipexplorer/CLOOB_local/training/model_configs/RN50.json
found cached embeddings for Rotated-0_size-100_CLOOB_RN50
found cached embeddings for Rotated-0_size-100_CLOOB-LAION400M_ViT-B-16


  self.all_images = np.array(images)
  self.all_images = np.array(images)


100
found cached embeddings for Noisy-0_size-100_CLIP_RN50
found cached embeddings for Noisy-0_size-100_CyCLIP_RN50
Loading model from /Users/christina/Workspace/CLIP-explorer/clipexplorer/CLOOB_local/training/model_configs/RN50.json
found cached embeddings for Noisy-0_size-100_CLOOB_RN50
found cached embeddings for Noisy-0_size-100_CLOOB-LAION400M_ViT-B-16


  self.all_images = np.array(images)
  self.all_images = np.array(images)


100
found cached embeddings for Rotated-1_size-100_CLIP_RN50
found cached embeddings for Rotated-1_size-100_CyCLIP_RN50
Loading model from /Users/christina/Workspace/CLIP-explorer/clipexplorer/CLOOB_local/training/model_configs/RN50.json
found cached embeddings for Rotated-1_size-100_CLOOB_RN50
found cached embeddings for Rotated-1_size-100_CLOOB-LAION400M_ViT-B-16


  self.all_images = np.array(images)
  self.all_images = np.array(images)


100
found cached embeddings for Noisy-1_size-100_CLIP_RN50
found cached embeddings for Noisy-1_size-100_CyCLIP_RN50
Loading model from /Users/christina/Workspace/CLIP-explorer/clipexplorer/CLOOB_local/training/model_configs/RN50.json
found cached embeddings for Noisy-1_size-100_CLOOB_RN50
found cached embeddings for Noisy-1_size-100_CLOOB-LAION400M_ViT-B-16


KeyboardInterrupt: 

In [4]:
# Complete MSCOCO validation split (5000 samples): no batch size limit.
dataset_mscoco_val_large = ce_data.MSCOCO_Val_Dataset(
    path='/Users/christina/Data/mscoco/validation/',  # TODO: update to a relative path
    batch_size=None,
)
(
    mscoco_val_images_large,
    mscoco_val_prompts_large,
    mscoco_val_dataset_large_name,
) = get_data_helper(dataset_mscoco_val_large, filters=[], method=any)
# show the generated dataset name as the cell output
mscoco_val_dataset_large_name

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
5000


'MSCOCO-Val_size-5000'

In [21]:
# create folder structure
export_directory = './exported_data_checkpoints/'

# Make sure the export root exists before creating the dataset folder inside
# it, so this cell also works on a fresh checkout.
create_dir_if_not_exists(export_directory)
dataset_directory = create_dir_if_not_exists(export_directory + mscoco_val_dataset_large_name)


In [22]:
# export loss landscape of 5000 sample dataset
import numpy as np
import torch
from sklearn.decomposition import PCA
import json


# For each model: measure loss and modality distance of the original and the
# gap-closed embeddings, then sweep a shift `delta` along the modality gap and
# record (signed distance, loss) pairs. One JSON file is written per model.
for clip_model in ['CLIP', 'CyCLIP']:

    image_embedding, text_embedding, logit_scale = ce_utils.get_embedding(clip_model, mscoco_val_dataset_large_name, mscoco_val_images_large, mscoco_val_prompts_large)
    # exp() of the logit scale is needed for every loss evaluation; compute it once
    logit_scale_exp = logit_scale.exp()

    # loss difference
    modality_distance = ce_utils.get_modality_distance(image_embedding, text_embedding)
    loss = ce_utils.calculate_val_loss(image_embedding, text_embedding, logit_scale_exp)

    image_embedding_closed, text_embedding_closed = ce_utils.get_closed_modality_gap(image_embedding, text_embedding)
    modified_modality_distance = ce_utils.get_modality_distance(image_embedding_closed, text_embedding_closed)
    modified_loss = ce_utils.calculate_val_loss(image_embedding_closed, text_embedding_closed, logit_scale_exp)

    # float(...) keeps every value JSON-serializable
    loss_landscape = {
        'original_distance': float(modality_distance),
        'original_loss': float(loss),
        'closed_distance': float(modified_modality_distance),
        'closed_loss': float(modified_loss),
        'loss_difference': float(modified_loss-loss),
    }

    # compute loss landscape
    modality_gap = ce_utils.get_modality_gap_normed(image_embedding, text_embedding)

    # The PCA is fitted on the unmodified embeddings only, so it does not depend
    # on delta: fit it once instead of once per sweep step. A fixed seed keeps
    # the result reproducible when a randomized solver is selected.
    pca = PCA(n_components=6, random_state=31415)
    pca.fit(np.concatenate((image_embedding, text_embedding), axis=0))

    distance_lst = []
    loss_lst = []
    for delta in np.arange(-5.0, 5.0, 0.25):
        # shift texts and images in opposite directions along the gap vector,
        # half of delta each, and re-normalize
        modified_text_features = ce_utils.l2_norm(text_embedding) + 0.5 * delta * modality_gap
        modified_text_features = ce_utils.l2_norm(modified_text_features)

        modified_image_features = ce_utils.l2_norm(image_embedding) - 0.5 * delta * modality_gap
        modified_image_features = ce_utils.l2_norm(modified_image_features)

        avg_val_loss = ce_utils.calculate_val_loss(modified_image_features, modified_text_features, logit_scale = logit_scale_exp)

        # presumably a sign (+1/-1) telling on which side of each other the
        # modalities lie -- confirm in ce_utils.get_gap_direction
        gap_direction = ce_utils.get_gap_direction(modified_image_features, modified_text_features, pca)

        loss_lst.append(float(avg_val_loss))

        # Euclidean distance between mass centers
        distance_lst.append(
            float(ce_utils.get_modality_distance(modified_image_features, modified_text_features) * gap_direction)
        )

    loss_landscape['distances'] = distance_lst
    loss_landscape['losses'] = loss_lst

    with open("%s/%s_loss_landscape.json"%(dataset_directory, clip_model), "w") as file:
        json.dump(loss_landscape, file)



found cached embeddings for MSCOCO-Val_size-5000_CLIP_RN50
found cached embeddings for MSCOCO-Val_size-5000_CyCLIP_RN50
