## Load model and data

In [None]:
# Install the third-party packages this notebook imports; -q keeps pip's output short.
!pip install -q datasets
!pip install -q transformers
!pip install -q umap-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset

# Fetch the augmented-caption dataset.
dataset = load_dataset("recastai/coyo-75k-augmented-captions")

# Every 'llm_caption' entry is indexed with [0] here, i.e. only the first
# element of each entry is kept as that row's sentence.
sentences = [captions[0] for captions in dataset['train']['llm_caption']]

Downloading data:   0%|          | 0.00/23.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/337k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/73480 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/935 [00:00<?, ? examples/s]

In [None]:
import pandas as pd
import torch
from transformers import CLIPProcessor, CLIPModel

# Prefer the GPU when torch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained CLIP weights onto the chosen device, plus the matching
# processor used to turn raw text into model inputs.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

## Create and save embeddings

In [None]:
# Sanity check: number of sentences extracted from the train split.
len(sentences)

73480

In [None]:
from tqdm import tqdm
import requests
from PIL import Image, UnidentifiedImageError
from io import BytesIO
import numpy as np

In [None]:
def get_text_embeddings(data, max_rows=1000, timeout=10):
    """Embed the first caption of each row with CLIP's text encoder.

    Only rows whose image URL can be fetched successfully are embedded. The
    HTTP response itself is not used; the request acts purely as a
    reachability filter, so rows with dead image links are dropped.

    Relies on the notebook-level globals ``processor``, ``model`` and
    ``device``.

    Args:
        data: Mapping-like dataset split exposing ``'llm_caption'`` and
            ``'url'`` columns (e.g. ``dataset['train']``). Each caption entry
            is indexed with ``[0]``, so it is assumed to be a non-empty
            sequence of strings.
        max_rows: How many leading rows to attempt. ``None`` attempts every
            row.
        timeout: Seconds to wait for each image request before treating the
            row as unreachable. Without a timeout a single stalled host
            would block the whole loop.

    Returns:
        A tuple ``(text_embeddings, dataframe)``:
        ``text_embeddings`` is a list of CPU tensors, one per successfully
        processed row, in row order; ``dataframe`` is the full url/text
        frame built from ``data`` (all rows, including skipped and
        unattempted ones).
    """
    dataframe = pd.DataFrame({'url': data['url'], 'text': data['llm_caption']})
    # Restrict the work to the requested prefix; the full frame is still returned.
    subset = dataframe if max_rows is None else dataframe.head(max_rows)

    text_embeddings = []
    error_urls = []
    for _, row in tqdm(subset.iterrows(), total=len(subset), desc="Processing Image/Text Pair"):
        image_url = row['url']
        text = row['text'][0]
        try:
            # Reachability check only: the body of the response is never read.
            response = requests.get(image_url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for non-2xx status codes
            inputs = processor(text=text, return_tensors="pt", padding=True, truncation=True).to(device)
            with torch.no_grad():
                batch_feature = model.get_text_features(**inputs)
        except (requests.exceptions.RequestException, UnidentifiedImageError, ValueError):
            # Best effort: remember the failing URL and move on to the next row.
            error_urls.append(image_url)
            continue
        text_embeddings.append(batch_feature.to("cpu"))

    if error_urls:
        # Explains why fewer embeddings than attempted rows come back.
        print(f"Skipped {len(error_urls)} of {len(subset)} rows (image URL unreachable or input rejected)")

    return text_embeddings, dataframe


In [None]:
# Embed captions from the train split; `dataframe` holds the url/text pairs built from that split.
text_embeddings,dataframe=get_text_embeddings(dataset['train'])

Processing Image/Text Pair: 100%|█████████▉| 999/1000 [12:00<00:00,  1.39it/s]


In [None]:
# Stack the per-caption feature tensors along dim 0 into a single 2-D tensor,
# then expose it as a NumPy array for saving and for the reducers below.
# NOTE: this rebinds `text_embeddings` from a list of tensors to one tensor, so
# this cell should be run once per call to get_text_embeddings.
text_embeddings = torch.cat(text_embeddings,dim=0)
text_embeddings_np = text_embeddings.numpy()

In [None]:
# (rows, features): one row per caption that was embedded; rows skipped by
# get_text_embeddings do not appear here.
text_embeddings_np.shape

(879, 512)

In [None]:
# Persist the raw text embeddings; np.save appends ".npy" to the name if it is missing.
np.save("text_embeddings_train1k",text_embeddings_np)

# Dimension Reduction

In [None]:
from sklearn.decomposition import PCA
import umap
from sklearn.manifold import TSNE

def apply_pca(image_embeddings, n_components=3):
    """Project the embeddings onto their first ``n_components`` principal components.

    Args:
        image_embeddings: 2-D array-like of embeddings, one row per sample.
        n_components: Number of output dimensions.

    Returns:
        The result of fitting a fresh ``PCA`` on the input and transforming it.
    """
    return PCA(n_components=n_components).fit_transform(image_embeddings)

def apply_umap(image_embeddings, n_components=3):
    """Reduce the embeddings to ``n_components`` dimensions with UMAP.

    Args:
        image_embeddings: 2-D array-like of embeddings, one row per sample.
        n_components: Number of output dimensions.

    Returns:
        The result of fitting a fresh ``umap.UMAP`` on the input and
        transforming it.
    """
    # Use a distinct local name so the `umap` module imported at the top of
    # this cell is not shadowed (which previously required re-importing it here).
    reducer = umap.UMAP(n_components=n_components)
    return reducer.fit_transform(image_embeddings)

def apply_tsne(image_embeddings, n_components=3):
    """Reduce the embeddings to ``n_components`` dimensions with t-SNE.

    Args:
        image_embeddings: 2-D array-like of embeddings, one row per sample.
        n_components: Number of output dimensions.

    Returns:
        The result of fitting a fresh ``TSNE`` on the input and transforming it.
    """
    return TSNE(n_components=n_components).fit_transform(image_embeddings)

In [None]:
# Notebook configuration: which reducer to run, plus clustering settings.
dim_reduction = "T-SNE"
clustering_algo = "KMeans"
n_cluster = 5

# Hand every reducer the same plain NumPy array, whichever method is selected.
text_embeddings = np.array(text_embeddings)

if dim_reduction == "PCA":
    reduced_embeddings = apply_pca(text_embeddings)
elif dim_reduction == "UMAP":
    reduced_embeddings = apply_umap(text_embeddings)
elif dim_reduction == "T-SNE":
    reduced_embeddings = apply_tsne(text_embeddings)
else:
    # Fail here rather than leaving `reduced_embeddings` undefined or stale.
    raise ValueError(
        f"Unknown dim_reduction {dim_reduction!r}; expected 'PCA', 'UMAP' or 'T-SNE'"
    )

In [None]:
# (rows, n_components): same number of rows as the input embeddings.
reduced_embeddings.shape

(879, 3)

In [None]:
# Persist the reduced embeddings; np.save appends ".npy" to the name if it is missing.
# NOTE: the file name says t-sne regardless of which `dim_reduction` was selected.
np.save("t-sne_text_embeddings",reduced_embeddings)