# Prep model and environment

In [None]:
# Mount Google Drive at /content/drive; later cells read from and write to
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [1]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m645.8 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.3
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-of0m9ch1
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-of0m9ch1
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369497 sha256=16cca2fc95f494a94bc1da887ea3d2d0a5a3c3f0304c3f31bcd40efe69cb6b3a
  Stored in directory: /tmp/pip-ephem-wheel-cache-5ox5snor/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac1

In [2]:
import numpy as np
import torch
import clip
import os
import glob
import pickle
import random
import concurrent.futures
from tqdm import tqdm
from pkg_resources import packaging
from PIL import Image

#### Requirements checking

In [5]:
# Snapshot every package installed in this runtime into all_packages.txt.
!pip freeze > all_packages.txt

In [7]:
import subprocess

# Packages whose installed versions get recorded in requirements.txt
my_packages = [
    'ftfy',
    'regex',
    'tqdm',
    'numpy',
    'torch',
    'Pillow',
    'setuptools',
]

# Function to get installed packages with versions
def get_installed_packages():
    """Return a ``{package_name: version}`` dict parsed from ``pip freeze``.

    Only ``name==version`` lines are included. Other line shapes that
    ``pip freeze`` can emit -- direct references such as
    ``clip @ git+https://...``, editable installs (``-e ...``), comment lines
    and blank lines -- carry no usable ``==`` pin and are skipped, instead of
    making ``dict()`` raise ``ValueError``.
    """
    result = subprocess.run(['pip', 'freeze'], stdout=subprocess.PIPE)
    freeze_output = result.stdout.decode('utf-8')

    packages = {}
    for raw_line in freeze_output.splitlines():
        line = raw_line.strip()
        # Comment lines may mention "name==version" in passing; never parse them.
        if not line or line.startswith(('#', '-')):
            continue
        name, separator, version = line.partition('==')
        # partition() yields an empty separator when '==' is absent
        if separator and name and version:
            packages[name] = version
    return packages

# Snapshot of the pinned packages in the current environment
installed_packages = get_installed_packages()

# Keep only the packages listed in my_packages
filtered_packages = {}
for name, version in installed_packages.items():
    if name in my_packages:
        filtered_packages[name] = version

# Load the pins already recorded in requirements.txt, if the file exists
existing_requirements = {}
try:
    with open('requirements.txt', 'r') as existing_file:
        for raw_line in existing_file:
            if '==' not in raw_line:
                continue
            name, version = raw_line.strip().split('==')
            existing_requirements[name] = version
except FileNotFoundError:
    print("requirements.txt not found, creating a new one.")

# Previously recorded pins come first; freshly detected versions override them
updated_requirements = dict(existing_requirements)
updated_requirements.update(filtered_packages)

# Rewrite requirements.txt with the merged pins, one "name==version" per line
with open('requirements.txt', 'w') as req_file:
    req_file.writelines(
        f'{name}=={version}\n' for name, version in updated_requirements.items()
    )




### Load model



In [None]:
print("Torch version:", torch.__version__)

# Use the GPU when one is available and fall back to the CPU otherwise;
# an unconditional model.cuda() raises on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()  # inference only
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar parameters across all model tensors
print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)


In [None]:
# CLIP comes in several model variants; list the available names.
clip.available_models()

In [None]:
# Display the preprocessing object returned by clip.load above.
preprocess

# Text Embedding

## Example: tokenizing text

In [None]:
  # Tokenize a short sample string and display the result.
  clip.tokenize("ab")

In [None]:
# Read the word list from Drive, stripping surrounding whitespace from each line
with open('/content/drive/MyDrive/MANIFOLD NETS/sowpods.txt', 'r') as word_file:
    words = list(map(str.strip, word_file))

# Spot-check two entries by index
first_sample, second_sample = words[77764], words[84712]
print(first_sample, second_sample)

## Tokenize and embed Scrabble Text

In [None]:
# Pick the compute device, then load CLIP and its preprocessing onto that device
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def tokenize_words(file_path):
    """Read ``file_path`` and return its lines with surrounding whitespace stripped.

    Despite the name, no CLIP tokenization happens here: the result is a plain
    list of strings, one per line of the file (blank lines become ``''``).
    A tqdm progress bar is shown while the file is read.
    """
    stripped_lines = []
    with open(file_path, 'r') as handle:
        for raw_line in tqdm(handle, desc="Tokenizing words"):
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Path to the word list on Google Drive; change it to point at your own file
file_path = '/content/drive/MyDrive/AI DOCKER/sowpods.txt'
# NOTE: despite the names, this holds the stripped lines as plain strings;
# clip.tokenize is only applied later, batch by batch.
text_as_tokens = tokenize_words(file_path)

In [None]:
# Embed the word list in fixed-size batches
batch_size = 10000
text_features = []

# Inference only: gradient tracking stays off for the whole loop
with torch.no_grad():
    for i in tqdm(range(0, len(text_as_tokens), batch_size), desc="Processing"):
        batch_texts = text_as_tokens[i:i + batch_size]
        text_tokens = clip.tokenize(batch_texts).to(device)
        batch_features = model.encode_text(text_tokens).float()
        text_features.append(batch_features)

# Stack the per-batch results into a single tensor along the first dimension
text_features = torch.cat(text_features, dim=0)

### Saving tokens and embeddings as .pkl files for later use

In [None]:
# Destination files on Google Drive. A hyphen is not allowed in a Python
# identifier (`file_path-1 = ...` is a SyntaxError), so use descriptive names.
text_tokens_path = '/content/drive/MyDrive/MANIFOLD NETS/CLIP/text_as_tokens.pkl'
text_features_path = '/content/drive/MyDrive/MANIFOLD NETS/CLIP/text_features.pkl'

# Both files share one folder; make sure it exists before writing
os.makedirs(os.path.dirname(text_tokens_path), exist_ok=True)

with open(text_tokens_path, 'wb') as file:
    pickle.dump(text_as_tokens, file)
with open(text_features_path, 'wb') as file:
    pickle.dump(text_features, file)

# Image Embedding

Reads the images stored on Google Drive, embeds them in batches, and saves two pickles back to Drive: the resulting embedding tensor, and an index, in the same order as the embeddings, of the source folder and image path for each image.

In [None]:
# Pick the compute device, then load CLIP and its image preprocessing onto it
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Function to read and preprocess a single image
def read_and_preprocess_image(image_file):
    """Load one image and turn it into a single-item batch on ``device``.

    Args:
        image_file: Path of the image to open.

    Returns:
        The output of ``preprocess`` with a leading batch dimension added and
        moved to ``device``, or ``None`` if the file could not be opened or
        preprocessed (the error is printed, not raised).
    """
    try:
        # Open through a context manager so the file handle is released as soon
        # as preprocessing is done; this function runs in worker threads and
        # would otherwise keep handles open until garbage collection.
        with Image.open(image_file) as pil_image:
            image = preprocess(pil_image).unsqueeze(0).to(device)
        return image
    except Exception as e:
        # Best effort: report the failure and let the caller skip this file
        print(f"Error processing {image_file}: {e}")
        return None

# Function to batch process images using threading
def process_images(folder_paths, max_images_per_folder=5):
    """Embed up to ``max_images_per_folder`` files from each folder with CLIP.

    Args:
        folder_paths: Iterable of folder paths to read images from.
        max_images_per_folder: Cap on the number of files taken from each
            folder (``None`` takes them all). Defaults to 5.

    Returns:
        ``(image_features, source_info)``. ``image_features`` is the float
        output of ``model.encode_image`` for the images that loaded;
        ``source_info`` is a list of ``(folder_path, image_file)`` tuples with
        one entry per image passed to the model, in the same order. When
        nothing could be loaded, returns ``(torch.Tensor(), [])``.
    """
    candidates = []  # (folder_path, image_file) for every file we will try to load
    for folder_path in folder_paths:
        # glob order depends on the filesystem; sort so the selection is reproducible
        folder_image_files = sorted(glob.glob(os.path.join(folder_path, '*')))
        candidates.extend(
            (folder_path, image_file)
            for image_file in folder_image_files[:max_images_per_folder]
        )

    images = []
    source_info = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        image_files = [image_file for _, image_file in candidates]
        # executor.map yields results in input order, so zip pairs each result
        # with the file it came from.
        loaded = executor.map(read_and_preprocess_image, image_files)
        for source, image in zip(candidates, loaded):
            # Record the source only for images that actually loaded; otherwise
            # the index would drift out of step with the embeddings.
            if image is not None:
                images.append(image)
                source_info.append(source)

    if not images:
        return torch.Tensor(), []

    batch = torch.cat(images, dim=0)
    with torch.no_grad():
        image_features = model.encode_image(batch).float()
    return image_features, source_info

# Function to save all embeddings to a single .pkl file
def save_all_embeddings_to_pkl(embeddings, save_dir, filename):
    """Pickle ``embeddings`` to ``save_dir/filename``.

    ``save_dir`` is created if it does not exist yet; an existing file with
    the same name is overwritten.
    """
    os.makedirs(save_dir, exist_ok=True)
    target_path = os.path.join(save_dir, filename)
    with open(target_path, 'wb') as sink:
        sink.write(pickle.dumps(embeddings))

# Path to the main directory and the directory to save embeddings.
# Absolute paths under the Drive mount point, like the other cells that read
# from Drive, so this cell does not depend on the current working directory.
main_dir = "/content/drive/MyDrive/MANIFOLD NETS/imagenet_sample"
save_dir = "/content/drive/MyDrive/MANIFOLD NETS/image_embeddings"

# Get all subfolders in the main directory. os.listdir returns names in
# arbitrary order, so sort them to keep the batches (and the saved index)
# reproducible between runs.
subfolders = sorted(
    os.path.join(main_dir, f)
    for f in os.listdir(main_dir)
    if os.path.isdir(os.path.join(main_dir, f))
)

# Folders are handed to process_images a few at a time
folders_per_batch = 5
subfolder_batches = [
    subfolders[i:i + folders_per_batch]
    for i in range(0, len(subfolders), folders_per_batch)
]



### Embedding loop

In [None]:
# Accumulators: one feature tensor per folder batch, plus a running list of the
# (folder_path, image_file) entries returned alongside them
all_image_features = []
image_source_index = []

for folder_batch in tqdm(subfolder_batches, desc="Processing Folders"):
    batch_features, source_info = process_images(folder_batch)
    # Batches that produced no features at all contribute nothing
    if batch_features.nelement() != 0:
        all_image_features.append(batch_features)
        image_source_index += source_info

# Join the per-batch tensors, then persist the embeddings and the source index
concatenated_features = torch.cat(all_image_features, dim=0)
for payload, pkl_name in (
    (concatenated_features, 'all_image_embeddings.pkl'),
    (image_source_index, 'image_source_index.pkl'),
):
    save_all_embeddings_to_pkl(payload, save_dir, pkl_name)

## Testing whether the saved embeddings match their index entries

In [None]:
# Index test 2: CLIP embeddings are not exact, so the comparison further down
# uses a tolerance rather than strict equality.

# Files written by the embedding loop. They are produced by this notebook;
# only unpickle files you trust.
embeddings_pkl = 'drive/MyDrive/MANIFOLD NETS/image_embeddings/all_image_embeddings.pkl'
source_index_pkl = 'drive/MyDrive/MANIFOLD NETS/image_embeddings/image_source_index.pkl'

with open(embeddings_pkl, 'rb') as f:
    combined_embeddings = pickle.load(f)

with open(source_index_pkl, 'rb') as f:
    image_source_index = pickle.load(f)

# Function to preprocess and embed an image using CLIP
def embed_image(image_path, preprocess, model, device):
    """Embed a single image file with the given CLIP model.

    Args:
        image_path: Path of the image to open.
        preprocess: Callable applied to the opened image before encoding.
        model: Object whose ``encode_image`` produces the embedding.
        device: Device the preprocessed image is moved to before encoding.

    Returns:
        ``model.encode_image(...)`` of the one-image batch, cast to float.
        The leading batch dimension (size 1) is kept.
    """
    # Close the file handle as soon as preprocessing is done
    with Image.open(image_path) as pil_image:
        image = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image).float()
    return image_features

# Function to compare embeddings with a custom threshold
def compare_embeddings(original_embedding, test_embedding, threshold=1e-3):
    """Report whether two embeddings agree element-wise within a tolerance.

    ``threshold`` is passed to ``torch.isclose`` as ``atol``; every other
    ``torch.isclose`` argument keeps its default.

    Returns:
        A Python ``bool``: ``True`` only if every element is close.
    """
    elementwise_close = torch.isclose(original_embedding, test_embedding, atol=threshold)
    return torch.all(elementwise_close).item()

# Load the CLIP model before calling eval(), so that eval() applies to the
# model that is actually used for the test embeddings below.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# Ensure model is in evaluation mode
model.eval()

# Indices to test: choose a few random indices
num_tests = 5  # Number of tests to perform
# random.sample raises ValueError when asked for more items than exist
num_tests = min(num_tests, len(combined_embeddings))
indices_to_test = random.sample(range(len(combined_embeddings)), num_tests)

# Testing
match_threshold = 0.1  # absolute tolerance handed to compare_embeddings
results = []
for index in indices_to_test:
    image_path = image_source_index[index][1]  # Get the full image path
    test_embedding = embed_image(image_path, preprocess, model, device)
    # The stored tensor may live on a different device than the fresh one
    original_embedding = combined_embeddings[index].to(device)
    match = compare_embeddings(original_embedding, test_embedding, threshold=match_threshold)
    results.append((index, image_path, original_embedding.cpu().numpy(), test_embedding.cpu().numpy(), match))

# Additional diagnostic information
for index, image_path, original_values, test_values, match in results:
    print(f"Index: {index}, Image Path: {image_path}")
    print("Original Embedding (First 5 values):", original_values[:5].tolist())
    # test_values keeps the leading batch dimension added by embed_image, hence [0]
    print("Test Embedding (First 5 values):", test_values[0][:5].tolist())
    print("Match:", match, "\n")
