### imports

In [33]:
import os
import pandas as pd
from transformers import ViTImageProcessor, ViTModel, ViTImageProcessorFast
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.utils.data_loaders import ImageLoader

### Workbook demonstrating how to load image data into a ChromaDB collection

## Get the sample data

In [37]:
# Read the sample metadata CSV into a DataFrame.
sample_data = pd.read_csv(
    filepath_or_buffer='/Users/shogun/code/gwen-m97/inspiart/raw_data/data_sampling1000_topstyles10.csv',
)

In [None]:
# Show the loaded sample metadata as the cell output.
sample_data

Unnamed: 0,artist,style,movement,tags,url,img,file_name,genre_list,Artwork,Date
0,Panayiotis Tetsis,Impressionism,Post-Impressionism,,https://www.wikiart.org/en/panayiotis-tetsis/s...,https://uploads3.wikiart.org/images/panayiotis...,127519-sunset.jpg,['cloudscape'],,
1,Henry Scott Tuke,Impressionism,Impressionism,"['Fun', 'Vacation', 'summer']",https://www.wikiart.org/en/henry-scott-tuke/th...,https://uploads5.wikiart.org/00261/images/henr...,100529-henry-scott-tuke-the-bathers-1922.jpg,['genre painting'],,
2,Alfred Sisley,Impressionism,Impressionism,"['fires-and-floods', 'forests-and-trees', 'Ban...",https://www.wikiart.org/en/alfred-sisley/the-f...,https://uploads1.wikiart.org/images/alfred-sis...,92312-the-flood-on-the-road-to-saint-germain-1...,['landscape'],,
3,Camille Pissarro,Impressionism,Impressionism,"['Grassland', 'Meadow', 'Pasture', 'Plain']",https://www.wikiart.org/en/camille-pissarro/ro...,https://uploads2.wikiart.org/images/camille-pi...,90106-rolling-landscape-in-winter-1875.jpg,['landscape'],,
4,Guy Rose,Impressionism,Impressionism,"['seas-and-oceans', 'cliffs-and-rocks', 'Bank'...",https://www.wikiart.org/en/guy-rose/grey-after...,https://uploads3.wikiart.org/images/guy-rose/g...,104411-grey-afternoon.jpg,['landscape'],,
...,...,...,...,...,...,...,...,...,...,...
995,Le Corbusier,Cubism,Cubism,,https://www.wikiart.org/en/le-corbusier/nature...,https://uploads8.wikiart.org/images/le-corbusi...,155755-nature-morte-v-zelay-1939.jpg,['still life'],Nature morte Vézelay,1939
996,André Lhote,Cubism,Cubism,,https://www.wikiart.org/en/andr-lhote/nature-m...,https://uploads0.wikiart.org/images/andr-lhote...,155221-nature-morte-au-chinois-1930.jpg,['still life'],Nature Morte au Chinois,1930
997,Auguste Herbin,Cubism,Post-Impressionism,,https://www.wikiart.org/en/auguste-herbin/dess...,https://uploads6.wikiart.org/images/auguste-he...,124673-dessert-1913.jpg,['still life'],Dessert,1913
998,Arshile Gorky,Cubism,Surrealism,,https://www.wikiart.org/en/arshile-gorky/image...,https://uploads0.wikiart.org/images/arshile-go...,184386-image-in-khorkom.jpg,['abstract'],Image in Khorkom,1934-1936


In [34]:
# Column labels of the sample metadata table.
meta_data = sample_data.columns

In [10]:
# Count the missing values in each column of the sample metadata.
sample_data.isna().sum()

artist          0
style           0
movement        0
tags          275
url             0
img             0
file_name       0
genre_list      0
Artwork       873
Date          873
dtype: int64

In [35]:
class GoogleVITHuge224Embedding(EmbeddingFunction):

    '''
    Custom ChromaDB embedding function that embeds images with the
    Google vit-huge-patch14-224-in21k Vision Transformer.

    The image processor and the model are loaded lazily on the first call
    and cached on the instance, so the pretrained weights are not reloaded
    every time ChromaDB asks for embeddings.
    '''

    # Hugging Face checkpoint used for both the image processor and the model
    MODEL_NAME = 'google/vit-huge-patch14-224-in21k'

    def __call__(self, input: Documents) -> Embeddings:
        '''
        Embed a batch of images.

        Args:
            input: the batch of images handed over by ChromaDB
                (presumably the arrays produced by the collection's
                data loader - TODO confirm against the ChromaDB version in use).

        Returns:
            One embedding per input image, as a list of lists of floats.
            Each embedding is the first token vector of the model's
            last hidden state.
        '''

        # Imported locally because the notebook's import cell does not import torch
        import torch

        # Nothing to embed: avoid calling the processor with an empty batch
        if len(input) == 0:
            return []

        # Load the processor and the pretrained weights only once per instance
        if getattr(self, 'feature_extractor', None) is None:
            self.feature_extractor = ViTImageProcessorFast.from_pretrained(self.MODEL_NAME)

        if getattr(self, 'model', None) is None:
            self.model = ViTModel.from_pretrained(self.MODEL_NAME)

        # Preprocess the images into the tensor format expected by the model
        inputs = self.feature_extractor(images=input, return_tensors="pt")

        # Embed the data; gradients are not needed for inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Take the first token vector of the last hidden state for EVERY image
        # in the batch (the previous [0, 0] indexing kept only the first image)
        embeddings = outputs.last_hidden_state[:, 0].numpy()

        # Return plain Python lists, one embedding per input image
        return embeddings.tolist()

In [36]:
# Folder holding the .jpg files whose names are matched against sample_data['file_name']
image_folder = '/Users/shogun/code/gwen-m97/inspiart/raw_data/sample1000'

# Sorted so the insertion order is reproducible (os.listdir order is arbitrary)
images = sorted(img for img in os.listdir(image_folder) if img.endswith('.jpg'))

image_loader = ImageLoader()

image_embbeding_function = GoogleVITHuge224Embedding()

# NOTE(review): '/models/...' is an absolute path at the filesystem root. The
# recorded "Could not connect to tenant default_tenant" ValueError is presumably
# raised here because that location cannot be created/written - TODO confirm and
# point this at a writable project directory.
chroma_client = chromadb.PersistentClient(path='/models/google_vit_sample1000_db')

images_db = chroma_client.get_or_create_collection(
    name="google_vit_sample1000_collection",
    embedding_function=image_embbeding_function,
    data_loader=image_loader,
)

# Columns of sample_data copied into each record's metadata
metadata_columns = ['artist', 'style', 'movement', 'url', 'img', 'file_name', 'genre_list']

print("START")

for image in images:

    image_path = os.path.join(image_folder, image)

    print(image_path)

    # Look up the metadata row of this file with a boolean mask; the previous
    # query string contained a literal "{image}" and never matched on the value
    meta_data_rows = sample_data.loc[sample_data['file_name'] == image]

    if meta_data_rows.empty:
        # Do not store an image without its metadata; report it and move on
        print(f"SKIPPED (no metadata row): {image}")
        continue

    meta_data_row = meta_data_rows.iloc[0]

    # One flat dict of scalar values per record (previously the values were
    # whole pandas Series and the dict was wrapped in a list twice)
    metadata = {'image_path': image_path}
    metadata.update({column: str(meta_data_row[column]) for column in metadata_columns})

    # The image itself is loaded from its uri by the collection's data loader,
    # so it does not need to be opened here
    images_db.add(
        ids=[image],
        uris=[image_path],
        metadatas=[metadata]
    )

print("FINISH")

  image_embbeding_function = GoogleVITHuge224Embedding()


ValueError: Could not connect to tenant default_tenant. Are you sure it exists?