Start your qdrant server first (or use the free qdrant SAAS tier).

In [7]:
# Initialize clients


MODEL_ID = "gemini-embedding-001"

import vertexai
from vertexai.language_models import TextEmbeddingModel

vertexai.init(project="focused-beacon-460816-f6", location="us-central1")

model = TextEmbeddingModel.from_pretrained(MODEL_ID)
embeddings = model.get_embeddings(["Hello world! This is a test embedding."])

# Print the embedding object to see its structure
print(f"Type of embeddings: {type(embeddings)}")
print(f"Number of embeddings: {len(embeddings)}")

# Extract the first embedding's values and check its dimensionality
first_embedding = embeddings[0]
embedding_values = first_embedding.values
print(f"Embedding dimension: {len(embedding_values)}")

# Print a sample of the values (first 5 elements)
print(f"First 5 values: {embedding_values[:5]}")




Type of embeddings: <class 'list'>
Number of embeddings: 1
Embedding dimension: 3072
First 5 values: [-0.024923978373408318, 0.014082803390920162, -0.020037217065691948, -0.0819728896021843, 0.011573716066777706]


In [8]:
# Setup a local qdrant DB
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# qdrant = QdrantClient(path="../qdrant")  # Persists changes to disk (no qdrant UI though)

qdrant = QdrantClient("http://localhost:6333") # Connect to existing Qdrant instance

In [9]:
from qdrant_client.models import PointStruct
# Define collection name as a variable
collection_name = "GendergerechteSprache-Gemini-Embedding-001"

# Check if collection exists and create it if it doesn't
if qdrant.collection_exists(collection_name=collection_name):
    qdrant.delete_collection(collection_name=collection_name)
qdrant.create_collection(
    collection_name=collection_name, # Cluster centroids
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE))  # Gemini embedding size is 768

demo_sentences = ["Gesucht: IT-Berater:in",
                  "Gesucht: IT-Berater",
                  "Gesucht: IT-Consultant",
                  "Gesucht: IT-Beraterin",
                  "Gesucht: IT-Berater*in",
                  "Gesucht: IT-Beratende",
                  "Gesucht: Bademeister",
                  "Gesucht: Bademeisterin",
                  "Gesucht: Bademeister*in",
                  "Gesucht: Bademeister:in",
                  "Gesucht: Hufschmied",
                  "Gesucht: Hufschmiedin",
                  "Gesucht: Hufschmied:in",
                  "Gesucht: Hufschmied*in"
                  ] 

# Use Google Vertex AI (Gemini) for embeddings
for i, sentence in enumerate(demo_sentences, start=200):
    # Get embedding using Vertex AI Gemini model
    embeddings = model.get_embeddings([sentence])
    emb = embeddings[0].values
    
    # Add the sentence and its embedding to qdrant
    qdrant.upsert(
    collection_name=collection_name,
        points=[
            PointStruct(
                id=i,
                vector=emb,
                payload={"sentence": sentence}
            )
        ]
    )

qdrant.count(collection_name)

CountResult(count=14)

Take a look at http://localhost:6333/dashboard

Wit a visualisation config such as this one:
```
{
  "limit": 500,
  "algorithm": "UMAP"
}
```

In [10]:
# Query qdrant with the sentence "Gesucht: IT-Berater", print the results and their cosine similarity
# Get embedding for the query sentence using Vertex AI Gemini
query_sentence = "Gesucht: IT-Berater"
embeddings = model.get_embeddings([query_sentence])
query_vector = embeddings[0].values

# Search for similar sentences in the collection
search_results = qdrant.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=10  # Return top 10 results
)

# Print the results with their cosine similarity scores
print(f"Query: '{query_sentence}'\n")
print("Results (ordered by similarity):")
print("-" * 50)
for result in search_results:
    # Cosine similarity score (higher is more similar)
    similarity = 1 - result.score  # Converting from distance to similarity
    percentage = similarity * 100
    
    print(f"• {result.payload['sentence']}")
    print(f"  Similarity: {similarity:.4f} ({percentage:.2f}%)")
    print()

Query: 'Gesucht: IT-Berater'

Results (ordered by similarity):
--------------------------------------------------
• Gesucht: IT-Berater
  Similarity: 0.0000 (0.00%)

• Gesucht: IT-Berater:in
  Similarity: 0.0386 (3.86%)

• Gesucht: IT-Berater*in
  Similarity: 0.0392 (3.92%)

• Gesucht: IT-Beratende
  Similarity: 0.0482 (4.82%)

• Gesucht: IT-Beraterin
  Similarity: 0.0503 (5.03%)

• Gesucht: IT-Consultant
  Similarity: 0.1068 (10.68%)

• Gesucht: Bademeister
  Similarity: 0.3134 (31.34%)

• Gesucht: Bademeister:in
  Similarity: 0.3151 (31.51%)

• Gesucht: Bademeister*in
  Similarity: 0.3191 (31.91%)

• Gesucht: Hufschmied:in
  Similarity: 0.3229 (32.29%)

