Start your Qdrant server first (or use the free tier of the Qdrant SaaS offering).

In [2]:
# Client setup: one OpenAI client for embeddings, one Qdrant client for storage.
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Constructed with default arguments; no configuration is passed explicitly here.
openAIclient = OpenAI()

# Alternative: local on-disk storage. Persists changes to disk (no qdrant UI though).
# qdrant = QdrantClient(path="../qdrant")

# Connect to a Qdrant instance that is already running on this machine.
qdrant = QdrantClient(location="http://localhost:6333")


In [None]:
from qdrant_client.models import PointStruct

# Tunables for this cell, kept in one place instead of being repeated inline.
COLLECTION_NAME = "GendergerechteSprache"
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536  # must match the vector length produced by EMBEDDING_MODEL
FIRST_POINT_ID = 200  # ids are assigned sequentially starting here

# Rebuild the collection from scratch: drop it if it already exists, then
# create it again, so that re-running this cell always yields the same content.
if qdrant.collection_exists(collection_name=COLLECTION_NAME):
    qdrant.delete_collection(collection_name=COLLECTION_NAME)
qdrant.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
)

demo_sentences = ["Gesucht: IT-Berater:in",
                  "Gesucht: IT-Berater",
                  "Gesucht: IT-Consultant",
                  "Gesucht: IT-Beraterin",
                  "Gesucht: IT-Berater*in",
                  "Gesucht: IT-Beratende",
                  "Gesucht: Bademeister",
                  "Gesucht: Bademeisterin",
                  "Gesucht: Bademeister*in",
                  "Gesucht: Bademeister:in",
                  "Gesucht: Hufschmied",
                  "Gesucht: Hufschmiedin",
                  "Gesucht: Hufschmied:in",
                  "Gesucht: Hufschmied*in"
                  ]

# Here, we assume that the OpenAI API key is set in the environment variable OPENAI_API_KEY

# Embed all sentences in a single request instead of one request per sentence.
response = openAIclient.embeddings.create(
    model=EMBEDDING_MODEL,
    input=demo_sentences,
    encoding_format="float",
)
# Each returned item carries the index of the input it belongs to; sort on it
# so that embeddings[k] is guaranteed to correspond to demo_sentences[k].
embeddings = [item.embedding for item in sorted(response.data, key=lambda item: item.index)]
assert len(embeddings) == len(demo_sentences), "expected one embedding per sentence"

# Store every sentence together with its embedding in one upsert call.
qdrant.upsert(
    collection_name=COLLECTION_NAME,
    points=[
        PointStruct(
            id=point_id,
            vector=embedding,
            payload={"sentence": sentence},
        )
        for point_id, (sentence, embedding) in enumerate(
            zip(demo_sentences, embeddings), start=FIRST_POINT_ID
        )
    ],
)

# Last expression of the cell: shows how many points the collection now holds.
qdrant.count(COLLECTION_NAME)

Take a look at http://localhost:6333/dashboard

With a visualisation config such as this one:
```
{
  "limit": 500,
  "algorithm": "UMAP"
}
```
Example: IT-Berater <- 0.962 -> IT-Berater:in (the number is the cosine similarity between the two embeddings)

In [1]:
# Query qdrant with the sentence "Gesucht: IT-Berater", print the results and their cosine similarity
# Get embedding for the query sentence (same model as used when filling the collection)
query_sentence = "Gesucht: IT-Berater"
response = openAIclient.embeddings.create(
    model="text-embedding-3-small",
    input=query_sentence,
    encoding_format="float"
)
query_vector = response.data[0].embedding

# Search for similar sentences in the collection
# NOTE(review): newer qdrant-client releases deprecate `search` in favour of
# `query_points` - confirm against the installed client version.
search_results = qdrant.search(
    collection_name="GendergerechteSprache",
    query_vector=query_vector,
    limit=10  # Return top 10 results
)

# Print the results with their cosine similarity scores
print(f"Query: '{query_sentence}'\n")
print("Results (ordered by similarity):")
print("-" * 50)
for result in search_results:
    # The collection was created with Distance.COSINE, for which the score
    # reported by Qdrant already IS the cosine similarity (higher = more
    # similar). It must not be turned into `1 - score`; that would make the
    # best match look like the worst one.
    similarity = result.score
    percentage = similarity * 100

    print(f"• {result.payload['sentence']}")
    print(f"  Similarity: {similarity:.4f} ({percentage:.2f}%)")
    print()

NameError: name 'openAIclient' is not defined