In [41]:
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv()


True

In [None]:
from llama_index.core import SimpleDirectoryReader

text_directory = r"./kwikly_scraper/output/"
reader = SimpleDirectoryReader(input_dir=text_directory, recursive=True)
documents_list = reader.load_data(show_progress=True)

Loading files: 100%|██████████| 124/124 [00:00<00:00, 425.29file/s]


In [75]:
for doc in docs[:5]:
    print(doc)

Doc ID: 44d91c52-0602-4929-a4c1-971803c52ea5
Text: ---  url: https://joinkwikly.com/5-tips-for-choosing-the-best-
dental-temp-agency/  title: 5 Tips for Choosing the Best Dental Temp
Agency | A Kwikly Guide  date_scraped: Thu, 21 Nov 2024 02:04:34 GMT
---  * Home » 5 Tips for Choosing the Best Dental Temp Agency
Doc ID: 2c7c0751-afd8-486c-9d91-ecae9ef676f3
Text: 5 Tips for Choosing the Best Dental Temp Agency | A Kwikly Guide
When you’re a dental practice manager and get a text notification at 5
a.m., you know what it means before you even look at your phone.
Someone is calling in sick, and suddenly you’re scrambling. Until now,
you had three options:    * Spend the first hour of your workday
calling pa...
Doc ID: 5d1361f7-7a02-4ce3-8802-2fc24b2d0ed1
Text: Kinds of Temp Dental Agencies There are two common categories of
dental temping agencies:    * Traditional dental staffing agencies
* Online temping platforms  Traditional dental staffing agencies are
often locally operated with conn

In [19]:
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import DatasetGenerator

gpt_4o_mini = OpenAI(model='gpt-4o-mini', temperature=0.5,
                     system_prompt="You are a helpful bot that answers questions for dental hygienists looking for temp work.")

In [20]:
data_generator = DatasetGenerator.from_documents(documents=documents_list,
                                                 num_questions_per_chunk=1,
                                                 llm=gpt_4o_mini,
                                                 show_progress=True)

eval_questions = data_generator.generate_questions_from_nodes(num=200)

Parsing nodes: 100%|██████████| 959/959 [00:00<00:00, 2935.39it/s]
  return cls(
100%|██████████| 200/200 [00:06<00:00, 29.72it/s] 
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [21]:
import random

random.sample(eval_questions, k=20)

['Discuss how economic trends and population shifts have influenced the evolution of dental employment in the USA, as mentioned in the article.',
 'What are the potential risks associated with classifying dental hygienists and assistants as independent contractors according to the document?',
 'What are the implications of classifying a temporary dental professional as an independent contractor versus an employee in the context of staffing challenges faced by dental practices?',
 'Who is the featured dental hygienist highlighted in the Kwikly community, and what is her professional designation?',
 'What are some reasons that make a career as a dental hygienist rewarding, according to the provided context?',
 'What new feature was introduced in the Kwikly app on May 21, 2023, and what are its primary functions for dental professionals and offices?',
 'What are some common tax considerations that dental professionals and practices should be aware of, as outlined in the document?',
 'What

In [22]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext

docstore = SimpleDocumentStore()
docstore.add_documents(documents_list)

storage_context = StorageContext.from_defaults(docstore=docstore)

storage_context.persist("./kwikly-docstore.json")

In [28]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Settings.embed_model = HuggingFaceEmbedding(model_name='pingkeest/learning2_model',
#                                             trust_remote_code=True,)

Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

Settings.llm = OpenAI(model='gpt-4o-mini', temperature=0.3,
                      system_prompt=""""You are an intelligent and supportive assistant specifically designed to help dental hygienists find temporary work and dental offices to discover available hygienists to meet their staffing needs. Your role is to provide accurate, helpful, and encouraging responses.

Welcome dental hygienists by understanding their skills and availability, helping them navigate job opportunities that suit their preferences.
Assist dental offices by helping them articulate their staffing needs and guiding them to potential candidates.
Use a friendly, professional tone to facilitate a welcoming environment where both parties feel valued and understood.
Ensure all interactions prioritize clarity, confidentiality, and professionalism.
Encourage users to ask questions and provide all the necessary information to ensure successful matches.
If possible, provide a link to your source so that the user knows you are telling the truth.""")


In [29]:
from llama_index.core import VectorStoreIndex, StorageContext, Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient

# make sure docker container is running before doing this
# run the docker container by doing:
# docker start qdrant_dev

client = QdrantClient(host="localhost", port=6333)

aclient = AsyncQdrantClient(host="localhost", port=6333)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="kwikly_support_agent",
    aclient=aclient,
    enable_hybrid=True
)

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 5007.53it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


In [30]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext, VectorStoreIndex, Settings

docstore = SimpleDocumentStore.from_persist_path(r"kwikly-docstore\docstore.json")
documents_list = [doc_item for doc_item in docstore.docs.values()]
storage_context = StorageContext.from_defaults(docstore=docstore, vector_store=vector_store)

Settings.chunk_size = 512
Settings.chunk_overlap = 64

index = VectorStoreIndex.from_documents(documents=documents_list,
                                        storage_context=storage_context,
                                        show_progress=True
                                        )


Parsing nodes: 100%|██████████| 959/959 [00:00<00:00, 2420.63it/s]
Generating embeddings: 100%|██████████| 989/989 [00:11<00:00, 83.90it/s] 


In [32]:
# Assuming your vector_store is a Qdrant instance
points = vector_store.client.scroll(
    collection_name=vector_store.collection_name,
    limit=10  # adjust this number as needed
)[0]

# Each point should have a 'vector' field containing the embedding
for point in points:
    print(f"ID: {point.id}")
    print(f"Vector (embedding): {point.vector}")  # This should be your ~1500 dimension vector

ID: 006a4217-7b27-4ce3-b46a-e6332f6d44c3
Vector (embedding): None
ID: 00f515a1-d9d9-412b-bb89-bed01838b30b
Vector (embedding): None
ID: 01da11d7-d7d4-4558-acde-5370bf226cb5
Vector (embedding): None
ID: 020e5c8c-b20a-4247-b9a0-80abff46a7db
Vector (embedding): None
ID: 0280698d-fe89-4cbf-8e93-c8097e9411ae
Vector (embedding): None
ID: 02b0299a-2410-496f-85c6-af39da6b1eca
Vector (embedding): None
ID: 02df3ea0-58b1-4c6e-bdd9-0a7ed84e7261
Vector (embedding): None
ID: 02e23e9a-c9b6-41bc-94b7-d8b4369b113a
Vector (embedding): None
ID: 02e7e6e1-dac3-41ba-93df-a782e5568770
Vector (embedding): None
ID: 02ed1665-ffbd-42cc-a589-596c1651f906
Vector (embedding): None


In [39]:
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
query_engine = index.as_query_engine(OpenAI(model='gpt-4o-mini', temperature=0))
response = query_engine.query("What day was 'title: Continuing Dental Education In The USA: Common Questions' scraped?")

In [43]:
nodes = vector_store.aget_nodes(node_ids=['006a4217-7b27-4ce3-b46a-e6332f6d44c3'])

In [52]:
points = vector_store.client.scroll(
    collection_name=vector_store.collection_name,
    limit=1,  # adjust this number as needed
    with_vectors=True,  # Make sure to include this!
    with_payload=True
)[0]

# Let's inspect what we actually have in the point object
for point in points:
    print(f"ID: {point.id}")
    print("Point attributes:", dir(point))  # This will show us all available attributes
    print("Point dict:", point.dict().get('vector').get('text-dense'))  # This will show us the entire structure
    print("---")

ID: 006a4217-7b27-4ce3-b46a-e6332f6d44c3
Point attributes: ['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_validator__', '__reduce__', '__reduce_ex__', '__repr__', 

In [None]:
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
index.storage_context.docstore.docs

{}

In [60]:
# documents_list[0].id_
[doc for doc in documents_list if doc.id_ == '5e06a693-0cf8-439e-858a-7ea84484ddde']

[Document(id_='5e06a693-0cf8-439e-858a-7ea84484ddde', embedding=None, metadata={'file_path': 'c:\\Users\\georg\\Documents\\kwikly-demo\\kwikly_scraper\\output\\resources\\page\\4.md', 'file_name': '4.md', 'file_size': 12426, 'creation_date': '2024-11-20', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text="\n\nFind Temporary Staff Seamlessly With Kwikly\nIf you’ve ever considered using a temporary dental staffing agency to address staffing shortages, you know how much problem-solving potential they have. But can temp agencies without intuitive temp apps really provide the solutions you need? If you've tried one, you may have used a platform with confusing naviga

# Simple setup to check

In [64]:
from qdrant_client import QdrantClient, models

# Initialize the Qdrant client
client = QdrantClient(host="localhost", port=6333)

# Define the embedding dimension
embedding_dimension = 4  # Adjust according to your actual embedding size

# Check if the collection exists; create if it doesn't
collection_name = "kwikly_support_agent"
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=embedding_dimension,
            distance=models.Distance.COSINE
        )
    )

# Prepare the points correctly
points = [
    models.PointStruct(id="test-id1", vector=[0.1, 0.2, 0.3, 0.4])  # Ensure vector dimension matches
]

# Attempt to upload
try:
    client.upload_points(
        collection_name=collection_name,
        points=points
    )
    print("Upload success")
except Exception as e:
    print("Error during upload:", e)



Error during upload: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Format error in JSON body: value test-id1 is not a valid point ID, valid values are either an unsigned integer or a UUID"},"time":0.0}'


Trying PineCone instead

In [66]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
pc = Pinecone()
pc.create_index(
    name="quickstart",
    dimension=1536,
    metric="euclidean",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
pinecone_index = pc.Index("quickstart")

In [72]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from IPython.display import Markdown, display

In [73]:
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents=docs[0:10],
                                        storage_context=storage_context,
                                        show_progress=True)

Parsing nodes: 100%|██████████| 10/10 [00:00<00:00, 1999.00it/s]
Generating embeddings: 100%|██████████| 10/10 [00:15<00:00,  1.56s/it]
Upserted vectors:   0%|          | 0/10 [00:00<?, ?it/s]

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 21 Nov 2024 04:39:11 GMT', 'Content-Type': 'text/plain', 'Content-Length': '80', 'Connection': 'keep-alive', 'server': 'envoy'})
HTTP response body: Unexpected token.
16c669", "values": [NaN, NaN, NaN, NaN, 
                    ^
