In [1]:
from pymongo import MongoClient
from pathlib import Path
import pandas as pd
import numpy as np
import textwrap

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain.docstore.document import Document

from bson import ObjectId

---

## Connect to the MongoDB collections

In [None]:
# One MongoClient per connection string, shared by every set_up_mongo call.
# A PyMongo client owns its own connection pool and is meant to be created
# once and reused rather than re-created for every collection lookup.
_MONGO_CLIENTS = {}


def set_up_mongo(client_str, database_str, collection_str):
    """Return a handle to a MongoDB collection, creating it if it is missing.

    Args:
        client_str: Connection string passed to ``MongoClient``.
        database_str: Name of the database that holds the collection.
        collection_str: Name of the collection to open.

    Returns:
        The collection object ``client[database_str][collection_str]``.
    """
    client = _MONGO_CLIENTS.get(client_str)
    if client is None:
        client = _MONGO_CLIENTS[client_str] = MongoClient(client_str)

    db = client[database_str]

    # Create the collection explicitly so it exists even before the first insert.
    if collection_str not in db.list_collection_names():
        db.create_collection(collection_str)

    return db[collection_str]

## Function for creating documents and inserting them into the vector store

In [None]:
def create_doc(list_of_data, list_of_keys, vdb, batch_size=1000):
    """Turn MongoDB records into LangChain documents and add them to a vector store.

    For every record the values of ``list_of_keys`` are joined (in that order,
    separated by a single space) into the document's ``page_content``. The
    record's ``_id`` is stored as a string under ``metadata["mongo_id"]``.

    Fields that are missing or ``None`` are left out of the text, and records
    whose text ends up empty are not added at all, since they carry nothing
    to search for.

    Args:
        list_of_data: Iterable of dict-like records; each must contain ``"_id"``.
        list_of_keys: Names of the fields that make up the searchable text.
        vdb: Vector store exposing ``add_documents(documents=[...])``.
        batch_size: Maximum number of documents handed to one
            ``add_documents`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a record has no ``"_id"`` field.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    batch = []
    for record in list_of_data:
        mongo_id = str(record["_id"])

        # Skip absent values so the literal text "None" is never embedded, and
        # separate fields with a space so neighbouring values do not fuse.
        values = (record.get(key) for key in list_of_keys)
        text = " ".join(str(value) for value in values if value is not None)
        if not text.strip():
            continue

        batch.append(Document(page_content=text, metadata={"mongo_id": mongo_id}))

        # Insert in chunks instead of issuing one add_documents call per record.
        if len(batch) >= batch_size:
            vdb.add_documents(documents=batch)
            batch = []

    if batch:
        vdb.add_documents(documents=batch)

## create persistent vector store

In [None]:
# Embedding model used for every document and every query.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# Chroma collection "nlp_search", persisted in the ./nlp_search directory.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory="./nlp_search",
    collection_name="nlp_search",
)

---

## insert all documents now

In [None]:
# Embed `name` and `amenity` of every document in the `osm_pois` collection.
c1 = set_up_mongo('mongodb://localhost:27017', 'webscraping_dataLabKiel', 'osm_pois')
text_type = ["name", "amenity"]
# Fetch only `_id` plus the text fields; the projection is derived from text_type.
data = list(c1.find({}, {"_id": 1, **dict.fromkeys(text_type, 1)}))
create_doc(data, text_type, vectordb)

In [None]:
# Embed `name`, `keywords` and `description` of every document in `yp_kiel`.
c2 = set_up_mongo('mongodb://localhost:27017', 'sh_data_collection', 'yp_kiel')
text_type = ["name", "keywords", "description"]
# Fetch only `_id` plus the text fields; the projection is derived from text_type.
data = list(c2.find({}, {"_id": 1, **dict.fromkeys(text_type, 1)}))
create_doc(data, text_type, vectordb)

In [None]:
# Embed `Stadt`, `Landkreis`, `Bundesland` and `avg_rent` of every document in `avg_rent`.
c3 = set_up_mongo('mongodb://localhost:27017', 'webscraping_dataLabKiel', 'avg_rent')
text_type = ["Stadt", "Landkreis", "Bundesland", "avg_rent"]
# Fetch only `_id` plus the text fields; the projection is derived from text_type.
data = list(c3.find({}, {"_id": 1, **dict.fromkeys(text_type, 1)}))
create_doc(data, text_type, vectordb)

In [None]:
# Embed `title`, `categories` and `texts` of every document in `event_data`.
c4 = set_up_mongo('mongodb://localhost:27017', 'webscraping_dataLabKiel', 'event_data')
text_type = ["title", "categories", "texts"]
# Fetch only `_id` plus the text fields; the projection is derived from text_type.
data = list(c4.find({}, {"_id": 1, **dict.fromkeys(text_type, 1)}))
create_doc(data, text_type, vectordb)

In [None]:
# Embed `name` of every document in the `digitized_planet_v2` collection.
c5 = set_up_mongo('mongodb://localhost:27017', 'webscraping_dataLabKiel', 'digitized_planet_v2')
text_type = ["name"]
# Fetch only `_id` plus the text fields; the projection is derived from text_type.
data = list(c5.find({}, {"_id": 1, **dict.fromkeys(text_type, 1)}))
create_doc(data, text_type, vectordb)

---

In [None]:
# Sanity check after the inserts: number of entries in the underlying Chroma
# collection (read through the wrapper's private `_collection` attribute).
vectordb._collection.count()

## Connect to the persistent vector store and retrieve

In [7]:
# Same embedding model that was used when the documents were inserted.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# Re-open the "nlp_search" collection from its on-disk directory ./nlp_search.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory="./nlp_search",
    collection_name="nlp_search",
)

In [8]:
# Confirm the re-opened store still holds the persisted entries
# (read through the wrapper's private `_collection` attribute).
vectordb._collection.count()

99210

In [9]:
# Retriever view of the store; every query asks for k=20 results.
retriever = vectordb.as_retriever(search_kwargs={"k": 20})

In [10]:
# Run one German free-text query through the retriever and keep the hits.
# NOTE(review): the saved output of this cell is
#   AttributeError: 'PersistentData' object has no attribute 'max_seq_id'
# presumably the on-disk index in ./nlp_search was written by a different
# chromadb version than the one loaded here — TODO confirm, and rebuild the
# store with the installed version if so.
results = retriever.invoke("Ich will mir morgen die haare schneiden lassen welche friseure haben auf")

AttributeError: 'PersistentData' object has no attribute 'max_seq_id'

In [None]:
# Display the retrieved hits.
results

In [None]:
# Entry count of the underlying Chroma collection, checked again after querying.
vectordb._collection.count()

In [None]:
# Turn the string ids stored in each hit's metadata back into ObjectIds.
# The metadata holds only "mongo_id", so the ids alone do not tell which
# MongoDB collection a hit came from.
batch_ids = [ObjectId(hit.metadata["mongo_id"]) for hit in results]

In [None]:
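# TODO: look up the full MongoDB documents for the hits. `mongodbcollection` is
# not defined in the cells above; replace it with the collection to query.
#matches = [e for e in mongodbcollection.find({"_id":{"$in": batch_ids}})]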