In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

import json
import os

In [2]:
# Name of the OpenAI embedding model used to vectorize the documents.
EMBEDDING_MODEL = "text-embedding-3-small"
embedding_function = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [3]:
# Load the scraped records from the scraper's JSON output.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (e.g. cp1252 on Windows), which would mangle or reject
# non-ASCII characters in the scraped text.
with open('./mnemonicscraper/mnemonicscraper/output.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [4]:
# Turn every scraped record into (text to embed, metadata) for retrieval.
# Each record is read for the keys "title", "ingress", "content", "url",
# "category" and "subcategory"; a missing key raises KeyError.
records = [
    (
        f'{entry["title"]} \n\n {entry["ingress"]} \n\n {entry["content"]} \n\n {entry["url"]}',
        {
            "title": entry["title"],
            "source": entry["url"],
            "category": entry["category"],
            "subcategory": entry["subcategory"],
        },
    )
    for entry in data
]

# Split the pairs into two parallel lists (same order as `data`).
page_content = [text for text, _ in records]
metadatas = [meta for _, meta in records]
    

In [5]:
# Wrap each text/metadata pair in a Document, ready for embedding and indexing.
docs = []
for text, meta in zip(page_content, metadatas):
    docs.append(Document(page_content=text, metadata=meta))

In [9]:
# Qdrant endpoint; the API key is read from the QDRANT_MNEMONIC environment
# variable (KeyError if it is not set).
url = "https://63c46998-a66f-476a-92b0-39675fe642cc.us-east4-0.gcp.cloud.qdrant.io:6333"
api_key = os.environ['QDRANT_MNEMONIC']

# Options forwarded to the vector store constructor.
# NOTE(review): force_recreate=True presumably rebuilds the "mnemonic-io"
# collection on every run, discarding what was indexed before - confirm this
# is intended before re-running against a shared cluster.
qdrant_options = dict(
    url=url,
    prefer_grpc=True,
    collection_name="mnemonic-io",
    force_recreate=True,
    api_key=api_key,
)

# Embed `docs` with `embedding_function` and index them in Qdrant.
qdrant = Qdrant.from_documents(docs, embedding_function, **qdrant_options)