In [None]:
import os
from dotenv import load_dotenv
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import (
    StorageContext,
    VectorStoreIndex,
    Settings,
    Document
)
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
from dataclasses import dataclass

In [None]:
# Load variables from a local .env file into the environment so the
# os.getenv(...) lookups for the API keys in later cells can find them
load_dotenv()

In [None]:
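# Alternative embedding backend (currently disabled): Cohere embed-english-v3.0.
# To embed with Cohere instead, uncomment this cell and skip the OpenAI cell below.
# Settings.embed_model = CohereEmbedding(
#         cohere_api_key=os.getenv("COHERE_API_KEY"),
#         model_name="embed-english-v3.0",
#         input_type="search_document",
#     )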

In [None]:
# Register OpenAI's text-embedding-3-small as the global embedding model.
# Vectors are requested with 1024 dimensions, the same size the Pinecone
# index is created with further down.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=1024,
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [None]:
# Sentence-window node parser, taken straight from the LlamaIndex docs with
# its default settings spelled out: a window size of 3, the window stored under
# the "window" metadata key and the original text under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    include_metadata=True,
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

In [None]:
# Read the parsed paragraphs (text plus metadata columns) from the local parquet file
data = pd.read_parquet(
    "./data/parsed-dogmatics/dogmatics-text-w-meta-no-editors.parquet"
)

In [None]:
# Preview the first rows to sanity-check the loaded columns
data.head()

In [None]:
# Flatten each paragraph onto a single line by replacing newlines with spaces.
# regex=False makes this an explicit literal replacement; the default value of
# `regex` differs between pandas 1.x and 2.x, so relying on it is version-dependent.
data["text"] = data["text"].str.replace("\n", " ", regex=False)

In [None]:
@dataclass
class Paragraph:
    """A single paragraph of text together with its metadata fields.

    The field order is significant: instances are created positionally from
    DataFrame rows, so it has to line up with the column order of the table.
    """

    # Paragraph body; may be missing for some rows of the source table
    text: str
    # The remaining fields are carried along as document metadata
    excursus: bool
    title: str
    group: str
    volume: str
    date: str
    biblScope: str

In [None]:
# Turn every DataFrame row into a Paragraph.
# Values are passed positionally, so the column order of `data` has to line
# up with the field order of the Paragraph dataclass.
Paragraphs = list(
    map(lambda row: Paragraph(*row), data.itertuples(index=False))
)

In [None]:
# Inspect the first Paragraph to confirm the row values landed in the right fields
Paragraphs[0]


In [None]:
# Create one LlamaIndex Document per paragraph, carrying the paragraph's
# metadata fields along with its text.
# Paragraphs without text are skipped. pd.notna is used rather than an
# `is not None` check because pandas can represent a missing value as
# NaN / pd.NA as well as None, and those would otherwise be passed on as text.
docs = [
    Document(
        text=p.text,
        metadata={
            "excursus": p.excursus,
            "title": p.title,
            "group": p.group,
            "volume": p.volume,
            "date": p.date,
            "biblScope": p.biblScope,
        },
    )
    for p in Paragraphs
    if pd.notna(p.text)
]

In [None]:
# Compare the number of paragraphs with the number of documents built from
# them; the counts differ by the paragraphs that were skipped for having no text
(len(Paragraphs), len(docs))

In [None]:
# Look into paragraphs with no text.
# Given the POC nature of this particular work,
# we'll ignore them for now and look in depth later.
# pd.isna is used so that missing values represented as NaN / pd.NA are
# listed here too, not only those represented as None.
[p for p in Paragraphs if pd.isna(p.text)]

In [None]:
# Take a look at a single doc to check that its text and metadata are well-formed
docs[0]

In [None]:
# Split the documents into nodes with the SentenceWindowNodeParser configured above
nodes = node_parser.get_nodes_from_documents(
    docs,
    show_progress=True,
)

In [None]:
# Inspect the first node produced by the parser
nodes[0]

In [None]:
# Connect to Pinecone and make sure the "dogmatics" index exists.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# create_index fails when an index with this name already exists, which
# would break a Restart & Run All; only create it when it is missing.
# NOTE(review): this guards index creation only. Re-running the ingestion cell
# at the end of the notebook against an existing index may upsert the nodes
# again -- confirm before re-running.
if "dogmatics" not in pc.list_indexes().names():
    pc.create_index(
        name="dogmatics",
        # Must match the `dimensions` value the embedding model is configured with
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
# Wire the Pinecone index into LlamaIndex:
# index handle -> vector store -> storage context
pinecone_index = pc.Index("dogmatics")
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build the vector index over the nodes, backed by the Pinecone storage context
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    show_progress=True,
)