In [13]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process
# environment, then build the Pinecone client from the key found there.
load_dotenv()

pinecone_api_key = os.environ.get('PINECONE_API_KEY')  # None when the variable is unset
pc = Pinecone(api_key=pinecone_api_key)

In [14]:
# Name of the Pinecone index this notebook writes to.
index_name = 'secret-hitler-strategy'

# Create the serverless index only when it does not exist yet, so re-running
# this cell reuses the existing index instead of trying to create it again.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors upserted into this index
        # (the embedding cell uses text-embedding-3-large).
        dimension=3072,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used for all subsequent reads and writes against the index.
index = pc.Index(index_name)

In [15]:
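# One-time preprocessing, kept for reference: split ./tartanllama.md with mdsplit
# and write the pieces to ./out (the directory the embedding cell reads).
# Uncomment to regenerate the split files.
# from mdsplit import PathBasedSplitter

# splitter = PathBasedSplitter('./tartanllama.md', 'utf8', 5, True, './out', False, False)
# splitter.process()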

In [16]:
from pathlib import Path

from openai import OpenAI

# Client for the embedding requests in this cell. No key is passed explicitly,
# so the SDK is expected to pick up its credentials from the environment.
client = OpenAI()

# Number of lines embedded per OpenAI request and vectors written per Pinecone
# upsert. Kept small so each request stays comfortably within the services'
# per-request size limits for 3072-dimensional vectors.
BATCH_SIZE = 100


def non_blank_lines(lines):
    """Return ``(line_number, text)`` pairs for lines containing non-whitespace text.

    Line numbers are 1-based positions in ``lines``. The text is returned
    unchanged (including its trailing newline) so the stored metadata matches
    the file contents exactly.
    """
    return [(number, text) for number, text in enumerate(lines, start=1) if text.strip()]


def chunked(items, size):
    """Yield consecutive slices of ``items`` holding at most ``size`` elements each."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def vector_id(file_path, line_number):
    """Build a record id that is unique per line of a file.

    Using the bare file path as the id would make every line of a file share
    one id, and an upsert overwrites any record that has the same id -- so only
    a single vector per file would survive.
    """
    return f"{file_path}#L{line_number}"


# Walk the split Markdown files, embed every non-blank line, and store one
# vector per line in the 'tartanllama' namespace.
# NOTE(review): records written by earlier runs used the bare file path as id;
# they are not touched here and may need to be deleted manually.
for dir_path, _dirs, files in os.walk('./out'):
    for file_name in files:
        if Path(file_name).suffix != ".md":
            continue
        file_path = Path(dir_path) / file_name

        # Read with an explicit encoding (the split files were written as UTF-8)
        # and close the file before doing any network calls.
        with open(file_path, 'r', encoding='utf-8') as f:
            numbered_lines = non_blank_lines(f.readlines())

        for batch in chunked(numbered_lines, BATCH_SIZE):
            # One request embeds the whole batch instead of one request per line.
            response = client.embeddings.create(
                model="text-embedding-3-large",
                input=[text for _, text in batch],
                encoding_format="float"
            )
            # Each result carries the index of the input it belongs to; sort on
            # it so embeddings line up with `batch` regardless of return order.
            embeddings = sorted(response.data, key=lambda item: item.index)

            vectors = [
                {
                    'metadata': { 'text': text },
                    'id': vector_id(file_path, line_number),
                    'values': item.embedding,
                }
                for (line_number, text), item in zip(batch, embeddings)
            ]

            index.upsert(vectors=vectors, namespace='tartanllama')