In [18]:
import dotenv
import os

# Work from the project root so relative paths used later in the notebook
# (e.g. 'data/stories.json') resolve. The directory can be overridden with the
# CHATBOT_PROJECT_DIR environment variable; when unset, the original location
# is used, so existing setups behave exactly as before.
# The override is read before load_dotenv() runs, so it must be set in the
# real process environment, not in the .env file.
project_dir = os.environ.get(
    'CHATBOT_PROJECT_DIR',
    '/home/lakiet/Projects/personal/chatbot/',
)
os.chdir(project_dir)

# Called after the chdir above; presumably picks up the project's .env file
# from the new working directory — confirm against python-dotenv's lookup rules.
dotenv.load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail here with a clear message rather than handing None to the API clients
# and getting a less obvious error further down the notebook.
missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", pinecone_api_key),
        ("OPENAI_API_KEY", openai_api_key),
    )
    if not value
]
if missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_keys)
    )


In [19]:
from pinecone import Pinecone, ServerlessSpec


# Pinecone client, authenticated with the key read from PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)


In [26]:
import json
from llama_index.core import Document

# Load the JSON data.
# The encoding is pinned to UTF-8 so decoding the story text does not depend
# on the platform's locale default.
with open('data/stories.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Create one Document per JSON record.
# Each record must provide 'author', 'title' and 'story'; a missing key raises
# KeyError. The story text is used as the document text and is also kept in
# the metadata next to the author and title.
documents = [
    Document(
        text=item['story'],
        metadata={
            'author': item['author'],
            'title': item['title'],
            'story': item['story'],
        },
    )
    for item in data
]

In [25]:
# Display the text of the first loaded document as a quick sanity check
documents[0].text

'Spread love everywhere you go: first of all in your own house. Give \nlove to your children, to your wife or husband, to a next door neighbor. . \n. . Let no one ever come to you without leaving better and happier. Be \nthe living expression of God\'s kindness; kindness in your face, kindness \nin your eyes, kindness in your smile, kindness in your warm greeting. Mother Teresa \n \nA college professor had his sociology class go into the Baltimore slums \nto get case histories of 200 young boys. They were asked to write an \nevaluation of each boy\'s future. In every case the students wrote, "He \nhasn\'t got a chance." Twenty-five years later another sociology \nprofessor came across the earlier study. He had his students follow up \non the project to see what had happened to these boys. With the \nexception of 20 boys who had moved away or died, the students learned \nthat 176 of the remaining 180 had achieved more than ordinary success \nas lawyers, doctors and businessmen. \nThe pr

In [23]:
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding client for OpenAI's 'text-embedding-3-small' model
embedding_model = OpenAIEmbedding(
    model='text-embedding-3-small',
    api_key=openai_api_key,
)

# Embed every document's text, one call per document, keeping the same order
# as `documents` so positions line up between the two lists.
embeddings = []
for document in documents:
    embeddings.append(embedding_model.get_text_embedding(document.text))

In [28]:
# Create or connect to a Pinecone index
index_name = "chatbot1"

# list_indexes() yields index description objects, not plain strings, so the
# membership test has to run against .names(). Testing the string against the
# listing itself never matches, which makes every run try to re-create the
# index and fail once it already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        # Must equal the length of the vectors upserted below
        # (presumably the default output size of the embedding model — confirm).
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

# Upload embeddings to Pinecone: one upsert per document, with id
# "doc-<position>", into the 'ns1' namespace, carrying the document's metadata.
# NOTE(review): doc.metadata includes the full story text; Pinecone limits the
# metadata size per vector — confirm long stories stay within that limit.
for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
    index.upsert(
        vectors=[(f"doc-{i}", embedding, doc.metadata)],
        namespace='ns1',
    )