# indexing chapters

## Index book chapters 

### Libraries used
- OpenAI: creates embeddings from the paragraphs within each chapter
- Pinecone: stores the embeddings and searches across them by cosine similarity
- Retry: retries failed API calls

In [4]:
import openai
import os
import pinecone
import re

from retry import retry

# URL of the plain-text chapter that is downloaded and indexed below.
book_url = "https://raw.githubusercontent.com/ganesh47/data-samples/main/books/sheshadri-swamigal/chapter1.txt"
# Name of the Pinecone index that receives the paragraph embeddings.
index_name = 'sheshadri-swamigal-v2'
# OpenAI model passed as `engine` when requesting embeddings.
embed_model = "text-embedding-ada-002"

def should_index(text):
    """Return True when *text* is at least 50 characters long.

    Shorter fragments are skipped by the indexing loop instead of being
    embedded.
    """
    return len(text) >= 50

def download(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Args:
        url: Any URL that ``urllib.request.urlopen`` can open.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    from urllib.request import urlopen

    # The context manager closes the response (and its socket) even when
    # reading fails, instead of leaving cleanup to the garbage collector.
    with urlopen(url) as response:
        data = response.read()
    return data.decode('utf-8')

# Retry up to 10 times, waiting 1s, 2s, 4s, ... (capped at 30s) between
# attempts so a rate-limited or briefly unavailable API is not hit again
# immediately. The last exception is re-raised once the tries are exhausted.
@retry(tries=10, delay=1, backoff=2, max_delay=30)
def crate_embedding(text):
    """Request an OpenAI embedding for *text* using ``embed_model``.

    Args:
        text: The text to embed.

    Returns:
        The response of ``openai.Embedding.create``; the caller reads
        ``res['data'][i]['embedding']`` and ``res['usage']['total_tokens']``.
    """
    return openai.Embedding.create(input=text, engine=embed_model)

# Fetch the chapter and break it into paragraphs on blank lines.
book_content = download(book_url)
paras = book_content.split("\n\n")

# Total character count across all paragraphs; the estimate below divides it
# by 4000 and multiplies by 0.0004 to get an approximate dollar cost.
count = sum(len(para) for para in paras)

print(f"Likley cost for OpenAI embedding API ${count / 4000 * 0.0004}")
# Credentials are read from the environment so no secrets live in the notebook.
openai.api_key = os.environ["API"]

# The Pinecone client must be configured BEFORE an index handle is created;
# otherwise the handle is built against an unconfigured client.
pinecone.init(
    api_key=os.environ["PC_API"],
    environment="us-east1-gcp"
)
index = pinecone.Index(index_name)

total_tokens = 0
paraCount = 0
chapterCount = 1
# The existence check is loop-invariant, so it runs once: on the first
# embedded paragraph, when the embedding dimension is known.
index_checked = False
for para in paras:
    if not should_index(para):
        continue
    res = crate_embedding(para)
    paraCount = paraCount + 1
    embeds = [record['embedding'] for record in res['data']]
    total_tokens = res['usage']['total_tokens'] + total_tokens
    if not index_checked:
        if index_name not in pinecone.list_indexes():
            # if does not exist, create index sized to the embedding vector
            pinecone.create_index(
                index_name,
                dimension=len(res['data'][0]['embedding']),
                metric='cosine',
                metadata_config={'indexed': ['text', 'chapter']}
            )
        index_checked = True
    # One vector per paragraph: (id, embedding, metadata). zip() pairs the
    # single id/metadata with the embedding returned for this paragraph.
    vector_id = "chapter " + str(chapterCount) + " para" + str(paraCount)
    metadata = {'text': para, 'chapter': str(chapterCount)}
    index.upsert(vectors=list(zip([vector_id], embeds, [metadata])))
print("Used total tokens " + str(total_tokens))
print("OpenAI API usage costs $"+str((total_tokens * 0.0004)/1000))



Likley cost for OpenAI embedding API $0.0024460000000000003
Used total tokens 6668
OpenAI API usage costs $0.0026672
