In [18]:
from langchain.document_loaders import *

# Build a loader over the local 'pineconedocs' directory and read it into `docs`.
# DirectoryLoader is expected to come from the wildcard import above.
# Later cells read `.page_content` and `.metadata['source']` from each item.
loader = DirectoryLoader('pineconedocs')
docs = loader.load()

In [19]:
# Preview how one document's local source path is turned into a docs URL.
# Only the 'pineconedocs\\' prefix is swapped for 'https://'; any other
# backslashes in the path are left untouched by this expression.
docs[50].metadata['source'].replace('pineconedocs\\', 'https://')

'https://docs.pinecone.io\\docs\\release-notes.html'

In [20]:
import tiktoken

# cl100k_base is the encoding used by text-embedding-ada-002 and gpt-3.5-turbo,
# the two models this notebook calls, so token counts taken with this tokenizer
# line up with what those models actually see.
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    """Return how many tokens `text` encodes to.

    Encoding is done with the notebook-level `tokenizer`, passing an empty
    `disallowed_special` tuple.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk lengths are measured by tiktoken_len, so chunk_size and chunk_overlap
# are expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    # listed from paragraph breaks down to the empty string
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [22]:
from uuid import uuid4
from tqdm.auto import tqdm

# Flat list of chunk records: one dict per text chunk, across all documents.
# Re-created on every run of this cell so re-running does not duplicate chunks.
chunks = []

for record in tqdm(docs):
    texts = text_splitter.split_text(record.page_content)
    # Map the local file path to its public URL once per document.
    # Separators are normalised to '/' first so the result is a valid URL
    # whether the loader reported Windows ('\\') or POSIX ('/') paths.
    url = (
        record.metadata['source']
        .replace('\\', '/')
        .replace('pineconedocs/', 'https://')
    )
    chunks.extend(
        {
            'id': str(uuid4()),  # random unique id used as the vector id
            'text': text,
            'chunk': i,  # position of this chunk within its document
            'url': url,
        }
        for i, text in enumerate(texts)
    )

100%|██████████| 138/138 [00:00<00:00, 382.18it/s]


In [24]:
import openai
import os

# Read the OpenAI key from the environment (keys are issued at platform.openai.com)
openai.api_key = os.getenv("OPENAI_API_KEY_FREE")
embed_model = "text-embedding-ada-002"

# Embed two throwaway strings; the length of the returned vector is what the
# index-creation cell uses as the index dimension.
res = openai.Embedding.create(
    engine=embed_model,
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
)

In [25]:
import pinecone

index_name = 'gpt-3-5-turbo-pinecone-docs'

# initialize connection to pinecone; the API key is read from the
# `pinecone_api_key` environment variable
pinecone.init(
    api_key=os.getenv("pinecone_api_key"),  # app.pinecone.io (console)
    environment="asia-northeast1-gcp"  # next to API key in console
)

# create the index only when it is not already listed, so re-running this
# cell does not try to create it a second time
if index_name not in pinecone.list_indexes():
    # the dimension is taken from the first vector in `res`
    # NOTE(review): this relies on `res` still holding the sample embedding
    # response from the previous cell; later cells rebind `res` — confirm
    # the cells are run in order
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']),
        metric='cosine'
    )
# connect to index
index = pinecone.Index(index_name)
# view index stats (last expression of the cell, so the notebook displays it)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [26]:
from tqdm.auto import tqdm
import datetime
from time import sleep

batch_size = 100  # how many embeddings we create and insert at once
max_attempts = 20  # embedding attempts per batch before giving up
retry_delay = 5  # seconds to wait between attempts

# Only transient API failures are retried. Anything else (bad API key, invalid
# request, KeyboardInterrupt) propagates immediately instead of being swallowed
# by an endless retry loop.
retryable_errors = (
    openai.error.RateLimitError,
    openai.error.APIConnectionError,
    openai.error.ServiceUnavailableError,
)

for i in tqdm(range(0, len(chunks), batch_size)):
    # slicing clamps at the end of the list, so the last batch may be shorter
    meta_batch = chunks[i:i + batch_size]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings, retrying transient failures a bounded number of times
    for attempt in range(1, max_attempts + 1):
        try:
            res = openai.Embedding.create(input=texts, engine=embed_model)
            break
        except retryable_errors:
            if attempt == max_attempts:
                # out of attempts: surface the real error rather than hang
                raise
            sleep(retry_delay)
    embeds = [record['embedding'] for record in res['data']]
    # keep only the fields stored as metadata (the id is passed separately)
    meta_batch = [{
        'text': x['text'],
        'chunk': x['chunk'],
        'url': x['url']
    } for x in meta_batch]
    # each vector is an (id, embedding, metadata) tuple
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

100%|██████████| 4/4 [00:20<00:00,  5.19s/it]


In [30]:
query = "how do I use the pinecone"

# embed the question with the same model that embedded the chunks
xq = openai.Embedding.create(input=[query], engine=embed_model)['data'][0]['embedding']

# retrieve the top 12 matches from Pinecone, including their stored metadata
# NOTE(review): 12 chunks of up to 400 tokens each may not fit the chat
# model's context window — confirm against the model's limit
res = index.query(xq, top_k=12, include_metadata=True)

In [31]:
# pull the stored chunk text out of every match
contexts = [match['metadata']['text'] for match in res['matches']]

# prompt layout: chunks separated by '---', then a '-----' rule, then the question
augmented_query = "\n\n-----\n\n".join(["\n\n---\n\n".join(contexts), query])

In [32]:
# System prompt: tells the model to answer only from the context the user
# message supplies above the question, and to admit when it cannot.
primer = """You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

# Ask the question with the retrieved contexts prepended to it
res = openai.ChatCompletion.create(
    messages=[
        {"role": "system", "content": primer},
        {"role": "user", "content": augmented_query},
    ],
    model="gpt-3.5-turbo",
)

from IPython.display import Markdown

# render the first choice's reply as Markdown
display(Markdown(res['choices'][0]['message']['content']))

To use Pinecone, you first need to sign up for an account on their website and get an API key. Once you have your API key, you can start using Pinecone for high-performance vector search applications. Pinecone's API allows you to create indexes for your data, add and query vector embeddings, and perform filtering and searching on metadata associated with each embedding. Pinecone supports a variety of use cases such as semantic text search, question-answering, image similarity search, and recommendation systems. To get started with Pinecone, you can refer to their documentation and example applications on their website.

In [33]:
# Baseline for comparison: same system prompt, but the bare question is sent
# without any retrieved context in front of it.
res = openai.ChatCompletion.create(
    messages=[
        {"role": "system", "content": primer},
        {"role": "user", "content": query},
    ],
    model="gpt-3.5-turbo",
)
display(Markdown(res['choices'][0]['message']['content']))

Pinecones have different uses. Some people use them as decorations, while others use them to start fires. If you want to use pinecones as decorations, you can simply place them in a bowl or use them to make a festive wreath. If you plan to use them to start a fire, you can gather dry pinecones and place them under your kindling or firewood. The pinecones will help your fire burn longer and produce more heat.

In [34]:
# Second baseline: bare question again, this time with a shorter system prompt
# that drops the instruction to answer only from provided information.
res = openai.ChatCompletion.create(
    messages=[
        {"role": "system", "content": "You are Q&A bot. A highly intelligent system that answers user questions"},
        {"role": "user", "content": query},
    ],
    model="gpt-3.5-turbo",
)
display(Markdown(res['choices'][0]['message']['content']))

A pinecone can be used for various purposes like craft projects, decoration, or even as a flavoring ingredient in some recipes. To use a pinecone, you can follow these steps:

1. Collect the pinecones: Find pinecones in areas where pine trees grow, such as forests or parks.

2. Clean the pinecones: Remove any dirt, debris, or bugs from the pinecones by gently brushing them off with a soft brush.

3. Prepare the pinecones: You can open up the pinecones by microwaving them for a few seconds to expand the cone's scales, or you can leave them as they are for a more natural look.

4. Use the pinecones: Depending on the purpose, you can tie a string around the pinecone and hang them as ornaments, use them in table centerpieces or wreaths, or add them to potpourri or sachets for a pleasant scent. In cooking, you can grind the pinecones and use them as a flavored seasoning.

In [35]:
# Optional cleanup: uncomment to delete the index once you are done with it
#pinecone.delete_index(index_name)