In [6]:
# This workbook covers the basics of working with pinecone
# See https://www.youtube.com/watch?v=Q6616MuRmKU
!pip install pinecone-client

Defaulting to user installation because normal site-packages is not writeable
Collecting pinecone-client
  Obtaining dependency information for pinecone-client from https://files.pythonhosted.org/packages/df/d4/cffbb61236c6c1d7510e835c1ff843e4e7d705ed59d21c0e5b6dc1cb4fd8/pinecone_client-2.2.4-py3-none-any.whl.metadata
  Using cached pinecone_client-2.2.4-py3-none-any.whl.metadata (7.8 kB)
Collecting loguru>=0.5.0 (from pinecone-client)
  Obtaining dependency information for loguru>=0.5.0 from https://files.pythonhosted.org/packages/03/0a/4f6fed21aa246c6b49b561ca55facacc2a44b87d65b8b92362a8e99ba202/loguru-0.7.2-py3-none-any.whl.metadata
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting dnspython>=2.0.0 (from pinecone-client)
  Obtaining dependency information for dnspython>=2.0.0 from https://files.pythonhosted.org/packages/f6/b4/0a9bee52c50f226a3cbfb54263d02bb421c7f2adc136520729c2c689c1e5/dnspython-2.4.2-py3-none-any.whl.metadata
  Downloading dnspython-2.4.2-py3-

In [1]:
# Get environment variables
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())

True

Creating embeddings

In [3]:
# Create embedder object
import os
from langchain.embeddings.openai import OpenAIEmbeddings

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or 'OPENAI_API_KEY'

model_name = 'text-embedding-ada-002'
embedder = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

In [4]:
# Will have to get texts from jsonl or json file created during chunking
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

result = embedder.embed_documents(texts)
len(result), len(result[0])

(2, 1536)

In [5]:
# The result is just a bunch of vectors. No text is included.
print(result)

[[0.003042488170397273, -0.009244673312589876, -0.009786147017888436, -0.03269973194642018, 0.0003900096684716083, 0.026360526910249826, -0.013933043994962485, -0.006748611620636086, -0.020932581841329418, -0.03647684079889781, -0.000640936505382856, 0.03925024102316199, -0.015200884070874039, 0.009700304227537364, 0.013285916406733161, 0.0022599923886739374, 0.010453084767374936, 0.008689993454001543, 0.010770044553522196, -0.009495600469643239, -0.012420878907319938, 0.011998265238241915, -0.008452272450237953, -0.00800984955122527, -0.002682606057751961, -0.02892262123129751, 0.008313603184082757, -0.021064648962162244, -0.0060948809557618755, -0.007263672088032584, -0.018383693673571504, -0.01032101764654211, 0.002345835877516897, -0.02188346213109371, -0.002027224856547158, -0.0023970118169904285, -0.014949957845143193, -0.017696945762827827, 0.019308161656756254, -0.02624166594270677, 0.019942081229050772, -0.00027507029632045866, -0.00324719146263014, -0.00672880145937891, -0.00

Vector databse

In [6]:
# Give the index a name
index_name = 'langchain-retrieval-augmentation'

In [7]:
import pinecone # !install pinecone-client
import tqdm

# find API key in console at app.pinecone.io
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY') or 'PINECONE_API_KEY'
# find ENV (cloud region) next to API key in console
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# Initialise the pinecone client object
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)



  from tqdm.autonotebook import tqdm


In [8]:
# Create the index if it does not exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=len(result[0])  # 1536 dim of text-embedding-ada-002
    )

In [10]:
# Retrieve the index
# index = pinecone.GRPCIndex(index_name)
index = pinecone.Index(index_name)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

Get some data

In [12]:
!pip install -Uq apache_beam
!pip install -Uq datasets==2.12.0
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split='train[:10]') # Just 10 docs for now
data

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\donnp\\AppData\\Roaming\\Python\\Python311\\site-packages\\~yarrow\\arrow.dll'
Check the permissions.


[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Downloading and preparing dataset wikipedia/20220301.simple to C:/Users/donnp/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235M [00:00<?, ?B/s]

ModuleNotFoundError: No module named 'apache_beam'

Indexing

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeddings = embedder.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeddings, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeddings = embedder.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeddings, metadatas))
    
index.describe_index_stats()

Creating a Vector Store and Querying

In [13]:
from langchain.vectorstores import Pinecone

index_name = 'langchain-retrieval-augmentation'
text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

# Create vector store
vectorstore = Pinecone(
    index, embedder.embed_query, text_field
)



In [20]:
# Query the vector store
query = "What are some of the common topics I write about?"
query = "Summarise what I have written about sex and polyamory?"
# query = "What are some of the common topics I write about?"
# query = "What are some of the common topics I write about?"
# query = "What are some of the common topics I write about?"
# query = "What are some of the common topics I write about?"

vectorstore.similarity_search(
    query,  # our search query
    k=6  # return k most relevant docs
)

[Document(page_content="the polyamorous community is incredibly diverse. Not all poly relationships are hypersexual; in fact, many focus on emotional connections, deep love, and support rather than constant carnal activities. Polyamory isn't just about sexit's about personal growth and exploring the many facets of love and connection. People choose this lifestyle for various reasons, and for some, it's about much more than what happens in the bedroom. Believe it or not, some individuals intentionally choose polyamory as a path to a more balanced, less sex-focused life. They seek meaningful connections without the constant pressure to perform sexually, debunking the notion that polyamory is all about more action. The idea that becoming polyamorous guarantees more sex is, indeed, a myth. While polyamorous individuals might have more opportunities for intimacy and connection, that doesn't necessarily translate into a higher frequency of sexual activity. It's all about choice, communicatio

Generative Question-Answering

In [21]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

qa.run(query)

"Polyamory is not solely about sex; it focuses on emotional connections, deep love, and support. While polyamorous individuals may have more opportunities for intimacy, it doesn't necessarily mean they have more sex. The frequency of sexual activity in polyamorous relationships varies based on individual dynamics and choices. Polyamory is about choice, communication, and exploring love and personal growth in all its forms. It's important to debunk the myth that polyamory guarantees more sex and to approach the topic with understanding and respect for the diverse experiences within the polyamorous community."

In [17]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

qa_with_sources(query)

ValueError: Document prompt requires documents to have metadata variables: ['source']. Received document with missing metadata: ['source'].