In [12]:
!pip install -qU \
  langchain==0.1.1 \
  langchain-community==0.0.13 \
  openai==0.27.7 \
  tiktoken==0.4.0 \
  pinecone-client==3.1.0 \
  pinecone-datasets==0.7.0

## Building the Knowledge Base

Download a pre-embedded dataset from `pinecone-datasets` to skip the embedding and preprocessing steps.


In [13]:
import pinecone_datasets

# Identifier of the pre-embedded dataset to fetch.
dataset_name = 'wikipedia-simple-text-embedding-ada-002-100K'
dataset = pinecone_datasets.load_dataset(dataset_name)

# Preview the first few records.
dataset.head()

Unnamed: 0,id,values,sparse_values,metadata,blob
0,1-0,"[-0.011254455894231796, -0.01698738895356655, ...",,,"{'chunk': 0, 'source': 'https://simple.wikiped..."
1,1-1,"[-0.0015197008615359664, -0.007858820259571075...",,,"{'chunk': 1, 'source': 'https://simple.wikiped..."
2,1-2,"[-0.009930099360644817, -0.012211072258651257,...",,,"{'chunk': 2, 'source': 'https://simple.wikiped..."
3,1-3,"[-0.011600767262279987, -0.012608098797500134,...",,,"{'chunk': 3, 'source': 'https://simple.wikiped..."
4,1-4,"[-0.026462381705641747, -0.016362832859158516,...",,,"{'chunk': 4, 'source': 'https://simple.wikiped..."


In [14]:
# Number of records in the loaded dataset.
len(dataset)

100000

Format the dataset so that it is ready for upsert, and reduce it to a subset of the full dataset.

In [15]:
# Replace the `metadata` column (empty in the preview above) with the contents
# of `blob`: drop the old `metadata` column, then rename `blob` to `metadata`.
# The guard makes the cell safe to re-run: once `blob` has been renamed there
# is nothing left to do, and an unguarded second run would drop the renamed
# column instead.
if 'blob' in dataset.documents.columns:
    dataset.documents.drop(columns=['metadata'], inplace=True)
    dataset.documents.rename(columns={'blob': 'metadata'}, inplace=True)
# Keep only the rows before position 30_000; done in place so that `dataset`
# itself reflects the smaller size.
dataset.documents.drop(dataset.documents.index[30_000:], inplace=True)
len(dataset)

30000


Initializing the Pinecone vector database


With the data prepared, set up the index that will store it.

Initialize the connection to Pinecone using a [free API key](https://app.pinecone.io).

In [16]:
import os
from pinecone import Pinecone

# Prefer the key from the environment; an unset or empty value falls back to
# the placeholder string below.
api_key = os.getenv('PINECONE_API_KEY') or 'enter my key here'

# Client object used for the index management calls in this notebook.
pc = Pinecone(api_key=api_key)

define the cloud provider and region to deploy the index.

In [17]:
from pinecone import ServerlessSpec

# Deployment target: taken from the environment when set (and non-empty),
# otherwise AWS / us-east-1.
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(region=region, cloud=cloud)

In [18]:
# Name of the Pinecone index that the following cells create, query and delete.
index_name = 'simple-wikipidia'

In [19]:
import time

# Start from a clean slate: remove any index already registered under this name.
existing_indexes = pc.list_indexes().names()
if index_name in existing_indexes:
    pc.delete_index(index_name)

# Build the index from scratch.
pc.create_index(
    index_name,
    dimension=1536,  # vector size of text-embedding-ada-002
    metric='dotproduct',
    spec=spec,
)

# Poll once per second until the index reports that it is ready.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

connect to the new index:

In [20]:
# Obtain a handle to the index created above.
index = pc.Index(index_name)
# Pause for one second before issuing the first request on the new handle.
time.sleep(1)

# Show the index statistics.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

The new Pinecone index has a `total_vector_count` of `0`, since no vectors have been added yet.

upsert the data to Pinecone:

In [21]:
# Send the documents to the index in batches of 100 records.
for doc_batch in dataset.iter_documents(batch_size=100):
    index.upsert(doc_batch)

check the number of vectors in the index like so:

In [22]:
# Re-read the index statistics to confirm the upserted vectors are counted.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 30000}},
 'total_vector_count': 30000}

## Creating a Vector Store and Querying

Initialize a LangChain vector store on top of the same index. First, initialize a LangChain embedding object:

In [23]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Read the OpenAI key from the environment; an unset or empty value falls back
# to the placeholder string below.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or 'enter my key here'

# Embedding model; matches the model named in the dataset identifier.
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

  warn_deprecated(


initialize the vector store:

In [24]:
# NOTE: this import rebinds the name `Pinecone`, which earlier in the notebook
# referred to the client class imported from the `pinecone` package.
from langchain.vectorstores import Pinecone

# Passed to the vector store as its text field.
# NOTE(review): presumably the metadata key under which each record's text is
# stored — confirm against the dataset.
text_field = "text"

# Take a fresh handle to the index for LangChain to use.
index = pc.Index(index_name)

vectorstore = Pinecone(index, embed.embed_query, text_field)



query the vector store directly using `vectorstore.similarity_search`:

In [31]:
query = "what causes obesity?"

# Retrieve the 3 documents most relevant to the query.
vectorstore.similarity_search(query, k=3)

[Document(page_content="Obesity is the condition of being much too heavy for one's height so that one's health is affected. In other words, it means to be too overweight. Also known as being fat. It is considered a disease and has been described as an epidemic.\n\nTo know if a person is overweight, the body mass index (BMI) is calculated, by dividing the person's weight (in kilograms), by their height (in metres) squared (multiplied by itself). (This is only meaningful for adults who are fully grown, and should not be used for children. Growth charts can be used to measure obesity in children.)\n\nA BMI between 18.5 and 25 is considered normal. People with a BMI of 25 or more are said to be overweight; with 30 and above, they are considered obese, and with 35 and above, they are considered severely obese (this used to be called morbidly obese). In general, the BMI number is a good quantifiable measurement of a person's obesity. However, it is a poor predictor in people who are very ath



## Generative Question-Answering

In generative question-answering (GQA), the query is posed as a question to be answered by an LLM, but the LLM must base its answer on the information returned from the `vectorstore`.
To do this, initialize a `RetrievalQA` object:

In [32]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model that writes the final answer; temperature is set to 0.0.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# "stuff" chain over the vector store's retriever.
qa = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [33]:
# Ask the retrieval-QA chain the question held in `query`.
qa.run(query)

'Obesity can be caused by various factors, including getting more calories than the body uses, poor nutrition, hormonal problems like hypothyroidism, metabolism issues, eating disorders, psychological problems like depression, lack of sleep, lack of exercise, and genetic factors. Additionally, low levels of the hormone leptin have been linked to obesity.'

To include the sources of information that the LLM used to answer the question, use a slightly different version of `RetrievalQA` called `RetrievalQAWithSourcesChain`:

In [34]:
from langchain.chains import RetrievalQAWithSourcesChain

# Same LLM and retriever as the plain QA chain, built with the sources variant.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [35]:
# Run the sources-aware chain on the same query.
qa_with_sources(query)

{'question': 'what causes obesity?',
 'answer': 'Obesity is caused by getting more calories than are used by the body, poor nutrition, hormonal problems, metabolism issues, eating disorders, psychological problems, lack of sleep, lack of exercise, and genetics. \n',
 'sources': 'https://simple.wikipedia.org/wiki/Obesity'}

Delete the index to save resources.

In [None]:
# Delete the index that this notebook created under `index_name`.
pc.delete_index(index_name)

---