# Splitting and Embedding Text Using LangChain (Similarity Search)

In [1]:
pip install -r ./files/requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [4]:
pip install --upgrade langchain langchain-openai langchain-pinecone pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open('./files/churchill_speech.txt') as f:
    churchill_speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)
chunks = text_splitter.create_documents([churchill_speech])
print(chunks[0])

page_content='Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940'


In [8]:
print(chunks[0].page_content)

Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940


In [9]:
print(f'Now you have {len(chunks)}')

Now you have 300


## Embedding cost

In [10]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00002:.6f}')

# check prices here: https://openai.com/pricing
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.000096


## Creating Text Embeddings

In [11]:
from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings()
# embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

[0.021068239584565163, 0.042090147733688354, 0.07936204969882965, 0.019168738275766373, 0.00019255606457591057, -0.03076264075934887, -0.003900346113368869, -0.0014955670339986682, -0.009810224175453186, 0.018045254051685333, 0.032638974487781525, -0.0026060219388455153, 0.01694493368268013, -0.035372402518987656, -0.004601076245307922, 0.016678540036082268, -0.013597643002867699, 0.003932197578251362, 0.0098681366071105, 0.02090608701109886, -0.02606021799147129, -0.004531582351773977, -0.04273875802755356, -0.018856016919016838, -0.03412151336669922, 0.007708033546805382, -0.0034428443759679794, -0.04714003950357437, 0.023002486675977707, -0.012914285995066166, 0.030554158613085747, -0.01812632940709591, -0.0016157336067408323, 0.036831777542829514, 0.0278438962996006, -0.05295436456799507, 0.04440661147236824, -0.013215426355600357, -0.04257660731673241, 0.002021114807575941, -0.006636668927967548, -0.04859941080212593, 0.02997504360973835, 0.04178901016712189, 0.01923823356628418, 

## Inserting the Embeddings into a Pinecone Index

In [13]:
# Import the necessary libraries and initialize the Pinecone client
import os
import pinecone
from langchain_community.vectorstores import Pinecone

pc = pinecone.Pinecone()

In [14]:
# Delete all indexes
indexes = pc.list_indexes().names()
for i in indexes:
    print('Deleting all indexes ... ', end='')
    pc.delete_index(i)
    print('Done')

Deleting all indexes ... Done


In [15]:
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone()
index_name = "churchill-speech"

In [16]:
if index_name not in pc.list_indexes().names():
    print(f'Creating index: {index_name}...')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
    print('Index created!')
else:
    print(f'Index {index_name} already exists!')

Creating index: churchill-speech...
Index created!


In [17]:
# processing the input documents, generating embeddings using the provided `OpenAIEmbeddings` instance,
# inserting the embeddings into the index and returning a new Pinecone vector store object.
vector_store = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name
)

In [18]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 300}},
 'total_vector_count': 300,
 'vector_type': 'dense'}

In [19]:
# Loading the vector store from an existing index
vector_store = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x10883b4d0>

## Asking Questions (Similarity Search)

In [20]:
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(id='63fc7c39-14fc-4e4b-a756-6f07113a731c', metadata={}, page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(id='8b5c783a-ba95-4487-b9df-b2e9fba372e7', metadata={}, page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(id='ac28996c-262c-4b81-ba30-2c37dd5ad4a6', metadata={}, page_content='streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a'), Document(id='6aa1c8c9-fb4a-475d-9e77-567a66ae15a2', metadata={}, page_content='number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so')]


In [21]:
for r in result:
    print(r.page_content)
    print('-' * 50)

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a
--------------------------------------------------
number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so
--------------------------------------------------


### Answering in Natural Language using an LLM

In [22]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Initialize the LLM with the specified model and temperature
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.2)

# Use the provided vector store with similarity search and retrieve top 3 results
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Create a RetrievalQA chain using the defined LLM, chain type 'stuff', and retriever
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

In [23]:
query = 'Where shoud we fight?'
answer = qa_chain.invoke(query)
print(answer["result"])

We shall fight on the beaches, landing grounds, fields, in France, on the seas and oceans, in the streets, and in the hills.


In [24]:
query = 'Who was the king of Belgium at that time?'
answer = qa_chain.invoke(query)
print(answer["result"])

The king of Belgium at that time was King Leopold.
