In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY')
os.environ['ACTIVELOOP_TOKEN'] = os.environ.get('ACTIVELOOP_TOKEN')

In [2]:
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_openai import OpenAIEmbeddings

# Define the documents
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]

# Initialize the OpenAIEmbeddings instance
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Generate embeddings for the documents
document_embeddings = embeddings.embed_documents(documents)

# Perform a similarity search for a given query
query = "A cat is sitting on a mat."
query_embedding = embeddings.embed_query(query)

# Calculate similarity scores
similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

# Find the most similar document
most_similar_index = np.argmax(similarity_scores)
most_similar_document = documents[most_similar_index]

print(f"Most similar document to the query '{query}':")
print(most_similar_document)


Most similar document to the query 'A cat is sitting on a mat.':
The cat is on the mat.


In [3]:
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

documents = ["Document 1", "Document 2", "Document 3"]
doc_embeddings = hf.embed_documents(documents)

  return self.fget.__get__(instance, owner)()


In [4]:
# Print the embeddings
for text, embedding in zip(documents, doc_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")  # print first 5 dimensions of each embedding

Text: Document 1
Embedding: [0.03201056271791458, 0.038009535521268845, 0.008137303404510021, -0.05043879523873329, -0.05071724206209183]
Text: Document 2
Embedding: [0.03084605187177658, 0.061166975647211075, 0.0210378747433424, -0.04848083481192589, -0.0537794791162014]
Text: Document 3
Embedding: [0.04238653555512428, 0.008868656121194363, 0.023097645491361618, -0.049794841557741165, -0.035510122776031494]


## Embedding with cohere

In [5]:
import cohere
from langchain.embeddings import CohereEmbeddings

# Initialize the CohereEmbeddings object
cohere = CohereEmbeddings(
	model="embed-multilingual-v2.0",
	cohere_api_key="lmAnD5JRWeE7F2RI2CmW2sQrTqp3PhNtypCLKFyi"
)

# Define a list of texts
texts = [
    "Hello from Cohere!", 
    "مرحبًا من كوهير!", 
    "Hallo von Cohere!",  
    "Bonjour de Cohere!", 
    "¡Hola desde Cohere!", 
    "Olá do Cohere!",  
    "Ciao da Cohere!", 
    "您好，来自 Cohere！", 
    "कोहेरे से नमस्ते!"
]

# Generate embeddings for the texts
document_embeddings = cohere.embed_documents(texts)

# Print the embeddings
for text, embedding in zip(texts, document_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")  # print first 5 dimensions of each embedding

Text: Hello from Cohere!
Embedding: [0.23461914, 0.50146484, -0.048828125, 0.13989258, -0.18029785]
Text: مرحبًا من كوهير!
Embedding: [0.25317383, 0.30004883, 0.0104904175, 0.12573242, -0.18273926]
Text: Hallo von Cohere!
Embedding: [0.10266113, 0.28320312, -0.050201416, 0.23706055, -0.07159424]
Text: Bonjour de Cohere!
Embedding: [0.15185547, 0.28173828, -0.057281494, 0.11743164, -0.04385376]
Text: ¡Hola desde Cohere!
Embedding: [0.25146484, 0.43139648, -0.0859375, 0.24682617, -0.11706543]
Text: Olá do Cohere!
Embedding: [0.18664551, 0.39038086, -0.045898438, 0.14562988, -0.11254883]
Text: Ciao da Cohere!
Embedding: [0.115722656, 0.43310547, -0.026168823, 0.14575195, 0.07080078]
Text: 您好，来自 Cohere！
Embedding: [0.24609375, 0.30859375, -0.111694336, 0.26635742, -0.051086426]
Text: कोहेरे से नमस्ते!
Embedding: [0.1932373, 0.6352539, 0.03213501, 0.117370605, -0.26098633]


## Storing in Deep Lake

In [6]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

create some documents using the RecursiveCharacterTextSplitter class.

In [7]:
# create our documents
texts = [
    "Napoleon Bonaparte was born in 15 August 1769",
    "Louis XIV was born in 5 September 1638",
    "Lady Gaga was born in 28 March 1986",
    "Michael Jeffrey Jordan was born in 17 February 1963"
]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.create_documents(texts)
# Print the embeddings
for text, embedding in zip(texts, document_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")  # print first 5 dimensions of each embedding
    print("------------------------------------------------")

Text: Napoleon Bonaparte was born in 15 August 1769
Embedding: [0.23461914, 0.50146484, -0.048828125, 0.13989258, -0.18029785]
------------------------------------------------
Text: Louis XIV was born in 5 September 1638
Embedding: [0.25317383, 0.30004883, 0.0104904175, 0.12573242, -0.18273926]
------------------------------------------------
Text: Lady Gaga was born in 28 March 1986
Embedding: [0.10266113, 0.28320312, -0.050201416, 0.23706055, -0.07159424]
------------------------------------------------
Text: Michael Jeffrey Jordan was born in 17 February 1963
Embedding: [0.15185547, 0.28173828, -0.057281494, 0.11743164, -0.04385376]
------------------------------------------------


The next step is to create a Deep Lake database and load our documents into it.

In [9]:
# initialize embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# create Deep Lake dataset
# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = "e2tovar"
my_activeloop_dataset_name = "langchain_course_embeddings3"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
db = DeepLake(dataset_path=dataset_path, embedding=embeddings)

# add documents to our Deep Lake dataset
db.add_documents(docs)

Your Deep Lake dataset has been successfully created!


Creating 4 embeddings in 1 batches of size 4:: 100%|██████████| 1/1 [00:08<00:00,  8.75s/it]

Dataset(path='hub://e2tovar/langchain_course_embeddings3', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (4, 1)      str     None   
 metadata     json      (4, 1)      str     None   
 embedding  embedding  (4, 1536)  float32   None   
    id        text      (4, 1)      str     None   





['cbc343c9-d5bc-11ee-b341-482ae342e600',
 'cbc343ca-d5bc-11ee-b4d0-482ae342e600',
 'cbc343cb-d5bc-11ee-b05e-482ae342e600',
 'cbc343cc-d5bc-11ee-9fd9-482ae342e600']

Retriving

In [12]:
# create retriever from db
retriever = db.as_retriever()

Finally, we create a RetrievalQA chain in LangChain and run it

In [14]:
# istantiate the llm wrapper
model = ChatOpenAI(model='gpt-3.5-turbo')

# create the question-answering chain
qa_chain = RetrievalQA.from_llm(model, retriever=retriever)

# ask a question to the chain
qa_chain.invoke("When was Michael Jordan born?")

{'query': 'When was Michael Jordan born?',
 'result': 'Michael Jordan was born on 17 February 1963.'}

Let's break down each step to understand how these technologies work together.

1-OpenAI and LangChain Integration: LangChain, a library built for chaining NLP models, is designed to work seamlessly with OpenAI's GPT-3.5-turbo model for language understanding and generation. You've initialized OpenAI embeddings using OpenAIEmbeddings(), and these embeddings are later used to transform the text into a high-dimensional vector representation. This vector representation captures the semantic essence of the text and is essential for information retrieval tasks.

2-Deep Lake: Deep Lake is a Vector Store for creating, storing, and querying vector representations (also known as embeddings) of data.

3-Text Retrieval: Using the db.as_retriever() function, you've transformed the Deep Lake dataset into a retriever object. This object is designed to fetch the most relevant pieces of text from the dataset based on the semantic similarity of their embeddings.

4-Question Answering: The final step involves setting up a RetrievalQA chain from LangChain. This chain is designed to accept a natural language question, transform it into an embedding, retrieve the most relevant document chunks from the Deep Lake dataset, and generate a natural language answer. The ChatOpenAI model, which is the underlying model of this chain, is responsible for both the question embedding and the answer generation.