<a href="https://colab.research.google.com/github/goodScienceRice/Open-Assistant/blob/main/Knowledge_Embedding_Community_Blog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Pinecone

### Connect Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

### Installing Pinecone Packages

In [None]:
# Install both packages in one resolver run so the pandas pin is not undone by
# a later `-U` upgrade of pandas.
# pinecone-client is held below 3.0 because this notebook uses the module-level
# `pinecone.init(...)` API.
!pip install -qU "pinecone-client<3" "pandas==1.5.3"

In [None]:
import pinecone

### Installing Other Required Packages

In [None]:
# The notebook calls `openai.Embedding.create`, which belongs to the pre-1.0
# openai package, so keep openai below 1.0.
!pip install "openai<1"
!pip install langchain
!pip install tiktoken

In [None]:
import os
from getpass import getpass

import openai

# Use the key from the environment when it is already set; otherwise prompt for
# it. This keeps the secret out of the saved notebook and avoids overwriting an
# existing OPENAI_API_KEY with an empty string.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
openai.api_key = OPENAI_API_KEY

from tqdm.auto import tqdm
from time import sleep
from uuid import uuid4
import tiktoken

### Pinecone Keys Configuration

In [None]:
# Configure the Pinecone client. Both arguments are empty placeholders: fill in
# your API key and environment before running this cell.
pinecone.init(api_key="", environment="")

### Pinecone Index Details

In [None]:
# Show the names of the indexes visible to the configured Pinecone client.
pinecone.list_indexes()

['knowledge']

In [None]:
# Show the statistics (dimension, vector counts) of the "knowledge" index.
pinecone.Index('knowledge').describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 17}},
 'total_vector_count': 17}

In [None]:
# Left disabled on purpose: uncomment the next line to delete the "knowledge" index.
#pinecone.delete_index("knowledge")

### PDF, CSV, and TXT Files Are Supported

PDF Upload

In [None]:
# Install PyPDF2 and import it to confirm the installation succeeded.
!pip install PyPDF2
import PyPDF2

In [None]:
# importing required modules
from PyPDF2 import PdfReader

# creating a pdf reader object
reader = PdfReader('stevejobs.pdf')

# getting a specific page from the pdf file
page = reader.pages

In [None]:
# Extract the text of every page, then join the pieces once into `text`.
page_texts = []
for page in reader.pages:
    page_texts.append(page.extract_text())
text = "".join(page_texts)
print(text)

CSV Upload



In [None]:
import csv

filename = 'Pinecone test - Sheet1.csv'

# Flatten the CSV into one string: cells of a row are joined with spaces and
# every row ends with a newline.
# newline='' lets the csv module handle line endings itself, so quoted fields
# that contain embedded newlines are parsed correctly.
with open(filename, 'r', newline='') as csv_file:
    # Named csv_reader (not `reader`) so the PdfReader created by the PDF cell
    # is not overwritten when both cells are run.
    csv_reader = csv.reader(csv_file)
    lines_string = "".join(' '.join(row) + "\n" for row in csv_reader)

text = lines_string
print(text)

TXT Upload

In [None]:
# Read the whole plain-text file into `text`.
with open('stevejobs.txt') as txt_file:
    text = txt_file.read()

### Text Chunking

In [None]:
from langchain.text_splitter import TokenTextSplitter

# Split `text` with a token-based splitter (chunk_size=512, chunk_overlap=10).
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=10)
input_chunks = text_splitter.split_text(text)

# enumerate(start=1) numbers the chunks from 1 without a hand-maintained counter.
for i, item in enumerate(input_chunks, start=1):
  print(f'\nChunk {i}:\n')
  print(item)


In [None]:
# Number of chunks produced by the splitter.
len(input_chunks)

189

### Embedding Model and Batch Size

In [None]:
# Name of the OpenAI embedding model, passed as `engine` to openai.Embedding.create.
embed_model = "text-embedding-ada-002"
# Number of chunks handled per iteration of the upsert loop.
batch_size=6

### Creating Embedding

In [None]:
def create_embeddings(texts):
  """Embed text with the OpenAI model named by the global ``embed_model``.

  Args:
    texts: A single string, or a list of strings to embed in one request.

  Returns:
    For a string input, the embedding vector of that string. For a list
    input, a list with one embedding vector per record, in the order of the
    records in the API response (previously only the last one was returned).

  Raises:
    ValueError: If the API response contains no embedding records.
  """
  res = openai.Embedding.create(input=texts, engine=embed_model)
  vectors = [record['embedding'] for record in res['data']]
  if not vectors:
    # Previously this case surfaced as an UnboundLocalError on `embeds`.
    raise ValueError("OpenAI returned no embeddings for the given input")
  if isinstance(texts, str):
    # One input string yields one record; return its vector directly.
    return vectors[0]
  return vectors

In [None]:
# Build one record per chunk: a fresh UUID, the chunk text, its position in
# input_chunks, the knowledge name, and the chunk's embedding.
final_chunks = [
  {
    "id": str(uuid4()),
    "text": chunk_text,
    "chunk": position,
    "knowledge_name": "knowledge",
    "embeds": create_embeddings(chunk_text),
  }
  for position, chunk_text in enumerate(input_chunks)
]

In [None]:
# Number of chunk records prepared for upsert.
len(final_chunks)

### Pinecone Index Creation and Upsert

In [None]:
# Create the index only when it does not exist yet, so that re-running the
# notebook against an existing "knowledge" index does not fail on this cell.
if "knowledge" not in pinecone.list_indexes():
  # dimension=1536 matches the dimension reported by describe_index_stats above.
  pinecone.create_index("knowledge", dimension=1536, metric="dotproduct")

In [None]:
# Handle to the "knowledge" index, used by the upsert loop.
index = pinecone.Index('knowledge')

In [None]:
# Upsert the prepared chunks into Pinecone, batch_size records at a time.
for i in tqdm(range(0, len(final_chunks), batch_size)):
  # slicing clamps the end of the last batch to the list length
  meta_batch = final_chunks[i:i + batch_size]
  batch_id = [x['id'] for x in meta_batch]
  # Reuse the embedding already stored on each chunk under 'embeds' instead of
  # requesting it again from OpenAI. This removes the duplicate API calls and,
  # with them, the bare `except:` that retried forever on any error.
  embeds = [x['embeds'] for x in meta_batch]
  # keep only the fields that should be stored as Pinecone metadata
  meta_batch = [{
    'text': x['text'],
    'chunk': x['chunk'],
    'knowledge_name': x['knowledge_name'],
  } for x in meta_batch]
  to_upsert = list(zip(batch_id, embeds, meta_batch))
  # upsert to Pinecone (tqdm reports progress; the full vectors are not printed)
  index.upsert(vectors=to_upsert)



In [None]:
# Show the index statistics again to confirm the vector count after the upsert.
pinecone.Index('knowledge').describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 206}},
 'total_vector_count': 206}

## Qdrant

### Installing Qdrant Packages

In [None]:
# The requirement must be quoted: unquoted, the shell treats `>` as an output
# redirection, so pip installs an unconstrained qdrant-client and writes its
# output to a file named "=1.1.1".
!pip install "qdrant-client>=1.1.1"

In [None]:
from qdrant_client import models, QdrantClient

In [None]:
from qdrant_client.http.models import Batch

### Qdrant API Keys

In [None]:
# Create the Qdrant client. Both arguments are empty placeholders and must be
# filled in before running this cell.
qdrant_client = QdrantClient(
    api_key="", #Enter your api key
    url="" #Enter your url
)

### Installing Other Required Packages

In [None]:
# The notebook calls `openai.Embedding.create`, which belongs to the pre-1.0
# openai package, so keep openai below 1.0.
!pip install langchain
!pip install "openai<1"
!pip install tiktoken

In [None]:
import os
from getpass import getpass

import openai

# Use the key from the environment when it is already set; otherwise prompt for
# it. This keeps the secret out of the saved notebook and avoids overwriting an
# existing OPENAI_API_KEY with an empty string.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
openai.api_key = OPENAI_API_KEY

from tqdm.auto import tqdm
from time import sleep
from uuid import uuid4
import tiktoken

### Qdrant Collection Details

In [None]:
# Fetch the collection's information from Qdrant; fill in collection_name first.
collection_info = qdrant_client.get_collection(collection_name="") #Enter your collection name

### PDF, CSV, and TXT Files Are Supported

PDF Upload

In [None]:
# Install PyPDF2 and import it to confirm the installation succeeded.
!pip install PyPDF2
import PyPDF2

In [None]:
# importing required modules
from PyPDF2 import PdfReader

# creating a pdf reader object
reader = PdfReader('stevejobs.pdf')

# getting a specific page from the pdf file
page = reader.pages

In [None]:
# Extract the text of every page, then join the pieces once into `text`.
page_texts = []
for page in reader.pages:
    page_texts.append(page.extract_text())
text = "".join(page_texts)
print(text)

CSV Upload



In [None]:
import csv

filename = 'Pinecone test - Sheet1.csv'

# Flatten the CSV into one string: cells of a row are joined with spaces and
# every row ends with a newline.
# newline='' lets the csv module handle line endings itself, so quoted fields
# that contain embedded newlines are parsed correctly.
with open(filename, 'r', newline='') as csv_file:
    # Named csv_reader (not `reader`) so the PdfReader created by the PDF cell
    # is not overwritten when both cells are run.
    csv_reader = csv.reader(csv_file)
    lines_string = "".join(' '.join(row) + "\n" for row in csv_reader)

text = lines_string
print(text)

TXT Upload

In [None]:
# Read the whole plain-text file into `text`.
with open('stevejobs.txt') as txt_file:
    text = txt_file.read()

In [None]:
# NOTE(review): `query` is not assigned anywhere in the visible notebook, and
# `embedding_model` is first assigned in a later cell ("Embedding Model"), so
# this cell fails with NameError on a fresh top-to-bottom run. Define both
# before running it, or move this cell below the embedding setup.
query_res = openai.Embedding.create(
      input=[query],
      engine=embedding_model
    )

NameError: ignored

### Text Chunking

In [None]:
from langchain.text_splitter import TokenTextSplitter

# Split `text` with a token-based splitter (chunk_size=512, chunk_overlap=10).
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=10)
input_chunks = text_splitter.split_text(text)

# enumerate(start=1) numbers the chunks from 1 without a hand-maintained counter.
for i, item in enumerate(input_chunks, start=1):
  print(f'\nChunk {i}:\n')
  print(item)

### Embedding Model

In [None]:
# Name of the OpenAI embedding model, passed as `engine` to openai.Embedding.create.
embedding_model = "text-embedding-ada-002"

### Creating Embedding

In [None]:
def create_embeddings(texts):
  """Embed text with the OpenAI model named by the global ``embedding_model``.

  Args:
    texts: A single string, or a list of strings to embed in one request.

  Returns:
    For a string input, the embedding vector of that string. For a list
    input, a list with one embedding vector per record, in the order of the
    records in the API response (previously only the last one was returned).

  Raises:
    ValueError: If the API response contains no embedding records.
  """
  res = openai.Embedding.create(input=texts, engine=embedding_model)
  vectors = [record['embedding'] for record in res['data']]
  if not vectors:
    # Previously this case surfaced as an UnboundLocalError on `embeds`.
    raise ValueError("OpenAI returned no embeddings for the given input")
  if isinstance(texts, str):
    # One input string yields one record; return its vector directly.
    return vectors[0]
  return vectors

In [None]:
# Build one record per chunk: a fresh UUID, the chunk text, its position in
# input_chunks, and the chunk's embedding.
final_chunks = [
  {
    "id": str(uuid4()),
    "text": chunk_text,
    "chunk": position,
    "embeds": create_embeddings(chunk_text),
  }
  for position, chunk_text in enumerate(input_chunks)
]

### Qdrant Collection Creation

In [None]:
# Create collection to store books
qdrant.recreate_collection(
    collection_name="",
    vectors_config=models.VectorParams(
        size=1536, # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

### Qdrant Vector Upsert

In [None]:
# Let's vectorize descriptions and upload to qdrant

# Upload one record per chunk, using the embedding already stored under 'embeds'.
# Uses `qdrant_client` (the client defined in this notebook); the previous
# `qdrant` name was never defined and raised NameError.
# Fill in collection_name before running.
qdrant_client.upload_records(
    collection_name="",
    records=[
        models.Record(
            id=chunk['id'],
            vector=chunk['embeds'],
            # The payload keeps every field except the embedding, which is
            # already stored as the record's vector.
            payload={field: value for field, value in chunk.items() if field != 'embeds'}
        ) for chunk in final_chunks
    ]
)