# Semantic Search with Cohere Embed Jobs

In [None]:
!pip install "cohere>5" hnswlib -q

In [1]:
import os

import cohere
import hnswlib
# Prefer the COHERE_API_KEY environment variable so a real key never has to be
# written into (and committed with) the notebook. The original placeholder
# string is kept as the fallback, so editing it in place still works.
co = cohere.Client(os.environ.get('COHERE_API_KEY', 'COHERE_API_KEY'))

## Step 1: Upload a dataset

In [2]:
# Upload a dataset for embed jobs
# This sample dataset has Wikipedia articles on the following: YouTube, United States, United Kingdom, Elizabeth II, Wikipedia, 2022 FIFA World Cup, Microsoft Office, India, Cristiano Ronaldo, Cleopatra, Instagram, Facebook, and Ukraine

dataset_file_path = "data/embed_jobs_sample_data.jsonl" # Full path - https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/embed_jobs_sample_data.jsonl

# Open the file in a context manager so the handle is closed as soon as the
# upload call returns, instead of leaking until the kernel exits.
with open(dataset_file_path, 'rb') as dataset_file:
    ds = co.datasets.create(
        name='sample_file',
        data=dataset_file,
        # NOTE(review): keep_fields presumably asks the API to retain these
        # fields next to the text; confirm against the Cohere Datasets docs.
        keep_fields=['id', 'wiki_id'],
        type="embed-input",
    )

In [3]:
# Wait on the uploaded dataset, then print whatever co.wait hands back
dataset_status = co.wait(ds)
print(dataset_status)

...
...


## Step 2: Create embeddings via Cohere's Embed Jobs endpoint

In [4]:
# The dataset is uploaded; start an embed job over it.
# input_type is "search_document" because these vectors are the documents that
# will be stored in the vector index and searched against later.
job = co.embed_jobs.create(
    model="embed-english-v3.0",
    input_type="search_document",
    embedding_types=["float"],
    dataset_id=ds.id,
)

# Poll the server until the job is completed
co.wait(job)

...
...
...
...
...
...
...




In [None]:
# Show the job object that co.embed_jobs.create returned
print(job)

## Step 3: Download and prepare the embeddings

In [5]:
# Save down the output of the job
embeddings_file_path = 'embed_jobs_output.jsonl'

# co.wait returns the job result; its output_dataset_id identifies the dataset
# that holds the generated embeddings.
emb_result = co.wait(job)
output_dataset = co.datasets.get(id=emb_result.output_dataset_id)

# Write the embeddings dataset to a local JSON Lines file
co.utils.save_dataset(output_dataset.dataset, embeddings_file_path, format="jsonl")


In [6]:
import json
def load_jsonl(file_path):
  """Read a JSON Lines file and return its records as a list.

  Args:
    file_path: Path to a UTF-8 encoded file holding one JSON value per line.

  Returns:
    A list with one decoded JSON value per non-blank line, in file order.
    An empty (or all-blank) file yields an empty list.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # Read as UTF-8 explicitly rather than relying on the platform default
  # encoding, which can fail on non-ASCII article text.
  with open(file_path, 'r', encoding='utf-8') as file:
    # Whitespace-only lines (e.g. a trailing blank line) are skipped because
    # json.loads would raise on them.
    return [json.loads(line) for line in file if line.strip()]

# Parse the saved job output and split it into two parallel lists:
# embeddings[i] is the float vector that belongs to texts[i].
data_lines = load_jsonl(embeddings_file_path)
embeddings = [record['embeddings']['float'] for record in data_lines]
texts = [record['text'] for record in data_lines]

            
        

## Step 4: Initialize Hnswlib index and add embeddings

In [7]:
# Build the HNSW index over every document embedding.
# Each vector is labelled with its position in `embeddings`, so a label
# returned by a query can be used directly as an index into `texts`.
num_vectors = len(embeddings)
vector_labels = list(range(num_vectors))

# space="ip" selects inner-product similarity.
# NOTE(review): dim=1024 is presumably the embed-english-v3.0 vector width; confirm if the model changes.
index = hnswlib.Index(dim=1024, space="ip")
index.init_index(max_elements=num_vectors, M=64, ef_construction=512)
index.add_items(embeddings, vector_labels)

## Step 5: Query the index and rerank the results

In [8]:
# Query the Database
query = "What was the first youtube video about?"

# Convert the query into embeddings. input_type="search_query" is the
# query-side counterpart of the "search_document" type used for the dataset.
query_emb = co.embed(
    texts=[query], model="embed-english-v3.0", input_type="search_query"
).embeddings

# Retrieve the initial results from your vector db.
# knn_query returns (labels, distances); [0][0] takes the labels for the
# single query that was submitted.
doc_index = index.knn_query(query_emb, k=10)[0][0]

# From the doc_index, get the text for each label and pass the texts into rerank.
# The loop variable must not be called `index`: that would overwrite the
# hnswlib index object and make this cell fail when it is run a second time.
docs_to_rerank = [texts[doc_id] for doc_id in doc_index]

final_result = co.rerank(
    query=query,
    documents=docs_to_rerank,
    return_documents=True,
    model="rerank-english-v2.0",
    top_n=3)

## Step 6: Display the results

In [13]:
# Output Results
# NOTE(review): r.index is presumably the position of the document inside
# docs_to_rerank (the list passed to co.rerank), not its label in the hnswlib
# index; confirm against the Cohere Rerank docs.
for idx, r in enumerate(final_result.results):
  report = [
      f"Document Rank: {idx + 1}, Document Index: {r.index}",
      f"Document: {r.document.text}",
      f"Relevance Score: {r.relevance_score:.5f}",
      "\n",
  ]
  # Joining with newlines and printing once yields the same text as four separate print calls
  print("\n".join(report))

Document Rank: 1, Document Index: 0
Document: youtube began as a venture capital – funded technology startup. between november 2005 and april 2006, the company raised money from various investors, with sequoia capital, $ 11. 5 million, and artis capital management, $ 8 million, being the largest two. youtube ' s early headquarters were situated above a pizzeria and a japanese restaurant in san mateo, california. in february 2005, the company activated codice _ 1. the first video was uploaded april 23, 2005. titled " me at the zoo ", it shows co - founder jawed karim at the san diego zoo and can still be viewed on the site. in may, the company launched a public beta and by november, a nike ad featuring ronaldinho became the first video to reach one million total views. the site launched officially on december 15, 2005, by which time the site was receiving 8 million views a day. clips at the time were limited to 100 megabytes, as little as 30 seconds of footage.
Relevance Score: 0.91490
