## Installs and setup
- Install Cohere for embeddings and UMAP to reduce embeddings to 2 dimensions
- Install Altair for visualization and Annoy for approximate nearest neighbor search

In [None]:
!pip install cohere umap-learn altair annoy datasets tqdm

In [None]:
import cohere
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex
import warnings
# Suppress Python warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')
# Do not truncate column contents when pandas displays a DataFrame
pd.set_option('display.max_colwidth', None)

# Get your Cohere API key

Get your key by signing up here: https://os.cohere.ai/

In [None]:
# Paste your API key here. Remember not to share it publicly
# (a key can be created and retrieved from os.cohere.ai).
api_key = ''

# Create the Cohere client that the embedding calls in this notebook use
co = cohere.Client(api_key)

## Get dataset

In [None]:
# Load the training split of the "trec" dataset
datas = load_dataset("trec", split="train")

# Convert to a pandas DataFrame and keep only the first 1000 rows
dt = pd.DataFrame(datas).head(1000)

# Show the first 10 rows as a sanity check that the data loaded correctly
dt.head(10)

## Embed the archive

In [None]:
# Collect the texts to embed as a plain Python list
texts_to_embed = dt['text'].tolist()

# Request embeddings for the texts and keep only the vectors from the response
embed_response = co.embed(texts=texts_to_embed, model="large", truncate="LEFT")
embeds = embed_response.embeddings

# Building the search index from the embeddings

In [None]:
# Create the search index. AnnoyIndex is given the embedding dimensionality
# up front; read it from the first vector so this works whether `embeds` is a
# plain list of vectors (which has no `.shape`) or a 2-D NumPy array.
search_index = AnnoyIndex(len(embeds[0]), 'angular')

# Add every vector to the index, keyed by its position in `embeds`
# (positions line up with the row positions of `dt`)
for item_id, vector in enumerate(embeds):
    search_index.add_item(item_id, vector)

search_index.build(10)  # 10 trees
search_index.save('test.ann')

# Find the neighbors of an example from the dataset

In [None]:
# Choose an example (we'll retrieve others similar to it)
example_id = 92

# Retrieve the 10 nearest neighbors; with include_distances=True the result
# is a pair: (list of item ids, list of distances)
similar_item_ids = search_index.get_nns_by_item(example_id, 10,
                                                include_distances=True)
neighbor_ids, neighbor_distances = similar_item_ids

# Pair each neighbor's text with its distance. The resulting frame is indexed
# by the neighbors' row labels in `dt`.
results = pd.DataFrame(data={'texts': dt.iloc[neighbor_ids]['text'],
                             'distance': neighbor_distances})

# Remove the example itself from its own neighbor list. The search is
# approximate, so the example may not be among the returned ids;
# errors='ignore' avoids a KeyError in that case.
results = results.drop(example_id, errors='ignore')

print(f"Question:'{dt.iloc[example_id]['text']}'\nNearest neighbors:")
results

# Find the neighbors of a user query

In [None]:
# Free-text query to look up in the search index
query = "What is the tallest mountain in the world?"

# Embed the query and keep only the vectors from the response
query_response = co.embed(texts=[query], model="large", truncate="LEFT")
query_embed = query_response.embeddings

# Ask the index for the 10 nearest neighbors of the query vector; with
# include_distances=True the result is a pair: (item ids, distances)
similar_item_ids = search_index.get_nns_by_vector(
    query_embed[0], 10, include_distances=True
)
neighbor_ids, neighbor_distances = similar_item_ids

# Pair each neighbor's text with its distance from the query
results = pd.DataFrame(data={
    'texts': dt.iloc[neighbor_ids]['text'],
    'distance': neighbor_distances,
})

print(f"Query:'{query}'\nNearest neighbors:")
results