In [2]:
from datasets import load_dataset

In [3]:
import os

import cohere

# Take the Cohere API key from the environment rather than embedding it in the
# notebook, so the secret is never saved (or committed) with the code.
# Falls back to an empty string, matching the previous placeholder value.
COHERE_KEY = os.environ.get("CO_API_KEY", "")
co = cohere.Client(COHERE_KEY)

In [4]:
# Load the "trec" dataset; the slice in the split string limits the result to
# the first 1,000 training rows.
trec = load_dataset(
    "trec",
    split="train[:1000]",
)
# Leave the dataset as the last expression so its summary is displayed.
trec

Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 1000
})

In [5]:
# Peek at the first record to see which fields each row carries.
trec[0]

{'text': 'How did serfdom develop in and then leave Russia ?',
 'coarse_label': 2,
 'fine_label': 26}

In [6]:
# Embed every question text in a single request and keep only the vectors.
# NOTE(review): truncate="LEFT" presumably drops the start of over-long inputs
# so they fit the model's limit; confirm against the Cohere embed docs.
embed_response = co.embed(texts=trec["text"], model="small", truncate="LEFT")
embeds = embed_response.embeddings

In [10]:
# Print the first embedding vector in full to inspect the raw values.
print(embeds[0])

[2.6484375, 0.828125, 0.8696289, 0.27270508, -0.6147461, 2.3925781, -1.0908203, -0.6245117, -0.7993164, -1.2324219, 1.1728516, 0.7817383, 2.421875, 2.2246094, 1.5712891, -0.12322998, 1.4853516, -1.1220703, -0.73291016, 1.7353516, -0.6582031, 0.12634277, 0.33203125, 0.76416016, -0.7602539, -2.1875, 0.025512695, 2.1835938, 1.1064453, 1.8740234, -1.4619141, -2.6542969, 3.546875, 0.99609375, 1.3789062, -0.41333008, 1.6484375, -1.7255859, 1.4726562, 3.1855469, 2.2597656, -0.95166016, -2.9375, 0.1694336, 0.012794495, -0.89453125, -1.7177734, -1.3837891, 2.2402344, 2.3925781, 0.61279297, -0.4013672, 1.6220703, 0.22375488, 0.7026367, 0.09863281, 0.77001953, 0.22351074, -1.4140625, -0.70947266, -1.0517578, -1.1875, 0.7524414, -1.2724609, 2.3222656, -0.14916992, 1.5712891, 0.12237549, 0.6953125, -2.6855469, -0.7109375, -1.3173828, -0.49926758, 0.56591797, -1.9736328, -2.2324219, -1.28125, -0.33813477, -1.2441406, -2.8886719, -0.7363281, -1.4375, -0.28027344, 1.6748047, -1.7685547, -1.9023438, 0.

In [8]:
# Length of one embedding vector, i.e. the dimension a vector index storing
# these embeddings has to be created with.
len(embeds[0])

1024

In [12]:
import os

from pinecone import Pinecone, ServerlessSpec, PodSpec

# Single source of truth for the index this notebook writes to and queries.
INDEX_NAME = "cohere"

# Take the Pinecone API key from the environment rather than embedding it in
# the notebook. Falls back to an empty string, matching the previous
# placeholder value.
pinecone = Pinecone(api_key = os.environ.get("PINECONE_API_KEY", ""))

# One-time setup, intentionally left disabled: create the index when it does
# not exist yet.
# NOTE(review): the earlier version of this snippet checked for and created an
# index named "openai" although the notebook opens "cohere", and it tested
# membership against list_indexes() directly instead of the index names. Both
# are corrected below. The spec values are carried over unverified; confirm
# them against your Pinecone project before enabling this.
# if INDEX_NAME not in pinecone.list_indexes().names():
#     pinecone.create_index(INDEX_NAME, dimension = len(embeds[0]),
#                           spec = ServerlessSpec(
#                               cloud = "gcp-starter",
#                               region = "us-central1"
#                           ))
index = pinecone.Index(INDEX_NAME)

In [17]:
from tqdm.auto import tqdm

batch_size = 16

# Read the text column once instead of on every loop iteration.
texts = trec["text"]
ids = [str(i) for i in range(len(embeds))]
meta = [{"text": text} for text in texts]

# Build the (id, vector, metadata) records a single time. Previously the zip
# was recreated and fully materialised for every batch, which made the loop
# quadratic in the number of records.
to_upsert = list(zip(ids, embeds, meta))

# Iterate over the records actually built, so ids, vectors and metadata can
# never get out of step with the loop bound. Slicing clamps at the end of the
# list, so the final (possibly shorter) batch needs no special handling.
for start in tqdm(range(0, len(to_upsert), batch_size)):
    index.upsert(vectors = to_upsert[start:start + batch_size])

# NOTE(review): the reported vector count may lag behind the number of vectors
# just upserted (the captured output shows fewer vectors than rows); confirm
# whether the stats are eventually consistent before treating this as data loss.
index.describe_index_stats()

100%|██████████| 63/63 [00:17<00:00,  3.64it/s]


{'dimension': 1024,
 'index_fullness': 0.00704,
 'namespaces': {'': {'vector_count': 704}},
 'total_vector_count': 704}

In [20]:
# Free-text question to search the index with.
query = "What caused the 1929 Great Depression?"

# Embed the query with the same model and truncation settings used for the
# indexed texts. The result is a list holding one vector, because a
# one-element list of texts is sent.
query_response = co.embed(texts=[query], model="small", truncate="LEFT")
xq = query_response.embeddings
print(xq)

[[1.5400391, -2.0390625, 2.1621094, -1.9482422, 2.7851562, 1.7753906, -0.15100098, -2.1464844, 1.2587891, -0.41015625, 2.671875, 0.5131836, 0.7758789, 0.85009766, -1.9990234, -0.61865234, 0.11029053, -2.2753906, -0.47265625, -1.7255859, -5.0078125, 0.4741211, -1.2324219, -0.7138672, -1.0634766, -2.2675781, -0.13671875, -2.15625, 0.24987793, 4.375, -1.0751953, -3.0019531, 0.19641113, -2.6464844, 0.07550049, 0.4074707, 3.5917969, -0.72558594, 1.4785156, 2.2363281, 0.30908203, -2.8847656, 2.4375, 0.27172852, 1.3173828, 3.3964844, -5.7734375, 0.068725586, -1.4980469, 3.0, -1.8193359, 1.4316406, 0.40551758, -2.875, -1.2451172, -1.4736328, 1.453125, -0.94091797, -1.0205078, -0.14367676, 0.66064453, -0.28466797, -1.2910156, 0.14746094, -0.8564453, -0.76220703, 0.61816406, 0.8696289, -0.07824707, 1.4316406, 0.30200195, 0.92529297, 1.1201172, -1.4814453, -2.4785156, -0.8696289, -1.4316406, -1.3369141, 0.34936523, 0.8984375, 2.6308594, -1.6035156, -0.7294922, -1.8496094, -2.1816406, 0.28759766, 

In [22]:
# xq is a list containing a single embedding (one per input text), while the
# query's `vector` argument is a flat list of floats, so pass the first (only)
# embedding. Metadata is requested so each match includes its original text.
index.query(vector = xq[0], top_k = 5, include_metadata= True)

{'matches': [{'id': '932',
              'metadata': {'text': 'Why did the world enter a global '
                                   'depression in 1929 ?'},
              'score': 0.832151711,
              'values': []},
             {'id': '787',
              'metadata': {'text': "When was `` the Great Depression '' ?"},
              'score': 0.752715111,
              'values': []},
             {'id': '400',
              'metadata': {'text': 'What crop failure caused the Irish Famine '
                                   '?'},
              'score': 0.498864532,
              'values': []},
             {'id': '835',
              'metadata': {'text': 'What were popular songs and types of songs '
                                   'in the 1920s ?'},
              'score': 0.338593841,
              'values': []},
             {'id': '160',
              'metadata': {'text': 'What war did the Wanna-Go-Home Riots occur '
                                   'after ?'},
             