# Generate embeddings

In [1]:
import datasets

In [2]:
# Load the SQuAD question-answering dataset by name through the `datasets` library.
squad_dataset = datasets.load_dataset('squad')

In [3]:
# Work with the "train" split as a pandas DataFrame.
data = squad_dataset["train"].to_pandas()

In [4]:
# Keep only the first row for each distinct `context` passage.
# The result is rebound instead of using `inplace=True`, so the frame is not
# mutated in place; the resulting rows and index labels are the same.
data = data.drop_duplicates(subset='context', keep='first')

In [5]:
# Number of rows remaining after dropping duplicate contexts.
len(data)

18891

In [6]:
# Preview the first three rows of the frame.
data.head(3)

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
5,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,"{'text': ['September 1876'], 'answer_start': [..."
10,5733bed24776f41900661188,University_of_Notre_Dame,The university is the major seat of the Congre...,Where is the headquarters of the Congregation ...,"{'text': ['Rome'], 'answer_start': [119]}"


## LangChain

In [7]:
import json

from tqdm.notebook import tqdm
import tensorflow_hub as hub
from langchain.embeddings import TensorflowHubEmbeddings

2023-07-31 23:33:48.831627: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-31 23:33:48.864027: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-31 23:33:48.864490: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Instantiate LangChain's TensorFlow Hub embeddings wrapper with its default arguments.
# NOTE(review): the export loop further down calls the raw `model` directly
# rather than this wrapper — confirm whether this object is still needed.
embeddings = TensorflowHubEmbeddings()

In [9]:
# TF Hub handle for the multilingual Universal Sentence Encoder, version 3.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# Load the module; it is later called directly on a list of strings.
model = hub.load(module_url)

In [11]:
# Embed each row's question with `model` and stream the results to disk as
# JSON Lines: one {"id": ..., "embedding": [...]} object per line.
with open("./data/embeddings.json", "w") as f:
    for index, row in tqdm(data.iterrows(), total=len(data)):
        # Use a dedicated name here: reusing `embeddings` would overwrite the
        # LangChain `TensorflowHubEmbeddings` object created earlier with a tensor.
        question_embedding = model([row["question"]])
        new = {
            "id": row["id"],
            # The model is called on a one-element list, so take item 0 of the
            # result; `.tolist()` yields plain Python floats for `json.dump`.
            "embedding": question_embedding.numpy()[0].tolist(),
        }
        json.dump(new, f)
        f.write("\n")

  0%|          | 0/18891 [00:00<?, ?it/s]

Take a look at the first line of the file

In [12]:
!head -n1 ./data/embeddings.json

{"id": "5733be284776f41900661182", "embedding": [-0.017896970734000206, 0.06270504742860794, 0.005160084459930658, -0.059968288987874985, 0.04375344142317772, 0.016342133283615112, 0.05349290370941162, 0.03941403329372406, 0.07059962302446365, -0.07120203226804733, -0.03565110266208649, -0.017518026754260063, 0.06243707612156868, 0.015335098840296268, 0.04969439283013344, -0.0318908616900444, -0.05461360514163971, -0.0349762961268425, 0.0502435564994812, 0.013538036495447159, 0.012199447490274906, -0.04340279847383499, 0.06497767567634583, 0.07574842870235443, 0.0740252211689949, 0.04883730411529541, -0.0028725431766361, 0.010363751091063023, -0.01908387430012226, -0.0668087899684906, -0.04329714924097061, 0.060762714594602585, -0.033792510628700256, -0.06321902573108673, -0.04491347819566727, 0.0154750756919384, -0.0332784578204155, -0.056721173226833344, -0.04757324233651161, 0.06218024715781212, 0.021218128502368927, 0.04435649514198303, -0.06075049191713333, 0.033897850662469864, 0