# Generate embeddings

In [1]:
import datasets

In [2]:
# Fetch the SQuAD dataset through the `datasets` library.
squad_dataset = datasets.load_dataset("squad")

In [3]:
# Convert the "train" split into a pandas DataFrame for the steps below.
data = squad_dataset["train"].to_pandas()

In [4]:
# Keep only the first row for each distinct context. Rebinding the result
# (instead of inplace=True) keeps the cell explicit about what it produces;
# the original index labels are preserved either way.
data = data.drop_duplicates(subset="context", keep="first")

In [5]:
# Number of rows currently in `data`.
data.shape[0]

18891

In [6]:
# Preview the first three rows.
data.iloc[:3]

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
5,5733bf84d058e614000b61be,University_of_Notre_Dame,"As at most other universities, Notre Dame's st...",When did the Scholastic Magazine of Notre dame...,"{'text': ['September 1876'], 'answer_start': [..."
10,5733bed24776f41900661188,University_of_Notre_Dame,The university is the major seat of the Congre...,Where is the headquarters of the Congregation ...,"{'text': ['Rome'], 'answer_start': [119]}"


Note that we will use the `context` column to generate the embeddings, which is why rows with duplicate contexts were dropped above.

In [13]:
# Show the question stored at positional row 42 as a sample.
data["question"].iloc[42]

'In what year did the initial degrees get handed out at Notre Dame?'

## LangChain

In [7]:
import json
import os

import tensorflow_hub as hub
from langchain.embeddings import TensorflowHubEmbeddings
from tqdm.notebook import tqdm

2023-08-02 16:26:08.302467: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-02 16:26:08.333162: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-02 16:26:08.333898: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Instantiate LangChain's TensorflowHubEmbeddings with its default arguments.
# NOTE(review): the export loop in this section calls the raw `model` loaded
# via hub.load rather than this object — confirm where this wrapper is consumed.
embeddings = TensorflowHubEmbeddings()

In [9]:
# TF Hub handle for universal-sentence-encoder-multilingual, version 3.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# Load the module; `model` is later called directly on a one-element list
# holding each context.
model = hub.load(module_url)

In [10]:
# Export one JSON object per line: {"id": <row id>, "embedding": [<floats>]}.
output_path = "./data/embeddings.json"
# Create the target directory if it is missing so open() does not fail on a
# fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as f:
    for _, row in tqdm(data.iterrows(), total=len(data)):
        # Use a dedicated name for the per-row result: assigning to `embeddings`
        # here would overwrite the TensorflowHubEmbeddings instance created
        # earlier in the notebook.
        context_vector = model([row["context"]]).numpy()[0]
        record = {
            "id": row["id"],
            # tolist() yields plain Python floats that json can serialise.
            "embedding": context_vector.tolist(),
        }
        json.dump(record, f)
        f.write("\n")

  0%|          | 0/18891 [00:00<?, ?it/s]

Take a look at the first line of the file:

In [11]:
!head -n1 ./data/embeddings.json

{"id": "5733be284776f41900661182", "embedding": [0.049244094640016556, 0.05186053737998009, -0.03633502125740051, 0.04245924949645996, 0.03653573989868164, 0.0245915949344635, -0.03140788525342941, 0.03027556836605072, 0.0699005126953125, -0.07061614096164703, -0.01452316902577877, 0.022638365626335144, 0.05143193155527115, -0.012333580292761326, 0.021940043196082115, -0.018177784979343414, -0.015822330489754677, -0.05141735449433327, 0.06755772233009338, 0.059448592364788055, -0.0008403332903981209, 0.004648773930966854, 0.04808363318443298, 0.061812371015548706, -0.026148028671741486, 0.040365252643823624, 0.01215648464858532, 0.01699242927134037, -0.024787873029708862, -0.06204745173454285, -0.04726498946547508, 0.002844578120857477, -0.03173746168613434, -0.030263183638453484, 0.03417026624083519, -0.030377527698874474, -0.06122349947690964, -0.037982601672410965, 0.008448370732367039, 0.03224416822195053, -0.006424921564757824, 0.06323166936635971, 0.03328016400337219, 0.033746555