# Embeddings for RAG

### Importing Libraries and Loading Data

In [1]:
import numpy as np
from time import time

In [2]:
a_test = np.load("data/answers_test.npy", allow_pickle=True)
a_train = np.load("data/answers_train.npy", allow_pickle=True)
q_test = np.load("data/questions_test.npy", allow_pickle=True)
q_train = np.load("data/questions_train.npy", allow_pickle=True)

In [3]:
from hide_cell import toggle_code as hide_cell
# Pip install necessary package
!pip install -U --quiet  huggingface_hub
%pip install --upgrade --quiet  pgvector
%pip install --upgrade --quiet  langchain-openai
%pip install --upgrade --quiet  psycopg2-binary
%pip install --upgrade --quiet  tiktoken

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch, os, numpy as np, pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, Trainer, TrainingArguments

In [9]:
articles = pd.read_csv("data/articles.csv", sep='|', )
print(articles.shape)
articles.head()

(2067, 2)


Unnamed: 0,Titles,Content
0,Super_Bowl_50,Super Bowl 50 was an American football game to...
1,Super_Bowl_50,The Panthers finished the regular season with ...
2,Super_Bowl_50,The Broncos took an early lead in Super Bowl 5...
3,Super_Bowl_50,"CBS broadcast Super Bowl 50 in the U.S., and c..."
4,Super_Bowl_50,"In early 2012, NFL Commissioner Roger Goodell ..."


### Embeddings for Articles

In [12]:
texts = list(articles["Content"])

In [13]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')
model.eval()

print("Started tokenizing...")
all_article_embeddings = torch.randn(0)
start = time()

# Tokenize sentences
for i in range(articles.shape[0] // 100 + 1):
    #print(i)
    encoded_input = tokenizer(texts[i*100:min(i*100+100, articles.shape[0])], padding=True, truncation=True, return_tensors='pt')
    # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
    # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, cls pooling.
        text_embeddings = model_output[0][:, 0]
    # normalize embeddings
    sentence_embeddings = torch.nn.functional.normalize(text_embeddings, p=2, dim=1)
    all_article_embeddings = torch.cat((all_article_embeddings, text_embeddings), dim=0)
    
    if i % 2 == 0: 
        print("... +200 embeddings done. Time spend:", time() - start)
        start = time()
    #print("Sentence embeddings:", sentence_embeddings)
    #print(type(sentence_embeddings))

Started tokenizing...
... +200 embeddings done. Time spend: 47.97220587730408
... +200 embeddings done. Time spend: 68.89623188972473
... +200 embeddings done. Time spend: 93.31350135803223
... +200 embeddings done. Time spend: 64.3081693649292
... +200 embeddings done. Time spend: 133.89897298812866
... +200 embeddings done. Time spend: 99.9266312122345
... +200 embeddings done. Time spend: 120.8892412185669
... +200 embeddings done. Time spend: 85.44681406021118
... +200 embeddings done. Time spend: 71.72239995002747
... +200 embeddings done. Time spend: 63.632261514663696
... +200 embeddings done. Time spend: 62.7821900844574


In [32]:
all_article_embeddings.size()
all_article_embeddings2 = all_article_embeddings.clone()
for embedding in all_article_embeddings2:
    print(embedding[1])
    break


tensor(0.6551)


In [105]:
articles["Embedding"] = list(all_article_embeddings2.numpy())

In [106]:
for i in range(articles.shape[0]):
    articles["Embedding"][i] = str(list(articles["Embedding"][i]))

In [107]:
articles.head()

Unnamed: 0,Titles,Content,Embedding
0,Super_Bowl_50,Super Bowl 50 was an American football game to...,"[-0.18496414, 0.6551217, 0.20338096, -0.459793..."
1,Super_Bowl_50,The Panthers finished the regular season with ...,"[0.1856204, 0.48492086, -0.09557928, -0.699036..."
2,Super_Bowl_50,The Broncos took an early lead in Super Bowl 5...,"[-0.42100757, 0.6728958, 0.20258255, -0.928931..."
3,Super_Bowl_50,"CBS broadcast Super Bowl 50 in the U.S., and c...","[-0.03190776, 0.06583987, 0.24414025, -1.04467..."
4,Super_Bowl_50,"In early 2012, NFL Commissioner Roger Goodell ...","[-0.33727553, -0.006797987, -0.066093445, -1.0..."


In [108]:
articles.to_csv("data/articles.csv", sep="|")

In [66]:
print(len(q_train), len(q_test))

8456 2114


### Embeddings for TRAINING Set Questions

In [5]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')
model.eval()

print("Started tokenizing...")
all_q_train_embeddings = torch.randn(0)
start = time()

# Tokenize sentences
for i in range(q_train.shape[0] // 100 + 1):
    #print(i)
    encoded_input = tokenizer(list(q_train[i*100:min(i*100+100, q_train.shape[0])]), padding=True, truncation=True, return_tensors='pt')
    # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
    # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, cls pooling.
        q_train_embeddings = model_output[0][:, 0]
    # normalize embeddings
    q_train_embeddings = torch.nn.functional.normalize(q_train_embeddings, p=2, dim=1)
    all_q_train_embeddings = torch.cat((all_q_train_embeddings, q_train_embeddings), dim=0)
    
    if i % 5 == 0: 
        print("... +500 embeddings done. Time spend:", time() - start)
        start = time()
    #print("Sentence embeddings:", sentence_embeddings)
    #print(type(sentence_embeddings))



Started tokenizing...
... +500 embeddings done. Time spend: 1.7777063846588135
... +500 embeddings done. Time spend: 7.068681240081787
... +500 embeddings done. Time spend: 5.65240740776062
... +500 embeddings done. Time spend: 6.814623117446899
... +500 embeddings done. Time spend: 5.803053617477417
... +500 embeddings done. Time spend: 5.734717130661011
... +500 embeddings done. Time spend: 6.711751937866211
... +500 embeddings done. Time spend: 8.09401249885559
... +500 embeddings done. Time spend: 6.782918691635132
... +500 embeddings done. Time spend: 6.876634120941162
... +500 embeddings done. Time spend: 7.458440542221069
... +500 embeddings done. Time spend: 7.199315309524536
... +500 embeddings done. Time spend: 6.820048809051514
... +500 embeddings done. Time spend: 5.352373361587524
... +500 embeddings done. Time spend: 5.579969167709351
... +500 embeddings done. Time spend: 7.080826997756958
... +500 embeddings done. Time spend: 6.649754047393799


In [6]:
all_q_train_embeddings2 = all_q_train_embeddings.clone()

In [7]:
q_train_df = pd.DataFrame(data=[q_train, all_q_train_embeddings2]).T
q_train_df.columns = ["Question", "Embedding"]

In [15]:
for i in range(q_train_df.shape[0]):
    q_train_df["Embedding"][i] = str(list(q_train_df["Embedding"][i].numpy()))

In [16]:
q_train_df.head()

Unnamed: 0,Question,Embedding
0,Which NFL team represented the AFC at Super Bo...,"[-0.018374031, -0.02516927, 0.008701543, -0.08..."
1,Which NFL team represented the NFC at Super Bo...,"[-0.0065157767, -0.01013894, -0.016297542, -0...."
2,Where did Super Bowl 50 take place?,"[-0.017278872, -0.0012698264, 0.00483831, -0.1..."
3,Which NFL team won Super Bowl 50?,"[-0.0070417165, 0.008862875, 0.03743299, -0.08..."
4,What color was used to emphasize the 50th anni...,"[-0.0697444, 0.030223556, 0.0025628584, -0.076..."


In [17]:
q_train_df.to_csv("data/q_train_df.csv", sep="|", index=False)

### Embeddings for TEST Set Questions

In [18]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5')
model.eval()

print("Started tokenizing...")
all_q_test_embeddings = torch.randn(0)
start = time()

# Tokenize sentences
for i in range(q_test.shape[0] // 100 + 1):
    #print(i)
    encoded_input = tokenizer(list(q_test[i*100:min(i*100+100, q_test.shape[0])]), padding=True, truncation=True, return_tensors='pt')
    # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
    # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        # Perform pooling. In this case, cls pooling.
        q_test_embeddings = model_output[0][:, 0]
    # normalize embeddings
    q_test_embeddings = torch.nn.functional.normalize(q_test_embeddings, p=2, dim=1)
    all_q_test_embeddings = torch.cat((all_q_test_embeddings, q_test_embeddings), dim=0)
    
    if i % 5 == 0:
        print("... +500 embeddings done. Time spend:", time() - start)
        start = time()
    #print("Sentence embeddings:", sentence_embeddings)
    #print(type(sentence_embeddings))



Started tokenizing...
... +500 embeddings done. Time spend: 1.3173024654388428
... +500 embeddings done. Time spend: 6.843916177749634
... +500 embeddings done. Time spend: 6.475721597671509
... +500 embeddings done. Time spend: 6.59215784072876
... +500 embeddings done. Time spend: 6.543044090270996


In [27]:
all_q_test_embeddings2 = all_q_test_embeddings.clone()

In [30]:
q_test_df = pd.DataFrame(data=[q_test, all_q_test_embeddings2]).T
q_test_df.columns = ["Question", "Embedding"]

In [31]:
for i in range(q_test_df.shape[0]):
    q_test_df["Embedding"][i] = str(list(q_test_df["Embedding"][i].numpy()))

In [32]:
q_test_df.to_csv("data/q_test_df.csv", sep="|", index=False)
q_test_df.head()

Unnamed: 0,Question,Embedding
0,Who became king in 1643?,"[-0.043172576, 0.06543647, 0.053647112, 0.0173..."
1,What was the first team Peyton Manning began p...,"[0.0154337315, 0.024577715, -0.019312672, -0.0..."
2,What was East and Central Africa's economy boo...,"[-0.096133806, 0.05678508, 0.08325848, 0.03994..."
3,How many buildings were razed by the Jacksonvi...,"[0.011810559, 0.02840623, 0.024501665, 0.00578..."
4,What year was the first manned flight with the...,"[-0.0056205806, 0.04971382, 0.030576339, -0.01..."


In [33]:
q_test_df["Embedding"][0]

'[-0.043172576, 0.06543647, 0.053647112, 0.017358074, -0.045584247, -0.036809586, 0.05786139, -0.02233542, 0.024832828, 0.06868153, -0.04989235, -0.081219316, 0.008540585, 0.017319588, 0.016747413, 0.008157038, -0.0729912, 0.0018113824, -0.08499822, 0.010650749, -0.023922112, -0.011038568, -0.019761417, -0.00036382923, -0.021959381, 0.03473124, -0.01353981, -0.0036404445, -0.0379686, -0.14944267, 0.010403878, -0.053879917, -0.0071740216, -0.027206777, 0.017003475, -0.03412664, -0.037623383, 0.09063295, 0.0030004436, 0.0020091769, 0.020919684, -0.009820057, 0.007557644, -0.021220984, 0.030739235, 0.00034525324, 0.044423718, 0.018096896, 0.06996212, 0.014564461, -0.039175212, 0.00954258, 0.029441247, -0.020658072, 0.04527804, 0.0014772442, 0.047443144, 0.043305106, 0.14558265, -0.027107194, 0.027091546, 0.046264607, -0.2052702, 0.05143712, -0.058162436, 0.029946428, 0.010979159, -0.03875284, -0.044451628, 0.028309936, -0.036944523, -0.02037264, 0.0104134735, -0.004894026, -0.032228902, 0

### Design Database

In [73]:
%pip install --upgrade --quiet  pgvector
%pip install --upgrade --quiet  langchain-openai
%pip install --upgrade --quiet  psycopg2-binary
%pip install --upgrade --quiet  tiktoken

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [74]:
# this is to import all necessary modules
import requests
import matplotlib.pyplot as plt
import openai, json, tiktoken, psycopg2, ast, pgvector, math, os, random

from langchain.vectorstores.pgvector import PGVector
from psycopg2.extras import execute_values
from sqlalchemy import create_engine
from time import time
#from pgvector.psycopg2 import register_vector
#%load_ext sql

In [94]:
conn_string = "host='localhost' dbname='imdb_top' user='postgres' password='test'"
conn = psycopg2.connect(conn_string)
cur = conn.cursor()

In [118]:
cur = conn.cursor()
cur.execute("ROLLBACK")
conn.commit()

In [119]:
query = """
    DROP TABLE IF EXISTS squad_articles;

    CREATE TABLE IF NOT EXISTS squad_articles (
        title_ID INTEGER,
        titles VARCHAR(200), 
        content TEXT,
        embedding VECTOR(384)
    );

    COPY squad_articles
    FROM '/articles.csv' 
    DELIMITER '|' CSV;
"""

In [120]:
cur.execute(query)
conn.commit()

In [88]:
for i in range(articles.shape[0]):
    if type(articles["Embedding"][i]) != type(np.array([])):
        print(type(articles["Embedding"][i]))
#print(articles.shape[0])

2067


In [89]:
articles

Unnamed: 0,Titles,Content,Embedding
0,Super_Bowl_50,Super Bowl 50 was an American football game to...,"[-0.18496414, 0.6551217, 0.20338096, -0.459793..."
1,Super_Bowl_50,The Panthers finished the regular season with ...,"[0.1856204, 0.48492086, -0.09557928, -0.699036..."
2,Super_Bowl_50,The Broncos took an early lead in Super Bowl 5...,"[-0.42100757, 0.6728958, 0.20258255, -0.928931..."
3,Super_Bowl_50,"CBS broadcast Super Bowl 50 in the U.S., and c...","[-0.03190776, 0.06583987, 0.24414025, -1.04467..."
4,Super_Bowl_50,"In early 2012, NFL Commissioner Roger Goodell ...","[-0.33727553, -0.006797987, -0.066093445, -1.0..."
...,...,...,...
2062,Force,"where is the mass of the object, is the velo...","[-0.3365962, -0.46805736, -0.02764538, 0.24325..."
2063,Force,A conservative force that acts on a closed sys...,"[-0.6067321, -0.30440584, 0.033170737, 0.32737..."
2064,Force,"For certain physical scenarios, it is impossib...","[-0.7290247, 0.0688544, -0.105723664, 0.220050..."
2065,Force,The connection between macroscopic nonconserva...,"[-0.366291, -0.14309838, -0.045113474, -0.0462..."
