In [1]:
import os
from basic_rag_utils.helper import get_openai_api_key, get_gemini_api_key
# Fetch the API keys through the project helper functions and keep them in module-level variables.
# NOTE(review): nothing in this cell writes to os.environ; the OpenAI client created later is built
# without an explicit key, so it presumably relies on the helper (or the shell) having exported
# OPENAI_API_KEY — confirm.
OPENAI_API_KEY = get_openai_api_key()
GEMINI_API_KEY = get_gemini_api_key()

In [2]:
# False: generate the embeddings for the dataset (calls the OpenAI endpoint, which has an associated cost).
# True: load the dataset file that already contains the embedding vectors.
load_embedding = False

# Load Dataset


## Download Dataset (CSV)

The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model.

In [3]:
#!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv
#!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles-with_embeddings.csv

## Read File

In [4]:
def split_into_chunks(text, chunk_size=1024):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: The sequence (normally a string) to slice.
        chunk_size: Maximum length of each piece. Defaults to 1024.

    Returns:
        A list of slices in their original order. Every slice has length
        ``chunk_size`` except possibly the last one; an empty ``text``
        yields an empty list.
    """
    return [
        text[start : start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

In [5]:
import csv

chunks = []

# Number of article (non-header) rows read. Initialised up front so the name
# exists even when the file contains no data rows.
idx = 0

# Load the file as a CSV.
# newline="" hands line-ending handling to the csv module, as its documentation
# requires, so newlines embedded inside quoted fields are parsed correctly.
with open("./mini-llama-articles.csv", mode="r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)

    # Skip the header row (a no-op if the file is empty).
    next(csv_reader, None)

    # Count from 1 so that, once the loop ends, `idx` is the number of articles.
    for idx, row in enumerate(csv_reader, start=1):
        # The second column is split into chunks (presumably the article text — verify against the CSV).
        chunks.extend(split_into_chunks(row[1]))

In [None]:
# `idx` is left over from the loop in the CSV-reading cell, where its final value
# is the number of article (non-header) rows that were read.
print("number of articles:", idx)
print("number of chunks:", len(chunks))

In [None]:
import pandas as pd

# Build a single-column ("chunk") Pandas DataFrame from the list of text chunks.
df = pd.DataFrame(chunks, columns=["chunk"])

# Show the column labels as the cell output.
df.keys()

In [8]:
from openai import OpenAI

# No API key is passed explicitly here.
# NOTE(review): the client presumably picks up OPENAI_API_KEY from the environment — confirm it is set.
client = OpenAI()


# Convert a text to an embedding vector using OpenAI's "text-embedding-3-small" model.
def get_embedding(text):
    """Return the embedding vector for ``text``, or ``None`` if it cannot be obtained.

    Args:
        text: The string to embed. Newlines are replaced with spaces before
            the request is sent.

    Returns:
        The ``embedding`` of the first item in the API response, or ``None``
        when any ``Exception`` is raised along the way (the error is printed
        so the failure is visible).

    Note:
        Only ``Exception`` subclasses are caught. ``KeyboardInterrupt`` and
        ``SystemExit`` propagate, so a kernel interrupt can stop a long
        embedding loop.
    """
    try:
        # Remove newlines
        text = text.replace("\n", " ")
        res = client.embeddings.create(input=[text], model="text-embedding-3-small")

        return res.data[0].embedding

    except Exception as exc:
        # Keep the best-effort contract (callers get None), but report the cause
        # instead of hiding it.
        print(f"get_embedding failed: {exc!r}")
        return None

In [None]:
import ast

import numpy as np
from tqdm.auto import tqdm

# Later cells read df["embedding"], so this cell has to run: it either generates
# the embeddings (load_embedding is False) or loads the precomputed file.
if not load_embedding:
    # Generate one embedding per chunk through the OpenAI endpoint (has an associated cost).
    print("Generating embeddings...")
    embeddings = [get_embedding(chunk) for chunk in tqdm(df["chunk"])]

    # Add the embedding column to the dataframe. A plain column assignment
    # (rather than df.insert) keeps the cell re-runnable: insert raises if the
    # column already exists.
    df["embedding"] = pd.Series(embeddings, index=df.index)

    # get_embedding returns None on failure; such rows cannot be scored, so
    # drop them and renumber the rows.
    failed_count = int(df["embedding"].isna().sum())
    if failed_count:
        print(f"Dropping {failed_count} chunk(s) whose embedding request failed.")
        df = df.dropna(subset=["embedding"]).reset_index(drop=True)

# Or, load the embedding from the file.
else:
    print("Loaded the embedding file.")
    # Load the file as a CSV
    df = pd.read_csv("mini-llama-articles-with_embeddings.csv")
    # Convert the embedding column from its string form to an array.
    # ast.literal_eval only parses Python literals, unlike eval, which would
    # execute arbitrary code found in the file.
    df["embedding"] = df["embedding"].apply(lambda x: np.array(ast.literal_eval(x)))

### User Question

In [None]:
# Define the user question and convert it to an embedding vector.
QUESTION = "How many parameters LLaMA2 model has?"
QUESTION_emb = get_embedding(QUESTION)

# Show the embedding length. get_embedding returns None when the request fails,
# in which case len() raises TypeError here.
len(QUESTION_emb)

### Test Cosine Similarity

Calculating the similarity of embedding representations helps us find pieces of text that are close to each other. The following sample shows how the cosine similarity metric can identify which sentence is a plausible answer to the given user question: the unrelated sentence should score lower than the relevant one.

In [None]:
# Embed one sentence unrelated to the question and one that addresses it,
# so their similarity to the question embedding can be compared.
BAD_SOURCE_emb = get_embedding("The sky is blue.")
GOOD_SOURCE_emb = get_embedding("LLaMA2 model has a total of 2B parameters.")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Example: a relevant piece of text should achieve a higher similarity score
# than a completely unrelated one.
# Each embedding is wrapped in a list because cosine_similarity works on 2D inputs.
print("> Bad Response Score:", cosine_similarity([QUESTION_emb], [BAD_SOURCE_emb]))
print("> Good Response Score:", cosine_similarity([QUESTION_emb], [GOOD_SOURCE_emb]))

### Calculate Cosine Similarities

In [None]:
# The similarity between the question and the embedding of every chunk.
# NOTE(review): df must already contain an "embedding" column at this point; the
# dataframe built from `chunks` alone only has a "chunk" column.
cosine_similarities = cosine_similarity([QUESTION_emb], df["embedding"].tolist())

print(cosine_similarities)

In [None]:
import numpy as np

number_of_chunks_to_retrieve = 3

# Index of the single highest score (argmax, not a sort); not used in the cells shown here.
highest_index = np.argmax(cosine_similarities)

# Pick the N highest scored chunks: argsort is ascending, so reverse it and keep the first N.
indices = np.argsort(cosine_similarities[0])[::-1][:number_of_chunks_to_retrieve]
print(indices)

In [None]:
# Look at the highest scored retrieved pieces of text.
# `indices` holds row positions (it comes from np.argsort), so the rows are
# selected positionally with .iloc instead of by index label.
# The loop variable is named `rank` so it does not overwrite `idx`, which holds
# the article count from the CSV-reading cell.
for rank, item in enumerate(df["chunk"].iloc[indices], start=1):
    print(f"> Chunk {rank}")
    print(item)
    print("----")