### Before you start:

To run this code, make sure you have installed the libraries listed in the requirements.txt file in the same directory as this notebook.

I recommend using a virtual environment to run this code.

Also, make sure the Docker container is running and the servers are bootstrapped.

Let's get started!

In [1]:
import re
import polars as pl
from glob import glob
import requests
import os

In [2]:
# Constants
PARAGRAPH_SEP = "\n\n" # Paragraph separator

# Base URLs of the two model servers: the "GPU" one is called on /api/chat,
# the "CPU" one on /api/embeddings.
GPU_SERVER_URL = 'http://localhost:11434'
CPU_SERVER_URL = 'http://localhost:11435'

# Model names sent in the request payloads.
EMBEDDING_MODEL = "jina/jina-embeddings-v2-small-en"
MAIN_LLM = "llama3"

# Path of the Delta table that stores the chunk embeddings.
DELTA_LOCATION = "data/vector_store_delta"
# NOTE(review): not referenced anywhere in the visible code - confirm intent.
CLEAN_DATA_FOLDER = True
# Sent to the chat endpoint as the `num_ctx` option and used as the threshold
# for the prompt-size warning.
N_CTX = 8192

# Define functions to interact with the models
def get_embeddings(text):
    """Return the embedding vector of ``text`` from the embeddings server.

    Posts ``text`` as the prompt to ``{CPU_SERVER_URL}/api/embeddings`` using
    ``EMBEDDING_MODEL`` and returns the ``embedding`` field of the JSON reply.

    Raises:
        requests.HTTPError: if the server replies with an error status.
    """
    response = requests.post(
        f'{CPU_SERVER_URL}/api/embeddings',
        json={"model": EMBEDDING_MODEL, "prompt": text},
    )
    # Surface the HTTP failure itself rather than a KeyError on the
    # missing 'embedding' field of an error payload.
    response.raise_for_status()
    return response.json()['embedding']

def get_model_response(history, temperature=0.1):
    """Send a chat history to the LLM server and return its reply.

    Args:
        history: list of ``{"role": ..., "content": ...}`` messages.
        temperature: sampling temperature forwarded in the request options.

    Returns:
        A tuple ``(content, llm_token_count, prompt_token_count)`` where the
        two counts are the ``eval_count`` and ``prompt_eval_count`` fields of
        the reply, or ``None`` when the server did not include them.

    Raises:
        requests.HTTPError: if the server replies with an error status.
    """
    response = requests.post(f'{GPU_SERVER_URL}/api/chat', json={
        # Use the shared constant so the model is configured in one place.
        "model": MAIN_LLM,
        "messages": history,
        "options": {
            "temperature": temperature,
            "num_ctx": N_CTX,
        },
        "stream": False
    })
    response.raise_for_status()
    # Parse the body once and reuse it for every field.
    response_json = response.json()
    llm_token_count = response_json.get('eval_count')
    prompt_token_count = response_json.get('prompt_eval_count')
    content = response_json.get('message').get('content')
    return content, llm_token_count, prompt_token_count

In [3]:
import re

def approx_num_tokens_from_string(string: str) -> int:
    """Approximate the token count as the number of space-separated pieces.

    Only the single space character separates pieces, so consecutive spaces
    produce empty pieces that are counted too, and an empty string counts
    as one piece.
    """
    # Splitting on " " always yields exactly one more piece than there are spaces.
    return string.count(" ") + 1

def approx_sentense_split(text, chunk_target_size=1000, overlap=20, min_chunk_size=200, paragraph_sep='\n\n', chunking_regex=r'[^,\.;]+[,\.;]?', word_sep=' ') -> list[str]:
    """Split ``text`` into chunks of roughly ``chunk_target_size`` tokens.

    The text is first split into paragraphs on ``paragraph_sep``; each
    paragraph is cut into fragments with ``chunking_regex`` and the fragments
    are grouped greedily into chunks. Chunks never span two paragraphs at this
    stage. Afterwards, chunks smaller than ``min_chunk_size`` are merged with
    their neighbours.

    Sizes are measured with ``approx_num_tokens_from_string``.

    Args:
        text: the text to split.
        chunk_target_size: maximum approximate token count when adding a
            fragment to the current chunk.
        overlap: number of trailing *fragments* (regex matches, not tokens)
            of a finished chunk that are repeated at the start of the next
            chunk of the same paragraph. ``0`` (or a negative value) disables
            the overlap.
        min_chunk_size: chunks below this approximate token count are merged.
        paragraph_sep: separator used to split ``text`` into paragraphs.
        chunking_regex: pattern passed to ``re.findall`` to cut a paragraph
            into fragments.
        word_sep: string used to join fragments and merged chunks.

    Returns:
        The list of chunk strings, in document order.
    """
    # Step 1: split by paragraph, Step 2: split each paragraph into fragments
    # and group the fragments into chunks.
    chunks = []

    for paragraph in text.split(paragraph_sep):
        current_chunk = []
        current_chunk_size = 0

        for match in re.findall(chunking_regex, paragraph):
            num_tokens = approx_num_tokens_from_string(match)
            if current_chunk_size + num_tokens <= chunk_target_size:
                current_chunk.append(match)
                current_chunk_size += num_tokens
                continue

            # The fragment does not fit: finalize the current chunk.
            if current_chunk:
                chunks.append(word_sep.join(current_chunk))

            # Start a new chunk with the overlap from the end of the previous
            # one. `current_chunk[-0:]` would be the WHOLE list, so overlap=0
            # must be handled explicitly; a slice with overlap larger than the
            # list already yields the whole list.
            overlap_chunks = current_chunk[-overlap:] if overlap > 0 else []
            current_chunk = overlap_chunks + [match]
            current_chunk_size = sum(approx_num_tokens_from_string(c) for c in current_chunk)

        # Add the last chunk of the paragraph if not empty
        if current_chunk:
            chunks.append(word_sep.join(current_chunk))

    # Step 3: ensure no chunks are too small by accumulating consecutive small
    # chunks in a buffer until the buffer reaches min_chunk_size.
    merged_chunks = []
    buffer_chunk = ''

    for chunk in chunks:
        if approx_num_tokens_from_string(chunk) < min_chunk_size:
            if buffer_chunk:
                buffer_chunk += word_sep + chunk
            else:
                buffer_chunk = chunk

            if approx_num_tokens_from_string(buffer_chunk) >= min_chunk_size:
                merged_chunks.append(buffer_chunk)
                buffer_chunk = ''
        else:
            # A large chunk flushes whatever small text is pending before it,
            # so document order is kept.
            if buffer_chunk:
                merged_chunks.append(buffer_chunk)
                buffer_chunk = ''
            merged_chunks.append(chunk)

    # Leftover small text is glued to the last chunk (or becomes the only one).
    if buffer_chunk:
        if merged_chunks:
            merged_chunks[-1] += word_sep + buffer_chunk
        else:
            merged_chunks.append(buffer_chunk)

    return merged_chunks


In [4]:
# Target chunk size handed to approx_sentense_split when indexing files.
CHUNK_SIZE = 1000

def save_vector_to_table(vectors, delta_path):
    """Append the rows in ``vectors`` to the Delta table at ``delta_path``.

    ``vectors`` is turned into a polars DataFrame and written in append mode.
    """
    row_count = len(vectors)
    print(f"Appending {row_count} rows to the delta table")
    pl.DataFrame(vectors).write_delta(delta_path, mode="append")
    
def build_vector_dataframe(file_root_path, delta_path):
    """Embed every ``*.txt`` file under ``file_root_path`` into the Delta table.

    Each file is read, lightly normalized, split into chunks with
    ``approx_sentense_split`` and every chunk is embedded with
    ``get_embeddings``. One row per chunk (``uri``, ``text``, ``vector``,
    ``chunk_id``) is appended to the table at ``delta_path``, one append per
    file. Files that yield no chunks (e.g. empty files) are skipped.
    """
    for file in glob(f"{file_root_path}/*.txt"):
        print(f"Processing file {file}")
        # NOTE(review): the file is read with the platform default encoding -
        # confirm whether an explicit encoding should be used.
        with open(file, 'r') as f:
            text = f.read()
        # Clean the text a bit: collapse runs of spaces and runs of blank lines
        text = re.sub(r" +", " ", text)
        text = re.sub(r"\n\n+", PARAGRAPH_SEP, text)
        # Split the text into chunks
        chunks = approx_sentense_split(text, CHUNK_SIZE)
        # The vector fetch could be improved to be done in parallel and using batching
        vector_data = [
            {'uri': file, 'text': chunk, 'vector': get_embeddings(chunk), 'chunk_id': i}
            for i, chunk in enumerate(chunks)
        ]
        if not vector_data:
            # Nothing to store; avoid appending an empty, schema-less frame.
            print(f"No chunks produced for {file}, skipping")
            continue
        # Append the vectors of this file to the delta table
        save_vector_to_table(vector_data, delta_path)

# Make sure the top-level folder of DELTA_LOCATION exists
data_folder, _, _ = DELTA_LOCATION.partition("/")
data_folder_missing = not os.path.exists(data_folder)
if data_folder_missing:
    os.makedirs(data_folder)

# Only index the example files when the delta table is not there yet
delta_table_missing = not os.path.exists(DELTA_LOCATION)
if delta_table_missing:
    build_vector_dataframe("files_examples", DELTA_LOCATION)

In [5]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two vectors.

    The vectors are paired element-wise with ``zip``. When either vector has
    zero magnitude the similarity is undefined; ``0.0`` is returned instead of
    raising ``ZeroDivisionError``.
    """
    dot_product = sum(a * b for a, b in zip(v1, v2))
    magnitude = (sum(a ** 2 for a in v1) * sum(b ** 2 for b in v2)) ** 0.5
    if magnitude == 0:
        return 0.0
    return dot_product / magnitude

def change_orientation_dict(dict_data):
    """Turn a column-oriented dict into a list of row dicts.

    ``{"a": [1, 2], "b": [3, 4]}`` becomes
    ``[{"a": 1, "b": 3}, {"a": 2, "b": 4}]``. An empty dict (or empty columns)
    yields an empty list. Columns are expected to have the same length; rows
    are produced up to the shortest column.
    """
    keys = list(dict_data)
    return [dict(zip(keys, row)) for row in zip(*dict_data.values())]

def query_vector_emb(query, delta_location, top_k=2, n_neighboors=1, min_similarity=0.75):
    """Retrieve the chunks most similar to ``query`` plus their neighbours.

    The query is embedded with ``get_embeddings`` and compared, via
    ``cosine_similarity``, against the ``vector`` column of the Delta table at
    ``delta_location``. The ``top_k`` rows whose similarity is above
    ``min_similarity`` are kept as "center" chunks, and for each of them the
    chunks of the same ``uri`` whose ``chunk_id`` lies within ``n_neighboors``
    of the center are fetched as well.

    The table is expected to have the columns ``uri``, ``text``, ``vector``
    and ``chunk_id``.

    Returns:
        A tuple ``(context, detail)`` of lists of dicts:
        - ``context``: one entry per ``uri`` with keys ``uri`` and ``text``
          (the selected chunk texts of that uri concatenated).
        - ``detail``: one entry per (center chunk, selected chunk) pair with
          keys ``uri``, ``center_chunk_id``, ``chunk_id``, ``text`` and
          ``similarity``. ``similarity`` is taken from the center-chunk side
          of the join, so neighbour rows carry their center's score.
    """
    # Lazy scan: nothing is read until .collect() is called at the end.
    df_emb = pl.scan_delta(delta_location)
    query_embedding = get_embeddings(query)
    df_q = (
        df_emb
            # Attach the query embedding to every row so both vectors can be
            # passed to the Python similarity function together.
            .with_columns(pl.lit(query_embedding).alias("query_embedding"))
            .with_columns(
                pl.struct(["query_embedding", "vector"])
                .map_elements(
                    lambda cols: cosine_similarity(cols["query_embedding"], cols["vector"]),
                    return_dtype=float
                )
                .alias("similarity")
            )
            .filter(pl.col("similarity") > min_similarity)
    )

    # Keep only the top_k best hits; these are the "center" chunks.
    df_q_results = (
        df_q.sort("similarity", descending=True).select(
            pl.col("uri"),
            pl.col("chunk_id").alias("similar_chunk_id"),
            pl.col("similarity").alias("similarity"),
        )
        .limit(top_k)
    )
    
    # Join the hits back to the full table on uri and keep the chunks that are
    # within n_neighboors of each hit. `&` binds tighter than `|`, so the
    # filter reads: equal OR (lower bound AND upper bound).
    df_q_results = (
        df_q_results
        .join(
            df_emb,
            on="uri"
        ).filter(
            (pl.col("chunk_id") == pl.col("similar_chunk_id")) |
            (pl.col("chunk_id") >= pl.col("similar_chunk_id") - n_neighboors) &
            (pl.col("chunk_id") <= pl.col("similar_chunk_id") + n_neighboors)
        ).select(
            pl.col("uri"),
            pl.col("text"),
            pl.col("similar_chunk_id").alias("center_chunk_id"),
            pl.col("chunk_id"),
            pl.col("vector"),
            pl.col("similarity")
        )
    )

    # Retain the original scores, this can be used later for the UI
    df_q_scores = df_q_results.select(
        pl.col("uri"),
        pl.col("center_chunk_id"),
        pl.col("chunk_id"),
        pl.col("text"),
        pl.col("similarity")
    )

    # Now group by the uri and get the text in the correct order.
    # .unique() drops chunks selected by more than one center chunk.
    df_q_context = df_q_results.select(
        pl.col("uri"),
        pl.col("chunk_id"),
        pl.col("text")
    ).unique().sort(["uri", "chunk_id"], descending=False)
    # Chunk texts of the same uri are concatenated with no separator.
    df_q_context = df_q_context.group_by(["uri"], maintain_order=True).agg(
        pl.col("text").str.concat("")
    )
    
    # Execute both lazy queries and convert to column-oriented dictionaries
    dict_context = df_q_context.collect().to_dict(as_series=False)
    dict_detail = df_q_scores.collect().to_dict(as_series=False)
    
    # Re-orient to one dict per row before returning
    return change_orientation_dict(dict_context), change_orientation_dict(dict_detail)



In [6]:
def make_keywords(query):
    """Ask the LLM to turn ``query`` into a list of search keywords.

    The model is primed with a system prompt and two example exchanges, then
    given ``query``. Its comma-separated reply is split into keywords, which
    are stripped of surrounding whitespace; blank entries (for example from a
    trailing comma) are dropped.

    Returns:
        The list of keyword strings, in the order the model produced them.
    """
    # The prompt text is kept byte-for-byte (including its inner indentation)
    # so the request sent to the model is unchanged.
    SYSTEM_PROMPT = """You are a helpful assistant that has access to pre-fetched query results.
  Your job is to transform the user query into keywords separated by commas.

  To ensure a good job follow the instructions:
      1. Do not include any stopwords in the keywords.
      2. Make sure to include the most important keywords first.
      3. Do not explain the keywords, just list them.
  """
    history = [
        {"role": "system", "content": SYSTEM_PROMPT},
        # Few-shot examples showing the expected output format.
        {"role": "user", "content": "Why is the sky blue?"},
        {"role": "assistant", "content": "sky color explanation"},
        {"role": "user", "content": "How does GPU and CPU differ?"},
        {"role": "assistant", "content": "GPU, CPU, difference"},
        {"role": "user", "content": query},
    ]
    text, _, _ = get_model_response(history, 0.2)
    stripped = (part.strip() for part in text.split(","))
    return [keyword for keyword in stripped if keyword]

def get_response(query, delta_location):
    """Answer ``query`` with the LLM using context retrieved from the vector store.

    Steps: extract keywords from the query with ``make_keywords``, retrieve
    the matching chunks (and one neighbour on each side) with
    ``query_vector_emb``, build a prompt containing that context and ask the
    LLM for the final answer.

    Returns:
        A dict with the keys ``llm_response``, ``keywords_used``,
        ``search_details``, ``llm_token_count``, ``prompt_token_count`` and
        ``warn``. ``warn`` is ``True`` when the reported prompt token count is
        greater than ``N_CTX`` and ``False`` otherwise, including when no
        count was reported.
    """
    keywords = make_keywords(query)
    emb_similarity, detail = query_vector_emb(', '.join(keywords), delta_location, n_neighboors=1)

    SYSTEM_PROMPT = """You are an expert Q&A system that is trusted around the world.
    Always answer the query using the provided context information, and not prior knowledge.
    Some rules to follow:
    1. Never directly reference the given context in your answer.
    2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
    3. When answering based on the context, always provide references from the context information.
    4. Please format your answer in markdown."""
    USER_PROMPT = """Context information is below.
    ---------------------
    [CONTEXT]
    ---------------------
    Given the context information and NOT prior knowledge, answer the query.
    Query: [QUERY]
    """

    # Build the context section: one "URI / Text" entry per retrieved document
    final_context = "".join(
        f"URI: {sub_context['uri']}\nText: {sub_context['text']}\n\n"
        for sub_context in emb_similarity
    )
    USER_PROMPT = USER_PROMPT.replace("[CONTEXT]", final_context)
    USER_PROMPT = USER_PROMPT.replace("[QUERY]", query)
    history = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]
    response, llm_token_count, prompt_token_count = get_model_response(history, 0.7)
    # get_model_response reads the counts with .get(), so prompt_token_count
    # may be None; comparing None with an int would raise TypeError.
    prompt_too_large = prompt_token_count is not None and prompt_token_count > N_CTX
    return {
        'llm_response': response,
        'keywords_used': keywords,
        'search_details': detail,
        'llm_token_count': llm_token_count,
        'prompt_token_count': prompt_token_count,
        'warn': prompt_too_large,
    }

In [7]:
def print_response(response):
    """Print the LLM answer followed by search and token-usage summaries.

    ``response`` is the dict produced by ``get_response``. The search summary
    lists each distinct (uri, center chunk, similarity) combination found in
    ``search_details``.
    """
    print('--- LLM Response ---')
    print(response.get('llm_response'))

    print('--- SEARCH ENGINE SUMMARY ---')
    keywords = response.get('keywords_used')
    details = response.get('search_details')
    n_chunks = len(details)
    # A set collapses the neighbour rows that share the same center chunk.
    center_summaries = {
        f"uri: {d['uri']}, center_chunk: {d['center_chunk_id']}, similarity: {d['similarity']}"
        for d in details
    }
    print('Keywords used:', keywords)
    print('Chunks used:', n_chunks, 'over the context of:')
    for summary in center_summaries:
        print('\t', summary)

    print('--- LLM USAGE SUMMARY ---')
    print('Prompt token count:', response.get('prompt_token_count'))
    print('LLM token count:', response.get('llm_token_count'))

In [8]:
# Example: ask a question and print the answer together with the retrieval
# and token-usage summaries.
query = 'I want to make Apple Pasta Salad, what are the ingredients?'
response = get_response(query, DELTA_LOCATION)
print_response(response)

--- LLM Response ---
**Apple Pasta Salad Recipe Ingredients**

According to the provided context information, the ingredients for the **Apple Pasta Salad Recipe** are:

1. 1 container (8 ounce) plain nonfat yogurt
2. 2 cups uncooked rotini pasta
3. 1 (8 ounce) can unsweetened crushed pineapple - undrained
4. 1/2 cup shredded carrot
5. 1 cup sliced celery
6. 1/2 teaspoon salt - optional
7. 1/4 cup sliced green onions
8. 1/4 teaspoon garlic powder
9. 1/4 cup raisins
10. 1/4 teaspoon dry mustard
11. 3 cups diced unpeeled York or Winesap apples
12. 1 teaspoon finely chopped crystallized ginger
13. 1 tablespoon honey

These ingredients are listed in the context information as part of the **Apple Pasta Salad Recipe**.
--- SEARCH ENGINE SUMMARY ---
Keywords used: ['Apple pasta salad', 'ingredients', 'recipe']
Chunks used: 6 over the context of:
	 uri: files_examples\Pasta.txt, center_chunk: 1, similarity: 0.8400889960039423
	 uri: files_examples\Pasta.txt, center_chunk: 6, similarity: 0.84232

In [9]:
# Example: a second question, answered and printed the same way.
query = 'Explain what is PADI and PADO and their differences and similarities.'
response = get_response(query, DELTA_LOCATION)
print_response(response)

--- LLM Response ---
**PPP Over Ethernet Discovery Process**

In the PPP Over Ethernet (PPPoE) protocol, two packets play a crucial role in initiating a PPP session: **PADI** (PPP Over Ethernet Active Discovery Initiation) and **PADO** (PPP Over Ethernet Active Discovery Offer).

### PADI (PPP Over Ethernet Active Discovery Initiation)

A Host sends a PADI packet to the broadcast address (`DESTINATION_ADDR` set to the broadcast address). The packet has:

* `CODE` field set to 0x09
* `SESSION_ID` set to 0x0000
* Exactly one TAG of type `Service-Name`, indicating the service requested by the Host

The PADI packet is used to initiate a PPP session, and it must contain exactly one TAG of type `Service-Name`.

### PADO (PPP Over Ethernet Active Discovery Offer)

When an Access Concentrator receives a PADI packet that it can serve, it replies with a PADO packet. The packet has:

* `DESTINATION_ADDR` set to the unicast address of the Host that sent the PADI
* `CODE` field set to 0x07
* `SESSI