<a href="https://colab.research.google.com/github/ernanhughes/ollama-notes/blob/main/notebooks/rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pymupdf

def extract_text_from_pdf(pdf_path):
    text = ''
    with pymupdf.open(pdf_path) as pdf:
        for page in pdf:
            text += page.get_text()
    return text

In [2]:
text = extract_text_from_pdf("2005.11401v4.pdf")
print(len(text))


69078


In [3]:
def chunk_text(text, chunk_size, overlap=0):
  """
  Splits text into chunks of specified length, ensuring words are not split.

  Args:
    text: The text to split.
    chunk_size: The desired size of each chunk.
    overlap: The number of characters to overlap between chunks. Defaults to 0.

  Returns:
    A list of text chunks.
  """

  chunks = []
  start = 0
  while start < len(text):
    # Find the next word boundary within the chunk size
    end = start + chunk_size
    if end >= len(text):
      end = len(text)
    else:
      while end > start and text[end] != " " and end > start + overlap:
        end -= 1

    chunks.append(text[start:end])
    start = end + overlap

  return chunks


In [4]:
chunks = chunk_text(text, chunk_size=1024, overlap=128)
ids = []
for i, chunk in enumerate(chunks):
    ids.append(f"chunk_{i}")

print(len(chunks))


61


In [5]:
import requests
import json
OLLAMA_EMBEDDING_MODEL="mxbai-embed-large"
OLLAMA_BASE_URL='http://127.0.0.1:11434'

def generate_embeddings(text, model_name: str = OLLAMA_EMBEDDING_MODEL,
                        base_url: str = OLLAMA_BASE_URL):
    """Generate embeddings for the given text using the specified model."""
    try:
        url = f"{base_url}/api/embeddings"
        data = {
            "prompt": text,
            "model": model_name
        }
        response = requests.post(url, json=data)
        if response.status_code == 200:
            return response.json()["embedding"]
        else:
            print(f"Failed to generate embeddings. Status code: {response.status_code}")
            print("Response:", response.text)
            return None
    except requests.ConnectionError:
        print("Failed to connect to the Ollama server. Make sure it is running locally and the URL is correct.")
        return None
    except json.JSONDecodeError:
        print("Failed to parse JSON response from Ollama server.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [6]:
from sqlite3 import connect
import sqlite_vec
db_file = "rag.db"
cn =connect(db_file)
cur = cn.cursor()
cn.enable_load_extension(True)
sqlite_vec.load(cn)
cn.enable_load_extension(False)
ver = cur.execute("select vec_version()").fetchone()
ver


('v0.1.3',)

In [7]:

embeddings_len = 1024
print(f"Length of embeddings: {embeddings_len}")
create_table = f"""
DROP TABLE IF EXISTS DOCUMENT_FTS;
CREATE VIRTUAL TABLE DOCUMENT_FTS USING fts5(id UNINDEXED, content, tokenize="porter unicode61");

DROP TABLE IF EXISTS DOCUMENT_VECTOR;
CREATE VIRTUAL TABLE DOCUMENT_VECTOR 
USING vec0(id INTEGER PRIMARY KEY, embedding float[{embeddings_len}]);

DROP TABLE IF EXISTS DOCUMENT_LOOKUP;
CREATE TABLE DOCUMENT_LOOKUP (id INTEGER PRIMARY KEY, content TEXT);
"""
cur.executescript(create_table)
cn.commit()


Length of embeddings: 1024


In [8]:
import numpy as np
def serialize_f32(vec):
    return np.array(vec, dtype=np.float32).tobytes()

In [11]:
from tqdm import tqdm

cur.execute("DELETE FROM DOCUMENT_FTS;")
cur.execute("DELETE FROM DOCUMENT_VECTOR;")
cur.execute("DELETE FROM DOCUMENT_LOOKUP;")

i = 0
for chunk in tqdm(chunks):
    i += 1
    embedding = generate_embeddings(chunk)
    cur.execute("INSERT INTO DOCUMENT_FTS(id, content) VALUES (?, ?)", (i, chunk))
    cur.execute("INSERT INTO DOCUMENT_VECTOR(id, embedding) VALUES (?, ?)", (i, serialize_f32(embedding)))
    cur.execute("INSERT INTO DOCUMENT_LOOKUP(id, content) VALUES (?, ?)", (i, chunk))
cn.commit()


100%|██████████| 61/61 [00:02<00:00, 21.77it/s]


In [14]:
def reciprocal_rank_fusion(fts_results, vec_results, k=60):  
    rank_dict = {}  
  
    # Process FTS results  
    for rank, (id,) in enumerate(fts_results):  
        if id not in rank_dict:  
            rank_dict[id] = 0  
        rank_dict[id] += 1 / (k + rank + 1)  
  
    # Process vector results  
    for rank, (rowid, distance) in enumerate(vec_results):  
        if rowid not in rank_dict:  
            rank_dict[rowid] = 0  
        rank_dict[rowid] += 1 / (k + rank + 1)  
  
    # Sort by RRF score  
    sorted_results = sorted(rank_dict.items(), key=lambda x: x[1], reverse=True)  
    return sorted_results 
  
def or_words(input_string):  
    # Split the input string into words  
    words = input_string.split()  
      
    # Join the words with ' OR ' in between  
    result = ' OR '.join(words)  
      
    return result

def lookup_row(id):
    row_lookup = cur.execute('''  
    SELECT content FROM DOCUMENT_LOOKUP WHERE id = ?
    ''', (id,)).fetchall()  
    content = ''
    for row in row_lookup:
        content= row[0]
        break
    return content

In [None]:
import pandas as pd
fts_search_query = "RAG" 
top_k = 5 

fts_results = cur.execute('''  
  SELECT id FROM DOCUMENT_FTS WHERE DOCUMENT_FTS MATCH ? 
  ORDER BY rank limit 5  
''', (or_words(fts_search_query),)).fetchall()  
  
# Vector search query  
query_embedding = generate_embeddings(fts_search_query)  
vec_results = cur.execute('''  
    SELECT id, distance FROM DOCUMENT_VECTOR 
    WHERE embedding MATCH ? and K = ?  
    ORDER BY distance  
''', [serialize_f32(query_embedding), top_k]).fetchall()  
  
# Combine results using RRF  
combined_results = reciprocal_rank_fusion(fts_results, vec_results)  
  
df = pd.DataFrame(combined_results, columns=['id', 'score'])
df
# Print combined results  
# for id, score in combined_results:  
#     print(f'| RRF Score: {score} |ID: {id}| Content: {lookup_row(id)}')  


| RRF Score: 0.032266458495966696 |ID: 23| Content: otland is Pound sterling.
RAG-T Pound is the currency needed in Scotland.
RAG-S The currency needed in Scotland is the pound sterling.
Jeopardy
Question
Gener
-ation
Washington
BART
?This state has the largest number of counties in the U.S.
RAG-T It’s the only U.S. state named for a U.S. president
RAG-S It’s the state where you’ll ﬁnd Mount Rainier National Park
The Divine
Comedy
BART
*This epic poem by Dante is divided into 3 parts: the Inferno, the Purgatorio & the Purgatorio
RAG-T Dante’s "Inferno" is the ﬁrst part of this epic poem
RAG-S This 14th century work is divided into 3 sections: "Inferno", "Purgatorio" & "Paradiso"
For 2-way classiﬁcation, we compare against Thorne and Vlachos [57], who train RoBERTa [35]
to classify the claim as true or false given the gold evidence sentence. RAG achieves an accuracy
within 2.7% of this model, despite being supplied with only the claim and retrieving its own evidence.
We also analyze whe