In [2]:
import json
from openai import OpenAI
from dotenv import load_dotenv
import os
import requests
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import re
import os
import json
import glob
import hashlib
import pandas as pd

In [2]:
# def clean_html(html_content):
#     soup = BeautifulSoup(html_content, 'html.parser')
#     return soup.get_text(separator=' ', strip=True)

In [3]:
# def clean_text(text):
#     # Remove extra whitespace
#     text = re.sub(r'\s+', ' ', text)
#     # Remove special characters
#     text = re.sub(r'[^\w\s]', '', text)
#     return text.strip().lower()

# Ingestion

In [4]:
def generate_document_id(doc):
    combined = f"{doc['title']}-{doc['url']}-{doc['html'][:50]}"
    hash_object = hashlib.md5(combined.encode())
    return hash_object.hexdigest()[:8]

In [5]:
# def chunk_text(text, chunk_size=1000):
#     return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]


def chunk_text(text, chunk_size=500, overlap_size=20):
    words = text.split()
    chunks = []
    start = 0
    text_length = len(words)
    while start < text_length:
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap_size
        if end >= text_length:
            break
    return chunks

In [6]:
# def clean_html_content(html):
#     # Remove the common header/navigation content
#     cleaned_text = re.sub(r'^.*?Powered by GitBook', '', html, flags=re.DOTALL)
    
#     # Remove any remaining navigation-like content at the end
#     cleaned_text = re.sub(r'Previous.*?Last updated.*?$', '', cleaned_text, flags=re.DOTALL)
    
#     # Remove extra whitespace and newlines
#     cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
#     return cleaned_text


import re

def clean_html_content(html):
    # Remove the common header/navigation content
    cleaned_text = re.sub(r'^.*?Powered by GitBook', '', html, flags=re.DOTALL)
    cleaned_text = re.sub(r'^.*?Terms of Service', '', html, flags=re.DOTALL)
    cleaned_text = re.sub(r'^.*?Disclaimer', '', html, flags=re.DOTALL)
    cleaned_text = re.sub(r'Previous.*?Last updated.*?$', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\*\*', '', cleaned_text) 
    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', cleaned_text)
    cleaned_text = re.sub(r'@\w+', '', cleaned_text) 
    cleaned_text = re.sub(r'<[^>]+>', '', cleaned_text)
    cleaned_text = re.sub(r'\\n', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text


In [7]:
def process_documents(documents):
    processed_docs = []
    irrelevant_titles = ['Terms of Use', 'Contact us', 'Disclaimer', 'Terms of Service']
    
    for doc in documents:
        # Skip irrelevant pages
        if any(title.lower() in doc['title'].lower() for title in irrelevant_titles):
            continue
        
        # Clean HTML content
        text = clean_html_content(doc['html'])
        title = clean_html_content(doc['title'])
        # Generate document ID
        doc_id = generate_document_id(doc)

        # processed_docs.append({
        #     'doc_id': doc_id,
        #     'chunk_id': doc_id,  # Use doc_id as chunk_id for consistency
        #     'text': text,
        #     'title': doc['title'],
        #     'url': doc['url'],
        #     'source': doc['source']
        # })
        
        # Chunk the text
        chunks = chunk_text(text)
        
        for i, chunk in enumerate(chunks):
            chunk_id = f"{doc_id}_{i}"
            processed_docs.append({
                'doc_id': doc_id,
                'chunk_id': chunk_id,
                'text': chunk,
                'title': title,
                'url': doc['url'],
                'source': doc['source']
            })
    
    return processed_docs

In [8]:
def load_documents(directory_path):
    documents = []
    json_files = glob.glob(os.path.join(directory_path, '*.json'))

    for file_path in tqdm(json_files, desc="Loading JSON files"):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='cp1252') as file:
                data = json.load(file)
        
        if isinstance(data, list):
            # Existing structure
            for doc in data:
                documents.append({
                    'html': doc.get('html', ''),
                    'title': doc.get('title', ''),
                    'url': doc.get('url', ''),
                    'source': f"json/{os.path.basename(file_path)}"
                })
        elif isinstance(data, dict) and 'messages' in data:
            # New structure with messages/content
            for message in data['messages']:
                content = message.get('content', '')
                if content:
                    documents.append({
                        'html': content,
                        'title': f"Message from {message.get('author', {}).get('name', 'Unknown')}",
                        'url': '',
                        'source': f"json/{os.path.basename(file_path)}"
                    })
        else:
            print(f"Unsupported JSON structure in file: {file_path}")
    
    return documents, json_files

current_directory = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'json'))

print(f"Using directory: {current_directory}")

# Load the JSON data
try:
    documents, json_files = load_documents(current_directory)
    print(f"Successfully loaded {len(documents)} documents from {len(json_files)} JSON files.")

    processed_documents = process_documents(documents)
    print(f"Created {len(processed_documents)} chunks from {len(documents)} documents.")
    
    sources = set(doc['source'] for doc in documents)
    print(f"Unique sources: {', '.join(sources)}")
except Exception as e:
    print(f"An error occurred while processing the documents: {e}")
    processed_documents = []

Using directory: C:\Users\dimi\Desktop\parthenon-rag\data\json


Loading JSON files:   0%|          | 0/44 [00:00<?, ?it/s]

Successfully loaded 561 documents from 44 JSON files.
Created 653 chunks from 561 documents.
Unique sources: json/routex-1.json, json/echelonmarket-1.json, json/Movement - ──── Movement Info - 🏛┃testnet-guide [1267717617339072553].json, json/Seekers Alliance - │faqs [1114042684370321449].json, json/brkt-brings-gamblefi-1.json, json/securing-smart-contracts-a-devs-guide-part-ii-1.json, json/movewiffrens-1.json, json/Satay 2.0 - ✧ Welcome ✧ - about-us [1258261708011470900].json, json/movement-sdk-unifying-the-blockchain-universe-2-1.json, json/binance-labs-backs-movement-labs-mission-1.json, json/Seekers Alliance - bounty-board [1114081267147882556].json, json/wikiinfiniteseas2-1.json, json/henrysocial-1.json, json/satayfinance-1.json, json/BRKT - BRKT TEAM - 📋-start-here [1258738649864994836].json, json/omnibtclabs-1.json, json/stablejack-1.json, json/Satay 2.0 - ✧ Welcome ✧ - start-here [1262316226873524254].json, json/MovementCommunityProgram-1.json, json/WarpGate_Official - galxe-que

**RAG flow**

In [4]:
# Load environment variables from .env file
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

client = OpenAI(api_key=api_key)

**Index and search**

In [10]:
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError

# Initialize Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')
print(es_client.info())


index_name = "movement-wiki"
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "chunk_id": {"type": "keyword"},
            "text": {"type": "text"},
            "title": {"type": "text"},
            "url": {"type": "keyword"},
            "source": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}
# Delete existing index if it exists
try:
    response = es_client.indices.delete(index=index_name)
    print(f"Deleted index: {index_name}")
except NotFoundError:
    print(f"Index {index_name} not found, nothing to delete")

# Create new index
es_client.indices.create(index=index_name, settings=index_settings['settings'], mappings=index_settings['mappings'])

print(f"Created index: {index_name}")

{'name': '1f8253bbe4fa', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'SmChPidnTz6tTnDxFHIgSQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
Deleted index: movement-wiki
Created index: movement-wiki


In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L12-v2")



In [12]:
from elasticsearch.helpers import bulk
from tqdm.auto import tqdm

def index_documents(documents, batch_size=500):
    actions = []
    for doc in tqdm(documents, desc="Indexing documents"):
        try:
            text_vector = model.encode(doc['text']).tolist()
        except Exception as e:
            print(f"Error encoding document {doc['doc_id']}: {e}")
            continue

        action = {
            '_op_type': 'index',
            '_index': index_name,
            '_source': {
                'doc_id': doc['doc_id'],
                'chunk_id': doc['chunk_id'],
                'text': doc['text'],
                'title': doc['title'],
                'url': doc['url'],
                'source': doc['source'],
                'text_vector': text_vector
            }
        }
        actions.append(action)

        if len(actions) >= batch_size:
            bulk(es_client, actions)
            actions = []  # Clear actions list after bulk index

    # Index remaining documents if any
    if actions:
        bulk(es_client, actions)

    print(f"Indexed {len(documents)} documents")

index_documents(processed_documents)

# Refresh the index after all documents are indexed
es_client.indices.refresh(index=index_name)
print("Index refreshed")


Indexing documents:   0%|          | 0/653 [00:00<?, ?it/s]

Indexed 653 documents
Index refreshed


## Text Search

In [13]:
def text_search(query, size=5, source=None):
    search_query = {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "query": query,
                        "fields": ["title", "text^3"],
                        "type": "best_fields",
                        "fuzziness": "AUTO"
                    }
                }
            ],
            "should": [
                {
                    "match_phrase": {
                        "text": {
                            "query": query,
                            "boost": 2
                        }
                    }
                }
            ]
        }
    }
    if source:
        search_query["bool"]["filter"] = {
            "term": {
                "source": source
            }
        }

    response = es_client.search(
        index=index_name,
        query=search_query,
        size=size,
        _source=["doc_id", "chunk_id", "text", "title", "url", "source"]
    )
    return [hit['_source'] for hit in response['hits']['hits']]

## Vector Search

### KNN

In [14]:
def elastic_search_knn(field, query, source=None):
    query_vector = model.encode(query).tolist()

    knn = {
        "field": field,
        "query_vector": query_vector,
        "k": 5,
        "num_candidates": 10000
    }

    if source:
        knn["filter"] = {
            "term": {
                "source": source
            }
        }

    search_query = {
        "knn": knn,
        "_source": ["doc_id", "chunk_id", "text", "title", "url", "source"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    # Extract and return the documents
    return [hit['_source'] for hit in es_results['hits']['hits']]

### Hybrid Search

In [15]:
def elastic_search_hybrid(query, vector, size=5):
    knn_query = {
        "field": "text_vector",
        "query_vector": vector,
        "k": size,
        "num_candidates": 10000,
        "boost": 0.5
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["text^3", "title"],
                    "type": "best_fields",
                    "boost": 0.5,
                    "fuzziness": "AUTO"
                }
            }
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": size,
        "_source": ["doc_id", "chunk_id", "text", "title", "url", "source"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    return [hit['_source'] for hit in es_results['hits']['hits']]

In [16]:
def hybrid_search(query, size=5):
    query_vector = model.encode(query).tolist()
    return elastic_search_hybrid(query, query_vector, size)

## Build prompt

In [17]:
prompt_template = """
You are an AI-powered Assistant for Movement Labs, specializing in the Move language and the Movement Network ecosystem. 
Answer the QUESTION based strictly on the CONTEXT from the knowledge base. If the CONTEXT does not provide enough details, request more information or clarify the question. 

Your answer should be clear, concise, and factual. Follow these guidelines:
- Provide a complete answer in 2-3 short paragraphs or bullet points for clarity.
- Focus on the most relevant information.
- If the QUESTION is unclear, ask for clarification.
- Do not speculate or generate information not present in the CONTEXT.
- Ensure your response is complete and not cut off mid-sentence.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
Document ID: {doc_id}
Chunk ID: {chunk_id}
Title: {title}
URL: {url}
Content: {text}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context += entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt, context

In [18]:
def llm(prompt, model='gpt-4o-mini', max_tokens=150):
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [19]:
# query = 'What functions does the ContractRouter smart contract serve in relation to participant registration??'
# size=3
# model='gpt-4o-mini'
# source=None

In [20]:
# # Perform the search
# search_results = text_search(query, 5, source)

# # Debug: Print the structure of search_results
# print("Structure of search_results:")
# print(json.dumps(search_results[0] if search_results else {}, indent=2))

# # Adjust the code based on the actual structure
# # Assuming search_results is a list of dictionaries with the document information

# # Build the prompt and get the answer
# prompt, context = build_prompt(query, search_results)
# answer = llm(prompt, model=model, max_tokens=250)

# # Print the results
# print(f"\nQuery: {query}")
# print(f"\nAnswer: {answer}")

# print("\nSearch results:")
# for doc in search_results:
#     print(f"Doc ID: {doc.get('doc_id', 'N/A')}")
#     print(f"Chunk ID: {doc.get('chunk_id', 'N/A')}")
#     print(f"Title: {doc.get('title', 'N/A')}")
#     print(f"Source: {doc.get('source', 'N/A')}")
#     print(f"URL: {doc.get('url', 'N/A')}")
#     print(f"Text snippet: {doc.get('text', 'N/A')[:100]}...")  # First 100 characters of the text
#     print("-" * 50)

# # Print the full context used for the answer
# print("\nFull context used:")
# print(context)

In [21]:

# print(f"Answer: {answer}")

In [22]:
# print(f"prompt: {prompt}")

**Ground truth**

In [23]:
def generate_ground_truth(documents, num_questions=3):
    ground_truth = []
    for doc in tqdm(documents, desc="Generating ground truth"):
        prompt = f"""
        You are a curious user. Based on the following document, generate {num_questions} questions that can be answered using the information provided.
        Make the questions specific to the content and avoid general questions.
        
        Document:
        
        Title: {doc['title']}
        Content: {doc['text']}
        Provide the output as parsable JSON without using code blocks:
        
        {{"questions": ["question1", "question2", ..., "question{num_questions}"]}}
        """
        response = llm(prompt, model='gpt-4o-mini', max_tokens=1000)
        try:
            parsed_response = json.loads(response)
            questions = parsed_response.get('questions', [])
            for question in questions:
                ground_truth.append({
                    'question': question,
                    'doc_id': doc['doc_id'],
                    'chunk_id': doc['chunk_id']
                })
        except json.JSONDecodeError:
            print(f"Error parsing response for document {doc['doc_id']}")
            print("Raw response:", response)
    
    return ground_truth

In [24]:
# Generate ground truth
processed_documents = process_documents(documents)
ground_truth = generate_ground_truth(processed_documents, num_questions=3)

Generating ground truth:   0%|          | 0/653 [00:00<?, ?it/s]

Error parsing response for document afabb682
Raw response: {"questions": ["What are the three possible outcomes when submitting batch transactions and their corresponding HTTP status codes?", "What is the required format for the 'sender' field in the SubmitTransactionRequest when submitting a transaction?", "What is the purpose of the 'expiration_timestamp_secs' field in the transaction request?"']}
Error parsing response for document 5717ea40
Raw response: {"questions":["What does the useAptosAccountBalance hook return when fetching the account balance?","What happens to the balance value if there is an error during the fetch operation?","Which type should the balance be converted to for calculations if it exceeds Number.MAX_SAFE_INTEGER?"asi
Error parsing response for document c1474180
Raw response: {"questions":["What unique benefits does xAVAX provide compared to traditional leveraged contracts?","How does xAVAX enable users to gain exposure to AVAX price movements without the risk

In [25]:
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
os.makedirs(data_dir, exist_ok=True)

csv_path = os.path.join(data_dir, 'ground-truth-retrieval.csv')

In [26]:
df = pd.DataFrame(ground_truth)
df.to_csv(csv_path, index=False)
print(f"Ground truth saved to '{csv_path}'")

Ground truth saved to 'C:\Users\dimi\Desktop\parthenon-rag\data\ground-truth-retrieval.csv'


**Retrieval evaluation**

In [27]:
df_question = pd.read_csv(csv_path)
print(df_question.head())

                                            question    doc_id    chunk_id
0      What type of protocol is Avitus described as?  a88e0ac9  a88e0ac9_0
1  What unique feature does Avitus offer regardin...  a88e0ac9  a88e0ac9_0
2       Where can users test the features of Avitus?  a88e0ac9  a88e0ac9_0
3    What is the link to the Avitus Twitter account?  df283f30  df283f30_0
4             Where can I find the Avitus Brand Kit?  df283f30  df283f30_0


In [28]:
ground_truth = df_question.to_dict(orient='records')

In [29]:
ground_truth[0]

{'question': 'What type of protocol is Avitus described as?',
 'doc_id': 'a88e0ac9',
 'chunk_id': 'a88e0ac9_0'}

In [30]:
def hit_rate(relevance_list):
    return sum(1 for item in relevance_list if any(item)) / len(relevance_list)

def mrr(relevance_list):
    reciprocal_ranks = []
    for item in relevance_list:
        try:
            first_relevant = next(i for i, rel in enumerate(item, 1) if rel)
            reciprocal_ranks.append(1 / first_relevant)
        except StopIteration:
            reciprocal_ranks.append(0)
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

In [31]:
import time

In [32]:
def evaluate_search_methods(ground_truth, search_functions):
    results_file = 'evaluation_results.json'
    
    # Load existing results if available
    if os.path.exists(results_file):
        with open(results_file, 'r') as f:
            results = json.load(f)
        print("Loaded existing results. Continuing from where we left off.")
    else:
        results = {'total_questions': len(ground_truth)}
    
    for name, search_function in search_functions.items():
        if name in results:
            print(f"Skipping {name} as it has already been evaluated.")
            continue
        
        print(f"Starting evaluation of {name}")
        start_time = time.time()
        doc_relevance = []
        chunk_relevance = []
        search_times = []
        
        try:
            for item in tqdm(ground_truth, desc=f"Evaluating {name}"):
                try:
                    query_start_time = time.time()
                    results_list = search_function(item['question'])
                    query_time = time.time() - query_start_time
                    search_times.append(query_time)
                    
                    doc_rel = [doc['doc_id'] == item['doc_id'] for doc in results_list]
                    doc_relevance.append(doc_rel)
                    
                    chunk_rel = [doc['chunk_id'] == item['chunk_id'] for doc in results_list]
                    chunk_relevance.append(chunk_rel)
                except Exception as e:
                    print(f"Error processing query: {item['question']}")
                    print(f"Error details: {str(e)}")
                    # Add empty results for this query
                    doc_relevance.append([])
                    chunk_relevance.append([])
                    search_times.append(0)
            
            total_time = time.time() - start_time
            
            results[name] = {
                'doc_hit_rate': hit_rate(doc_relevance),
                'doc_mrr': mrr(doc_relevance),
                'chunk_hit_rate': hit_rate(chunk_relevance),
                'chunk_mrr': mrr(chunk_relevance),
                'total_time': total_time,
                'avg_query_time': sum(search_times) / len(search_times) if search_times else 0,
                'min_query_time': min(search_times) if search_times else 0,
                'max_query_time': max(search_times) if search_times else 0
            }
            
            # Save results after each successful evaluation
            with open(results_file, 'w') as f:
                json.dump(results, f, indent=2)
            
            print(f"Completed evaluation of {name}")
        
        except Exception as e:
            print(f"Error evaluating {name}: {str(e)}")
            results[name] = {'error': str(e)}
            
            # Save results even if there was an error
            with open(results_file, 'w') as f:
                json.dump(results, f, indent=2)
    
    return results

In [33]:
search_functions = {
    'Text Search': text_search,
    'Text Vector KNN': lambda query, size=5, source=None: elastic_search_knn('text_vector', query, source),
    'Hybrid Search': hybrid_search,
}


evaluation_results = evaluate_search_methods(ground_truth, search_functions)


def print_evaluation_results(results):
    print(f"Total Questions: {results['total_questions']}")
    print("\nEvaluation Results:")
    for method, metrics in results.items():
        if method != 'total_questions':
            print(f"\n{method}:")
            if isinstance(metrics, dict) and 'error' not in metrics:
                print(f"  Document Hit Rate: {metrics['doc_hit_rate']:.4f}")
                print(f"  Document MRR: {metrics['doc_mrr']:.4f}")
                print(f"  Chunk Hit Rate: {metrics['chunk_hit_rate']:.4f}")
                print(f"  Chunk MRR: {metrics['chunk_mrr']:.4f}")
                print(f"  Total Evaluation Time: {metrics['total_time']:.2f} seconds")
                print(f"  Average Query Time: {metrics['avg_query_time']*1000:.2f} ms")
                print(f"  Min Query Time: {metrics['min_query_time']*1000:.2f} ms")
                print(f"  Max Query Time: {metrics['max_query_time']*1000:.2f} ms")
            else:
                print(f"  Error: {metrics.get('error', 'Unknown error occurred')}")

print_evaluation_results(evaluation_results)

Starting evaluation of Text Search


Evaluating Text Search:   0%|          | 0/1950 [00:00<?, ?it/s]

Completed evaluation of Text Search
Starting evaluation of Text Vector KNN


Evaluating Text Vector KNN:   0%|          | 0/1950 [00:00<?, ?it/s]

  es_results = es_client.search(


Completed evaluation of Text Vector KNN
Starting evaluation of Hybrid Search


Evaluating Hybrid Search:   0%|          | 0/1950 [00:00<?, ?it/s]

  es_results = es_client.search(


Completed evaluation of Hybrid Search
Total Questions: 1950

Evaluation Results:

Text Search:
  Document Hit Rate: 0.8349
  Document MRR: 0.6939
  Chunk Hit Rate: 0.8179
  Chunk MRR: 0.6683
  Total Evaluation Time: 124.00 seconds
  Average Query Time: 63.23 ms
  Min Query Time: 0.00 ms
  Max Query Time: 98.97 ms

Text Vector KNN:
  Document Hit Rate: 0.7056
  Document MRR: 0.5286
  Chunk Hit Rate: 0.6641
  Chunk MRR: 0.4871
  Total Evaluation Time: 291.27 seconds
  Average Query Time: 148.56 ms
  Min Query Time: 116.75 ms
  Max Query Time: 713.85 ms

Hybrid Search:
  Document Hit Rate: 0.8359
  Document MRR: 0.6971
  Chunk Hit Rate: 0.8195
  Chunk MRR: 0.6716
  Total Evaluation Time: 332.74 seconds
  Average Query Time: 169.64 ms
  Min Query Time: 96.06 ms
  Max Query Time: 438.67 ms


## Reranking

In [34]:
def compute_rrf(rank, k=60):
    """ Implementation of the relevance score """
    return 1 / (k + rank)

def elastic_search_hybrid_rrf(query, k=60):
    vector = model.encode(query).tolist()
    
    knn_query = {
        "field": "text_vector",
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["text^3", "title"],
                    "type": "best_fields",
                    "boost": 0.5,
                    "fuzziness": "AUTO"
                }
            }
        }
    }

    knn_results = es_client.search(
        index=index_name, 
        body={
            "knn": knn_query, 
            "size": 10
        }
    )['hits']['hits']
    
    keyword_results = es_client.search(
        index=index_name, 
        body={
            "query": keyword_query, 
            "size": 10
        }
    )['hits']['hits']
    
    rrf_scores = {}

    for rank, hit in enumerate(knn_results):
        doc_id = hit['_id']
        rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    for rank, hit in enumerate(keyword_results):
        doc_id = hit['_id']
        if doc_id in rrf_scores:
            rrf_scores[doc_id] += compute_rrf(rank + 1, k)
        else:
            rrf_scores[doc_id] = compute_rrf(rank + 1, k)

    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    
    # Get top-5 documents by the score
    final_results = []
    for doc_id, score in reranked_docs[:5]:
        doc = es_client.get(index=index_name, id=doc_id)
        final_results.append(doc['_source'])
    
    return final_results


In [35]:
search_functions['Hybrid Search RRF'] = elastic_search_hybrid_rrf

evaluation_results = evaluate_search_methods(ground_truth, search_functions)
print_evaluation_results(evaluation_results)

Loaded existing results. Continuing from where we left off.
Skipping Text Search as it has already been evaluated.
Skipping Text Vector KNN as it has already been evaluated.
Skipping Hybrid Search as it has already been evaluated.
Starting evaluation of Hybrid Search RRF


Evaluating Hybrid Search RRF:   0%|          | 0/1950 [00:00<?, ?it/s]

  knn_results = es_client.search(
  keyword_results = es_client.search(


Completed evaluation of Hybrid Search RRF
Total Questions: 1950

Evaluation Results:

Text Search:
  Document Hit Rate: 0.8349
  Document MRR: 0.6939
  Chunk Hit Rate: 0.8179
  Chunk MRR: 0.6683
  Total Evaluation Time: 124.00 seconds
  Average Query Time: 63.23 ms
  Min Query Time: 0.00 ms
  Max Query Time: 98.97 ms

Text Vector KNN:
  Document Hit Rate: 0.7056
  Document MRR: 0.5286
  Chunk Hit Rate: 0.6641
  Chunk MRR: 0.4871
  Total Evaluation Time: 291.27 seconds
  Average Query Time: 148.56 ms
  Min Query Time: 116.75 ms
  Max Query Time: 713.85 ms

Hybrid Search:
  Document Hit Rate: 0.8359
  Document MRR: 0.6971
  Chunk Hit Rate: 0.8195
  Chunk MRR: 0.6716
  Total Evaluation Time: 332.74 seconds
  Average Query Time: 169.64 ms
  Min Query Time: 96.06 ms
  Max Query Time: 438.67 ms

Hybrid Search RRF:
  Document Hit Rate: 0.8497
  Document MRR: 0.6489
  Chunk Hit Rate: 0.8328
  Chunk MRR: 0.6159
  Total Evaluation Time: 522.48 seconds
  Average Query Time: 267.18 ms
  Min Query 

#### Hybrid Search RRF seems the best. Text Search is very close and is 4.5 times faster than Hybrid Search RRF and 2.5 times faster than Hybrid Search, so I will using it for now.

# RAG 

In [52]:
def rag(query, size=3, model='gpt-4o-mini', source=None):
    search_results = text_search(query, size, source)
    prompt, context = build_prompt(query, search_results)
    answer = llm(prompt, model=model, max_tokens=250)  # Increased from 150 to 300
    return {
        'query': query,
        'context': context,
        'prompt': prompt,
        'answer': answer,
        'search_results': search_results
    }

In [67]:
question = 'How does Razor DAO determine which users are awarded the R-1 role?'

result = rag(question, size=3, model='gpt-4o-mini', source=None)
print(f"Question: {result['query']}")
print(f"Answer: {result['answer']}")
# print(f"\nContext: {result['context']}")
# print(f"\nPrompt: {result['prompt']}")

Question: How does Razor DAO determine which users are awarded the R-1 role?
Answer: Razor DAO awards the R-1 role to users who actively contribute to the project's development by using the platform or promoting it within their networks. The criteria for receiving this role are based on genuine actions rather than specific tasks, making it accessible to users who are actively engaged in supporting Razor DAO's growth. While there are no guaranteed methods to obtain the R-1 role, the DAO is attentive to contributions and recognizes those who demonstrate a commitment to the community.

To increase their chances of being awarded the R-1 role, users should complete tasks such as staying updated with zealy tasks in the #razor-missions channel and creating detailed threads about the Razor DAO on social media, ensuring to tag the DAO and Movement Labs. Genuine discussions with friends about the project, driven by a belief in its mission rather than a mere promotional effort, are also encourage

## RAG evaluation - LLM-as-a-Judge

In [38]:
# len(ground_truth)

In [39]:
import pandas as pd
import json
from tqdm import tqdm

# Assuming ground_truth data is already available
df_question = pd.DataFrame(ground_truth)
df_sample = df_question.sample(n=100, random_state=1)
sample = df_sample.to_dict(orient='records')

# Define models to compare
models = ['gpt-4o-mini', 'gpt-4o']

In [40]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [42]:
for model in models:
    print(f"Evaluating with model: {model}")
    
    evaluations = []
    
    for record in tqdm(sample, desc=f"Evaluating RAG with {model}"):
        question = record['question']
        rag_result = rag(question, size=3, model=model, source=None)
        answer = rag_result['answer'] 
        
        prompt = prompt2_template.format(
            question=question,
            answer=answer
        )
        
        evaluation = llm(prompt, model=model, max_tokens=150)
        try:
            evaluation = json.loads(evaluation)
            evaluations.append({
                'doc_id': record['doc_id'],
                'chunk_id': record['chunk_id'],
                'question': question,
                'answer': answer,
                'relevance': evaluation['Relevance'],
                'explanation': evaluation['Explanation']
            })
        except json.JSONDecodeError:
            print(f"Error parsing evaluation for question: {question}")
            print("Raw evaluation:", evaluation)

    df_eval = pd.DataFrame(evaluations)

    relevance_distribution = df_eval['relevance'].value_counts(normalize=True)
    print(f"Relevance Distribution for {model}:")
    print(relevance_distribution)

    csv_filename = f'rag-eval-results-{model}.csv'
    df_eval.to_csv(csv_filename, index=False)
    print(f"Saved results to {csv_filename}")

    relevance_counts = df_eval['relevance'].value_counts()
    total_evaluations = len(df_eval)
    
    print(f"Evaluation Results for {model}:")
    for category in ['RELEVANT', 'PARTLY_RELEVANT', 'NON_RELEVANT']:
        count = relevance_counts.get(category, 0)
        percentage = (count / total_evaluations) * 100
        print(f"* {count} ({percentage:.1f}%) `{category}`")

    print(f"\nNon-Relevant Answers for {model}:")
    non_relevant = df_eval[df_eval['relevance'] == 'NON_RELEVANT']
    for _, row in non_relevant.iterrows():
        print(f"Question: {row['question']}")
        print(f"Answer: {row['answer']}")
        print(f"Explanation: {row['explanation']}")
        print("-" * 50)



Evaluating with model: gpt-4o-mini


Evaluating RAG with gpt-4o-mini:  66%|████████████████████████████████████████████▏                      | 66/100 [04:09<02:07,  3.75s/it]

Error parsing evaluation for question: What is the title of the document?
Raw evaluation: {
  "Relevance": "RELEVANT",
  "Explanation": "The generated answer directly provides the title of the document requested in the question, specifically stating 'The title of the document is "Introduction | Movewiffrens."' This aligns perfectly with the query about the document's title."
}


Evaluating RAG with gpt-4o-mini: 100%|██████████████████████████████████████████████████████████████████| 100/100 [06:28<00:00,  3.88s/it]


Relevance Distribution for gpt-4o-mini:
relevance
RELEVANT           0.919192
NON_RELEVANT       0.040404
PARTLY_RELEVANT    0.040404
Name: proportion, dtype: float64
Saved results to rag-eval-results-gpt-4o-mini.csv
Evaluation Results for gpt-4o-mini:
* 91 (91.9%) `RELEVANT`
* 4 (4.0%) `PARTLY_RELEVANT`
* 4 (4.0%) `NON_RELEVANT`

Non-Relevant Answers for gpt-4o-mini:
Question: What is the main purpose of the Razor DAO?
Answer: The context provided does not contain information regarding the main purpose of the Razor DAO. Please provide additional details or specify another resource where this information might be found.
Explanation: The generated answer does not address the question about the main purpose of the Razor DAO. Instead, it states a lack of information and requests additional details, which does not provide any relevant content concerning the main purpose of the Razor DAO.
--------------------------------------------------
Question: What do users need to do to register succe

Evaluating RAG with gpt-4o: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [06:11<00:00,  3.72s/it]

Relevance Distribution for gpt-4o:
relevance
RELEVANT           0.79
PARTLY_RELEVANT    0.15
NON_RELEVANT       0.06
Name: proportion, dtype: float64
Saved results to rag-eval-results-gpt-4o.csv
Evaluation Results for gpt-4o:
* 79 (79.0%) `RELEVANT`
* 15 (15.0%) `PARTLY_RELEVANT`
* 6 (6.0%) `NON_RELEVANT`

Non-Relevant Answers for gpt-4o:
Question: What is the main purpose of the Razor DAO?
Answer: The CONTEXT provided does not contain specific information about Razor DAO or its main purpose. Please provide more details or clarify your question. For example, if you have a specific document, link, or section in mind that discusses Razor DAO, sharing that would help in providing a more precise answer.
Explanation: The generated answer does not provide any information about the main purpose of Razor DAO. Instead, it requests more details or clarification, which does not address the question asked.
--------------------------------------------------
Question: How many people have participat




### User query rewriting

In [79]:
def rewrite_query(original_query, model='gpt-4o-mini'):
    rewrite_prompt = f"""
    You are a helpful assistant skilled at refining questions to make them clearer and more specific. 
    Please rewrite the following query by breaking it down into more precise terms, adding context if necessary, 
    and ensuring it can be answered easily. 
    Do not include any additional comments or explanations.

    Consider aspects such as specificity, clarity, and context. 

    Original query: {original_query}
    """
    
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": rewrite_prompt}],
            max_tokens=80  # Adjust as needed for length
        )
        rewritten_query = response.choices[0].message.content.strip()
        return rewritten_query
    except Exception as e:
        print(f"An error occurred during query rewriting: {e}")
        return original_query


In [80]:
def rag_rewritten(query, size=3, model='gpt-4o-mini', source=None):
    rewritten_query = rewrite_query(query, model)
    search_results = text_search(rewritten_query, size, source)
    prompt, context = build_prompt(rewritten_query, search_results)
    answer = llm(prompt, model=model, max_tokens=250)
    return {
        'original_query': query,
        'rewritten_query': rewritten_query,
        'context': context,
        'prompt': prompt,
        'answer': answer,
        'search_results': search_results
    }

In [82]:
question = 'What is the main purpose of the Razor DAO?'

result = rag_rewritten(question, size=3, model='gpt-4o-mini', source=None)
print(f"Question: {result['original_query']}")
print(f"Question: {result['rewritten_query']}")
print(f"Answer: {result['answer']}")

Question: What is the main purpose of the Razor DAO?
Question: What is the primary objective or goal of the Razor Decentralized Autonomous Organization (DAO)?
Answer: The primary objective of the Razor Decentralized Autonomous Organization (DAO) is to support and enhance the Movement Labs ecosystem by providing essential services like a native wallet and a decentralized exchange protocol (RazorDex) across the M1, M2, and MEVM networks. This facilitates secure and efficient transactions within the ecosystem, enabling users to manage their digital assets seamlessly.

RazorDAO aims to foster a decentralized finance environment by empowering users with direct control over their funds and trading activities through these innovative tools. The organization promotes community-driven governance and participation, aligning with the core principles of decentralization and user empowerment in the Movement Network.


In [83]:
def evaluate_rag_rewritten(sample, model):
    evaluations = []
    
    for record in tqdm(sample, desc=f"Evaluating RAG with {model}"):
        question = record['question']
        rag_result = rag_rewritten(question, size=3, model=model, source=None)
        answer = rag_result['answer']
        rewritten_query = rag_result['rewritten_query']
        
        prompt = prompt2_template.format(
            question=question,
            answer=answer
        )
        
        evaluation = llm(prompt, model=model, max_tokens=150)
        try:
            evaluation = json.loads(evaluation)
            evaluations.append({
                'doc_id': record['doc_id'],
                'chunk_id': record['chunk_id'],
                'original_question': question,
                'rewritten_question': rewritten_query,
                'answer': answer,
                'relevance': evaluation['Relevance'],
                'explanation': evaluation['Explanation']
            })
        except json.JSONDecodeError:
            print(f"Error parsing evaluation for question: {question}")
            print("Raw evaluation:", evaluation)
    
    return evaluations

In [84]:
models = ['gpt-4o-mini', 'gpt-4o']

for model in models:
    print(f"Evaluating with model: {model}")
    
    evaluations = evaluate_rag_rewritten(sample, model)
    
    df_eval = pd.DataFrame(evaluations)
    relevance_distribution = df_eval['relevance'].value_counts(normalize=True)
    print(f"Relevance Distribution for {model}:")
    print(relevance_distribution)
    
    csv_filename = f'rag-eval-results-with-rewriting-{model}.csv'
    df_eval.to_csv(csv_filename, index=False)
    print(f"Saved results to {csv_filename}")
    
    relevance_counts = df_eval['relevance'].value_counts()
    total_evaluations = len(df_eval)
    
    print(f"Evaluation Results for {model}:")
    for category in ['RELEVANT', 'PARTLY_RELEVANT', 'NON_RELEVANT']:
        count = relevance_counts.get(category, 0)
        percentage = (count / total_evaluations) * 100
        print(f"* {count} ({percentage:.1f}%) `{category}`")
    
    print(f"\nNon-Relevant Answers for {model}:")
    non_relevant = df_eval[df_eval['relevance'] == 'NON_RELEVANT']
    for _, row in non_relevant.iterrows():
        print(f"Original Question: {row['original_question']}")
        print(f"Rewritten Question: {row['rewritten_question']}")
        print(f"Answer: {row['answer']}")
        print(f"Explanation: {row['explanation']}")
        print("-" * 50)

Evaluating with model: gpt-4o-mini


Evaluating RAG with gpt-4o-mini: 100%|██████████████████████████████████████████████████████████████████| 100/100 [09:28<00:00,  5.69s/it]


Relevance Distribution for gpt-4o-mini:
relevance
RELEVANT           0.82
PARTLY_RELEVANT    0.10
NON_RELEVANT       0.08
Name: proportion, dtype: float64
Saved results to rag-eval-results-with-rewriting-gpt-4o-mini.csv
Evaluation Results for gpt-4o-mini:
* 82 (82.0%) `RELEVANT`
* 10 (10.0%) `PARTLY_RELEVANT`
* 8 (8.0%) `NON_RELEVANT`

Non-Relevant Answers for gpt-4o-mini:
Original Question: What is the objective of Mission 4 in the Seekers Alliance?
Rewritten Question: What is the specific goal or objective of Mission 4 within the Seekers Alliance game or organization? Please provide details about the mission's requirements and intended outcomes.
Answer: The provided context does not include specific information regarding "Mission 4" within the Seekers Alliance. It details Missions 1 through 2 and mentions ongoing activities, but there are no explicit requirements or outcomes associated with Mission 4.

If you have any additional details or another source regarding Mission 4, please s

Evaluating RAG with gpt-4o: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [12:59<00:00,  7.80s/it]

Relevance Distribution for gpt-4o:
relevance
RELEVANT           0.61
PARTLY_RELEVANT    0.23
NON_RELEVANT       0.16
Name: proportion, dtype: float64
Saved results to rag-eval-results-with-rewriting-gpt-4o.csv
Evaluation Results for gpt-4o:
* 61 (61.0%) `RELEVANT`
* 23 (23.0%) `PARTLY_RELEVANT`
* 16 (16.0%) `NON_RELEVANT`

Non-Relevant Answers for gpt-4o:
Original Question: What is the main purpose of the Razor DAO?
Rewritten Question: What is the primary objective of the Razor Decentralized Autonomous Organization (DAO)?

- Could you describe the core mission or goal of Razor DAO?
- What key functions or activities does Razor DAO focus on?
- How does Razor DAO differentiate itself from other DAOs?
Answer: The CONTEXT does not contain specific information about the Razor Decentralized Autonomous Organization (DAO), its core mission, key functions, or how it differentiates itself from other DAOs. 

To provide an accurate answer, please provide more information or clarify your question a




Query rewritting doesn't seem to help with relevancy. Maybe needs some better prompting, tried 2-3 prompts already.