## Import original FAQs

In [1]:
import os
import json
import re
import getpass
import random
from pydantic import BaseModel
from typing import List, Optional

In [2]:
filename = 'FAQs_cleaned.json'

# The FAQs contain German text with umlauts. Pass the encoding explicitly so
# the file is decoded as UTF-8 on every platform instead of relying on the
# locale's default encoding.
with open(filename, 'r', encoding='utf-8') as file:
    faqs_data = json.load(file)

print("JSON file imported successfully.")

JSON file imported successfully.


In [3]:
len(faqs_data)

34

## Use real FAQs

The real FAQs were already extracted into records with the fields "question", "answer" and "links". Here we extend each record with the output of the RAG system: "rag_answer" and "rag_links".



## Main functionality 

```python
def rag(query):
    search_results = search(query)  
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

```
    

### Connect OpenAI

In [4]:
openai_api_key = getpass.getpass("Enter your OpenAI API Key:\n\n")


Enter your OpenAI API Key:

 ········


In [5]:
# Checks connection to OpenAI
import openai
from openai import OpenAI
client = OpenAI(api_key = openai_api_key)
try:
  # Minimal chat request whose only purpose is to verify the key and the connection.
  response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Hello world"}],
    model="gpt-4o-mini"
  )
except openai.RateLimitError as e:
  # Most specific handlers first: openai.APIError is the base class of the
  # rate-limit and connection errors, so it has to be caught last or the
  # handlers below it would never run.
  print(f"OpenAI API request exceeded rate limit: {e}")
except openai.APIConnectionError as e:
  # The request never reached the API (network problem, timeout, ...).
  print(f"Failed to connect to OpenAI API: {e}")
except openai.APIError as e:
  # Any other error reported by the OpenAI client.
  print(f"OpenAI API returned an API Error: {e}")

In [6]:
# query = 'hi'
# generate_answer(query)

In [6]:
def get_embedding(text):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: The string to embed.

    Returns:
        The embedding taken from the first item of the API response, or
        ``None`` when ``text`` is empty / not a string or when the request
        raises any exception (the error is printed, not re-raised).
    """
    # Reject empty values and anything that is not a string up front.
    if not (text and isinstance(text, str)):
        return None

    try:
        # The module-level OpenAI client reads its key from this attribute.
        openai.api_key = openai_api_key

        api_response = openai.embeddings.create(
            model="text-embedding-ada-002",
            input=text,
        )
        first_item = api_response.data[0]
        return first_item.embedding
    except Exception as e:
        print(f"Error in get_embedding: {e}")
        return None

#q = 'Was sind die rechtlichen Grundlagen meines Studiums?'
#get_embedding(q)

### Vector Search

In [7]:
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

In [8]:
mongo_uri = getpass.getpass("Enter your MongoDB connection string:\n\n")

Enter your MongoDB connection string:

 ········


In [9]:
def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify that the server is reachable.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        MongoClient: A client for the given connection string.

    Raises:
        pymongo.errors.PyMongoError: If the server cannot be reached or the
            ping command fails.
    """

    # gateway to interacting with a MongoDB database cluster
    client = MongoClient(mongo_uri)

    # Constructing MongoClient does not contact the server, so issue a cheap
    # command before claiming success; a bad URI or unreachable cluster now
    # fails here instead of at the first query.
    client.admin.command('ping')

    print("Connection to MongoDB successful")
    return client

# One shared client for the rest of the notebook.
mongo_client = get_mongo_client(mongo_uri)

# Handles to the database and to the collection used for retrieval.
db = mongo_client['chatter']
collection = db['embedded_content']

Connection to MongoDB successful


In [99]:
#collection.find_one()

In [103]:
text_embedding_field_name = "embedding"

def vector_search(user_query, db, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
        user_query (str): Text that is embedded with ``get_embedding`` and used
            as the query vector.
        db: Database handle. Kept for call compatibility; not used here.
        collection: Collection whose ``aggregate`` method runs the pipeline.

    Returns:
        list[dict] | str: One ``{"source", "text", "score"}`` dict per hit,
        in the order returned by the aggregation. If no embedding could be
        produced, the string "Invalid query or embedding generation failed."
        is returned instead of a list.
    """
    # Imported here because this function is the only user of `logging`,
    # which is not imported anywhere else in the notebook.
    import logging

    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    # NOTE(review): `minScore` and `similarity` do not appear to be documented
    # fields of the `$vectorSearch` stage - confirm they have any effect.
    vector_search_stage = {
        "$vectorSearch": {
            "index": 'vector_index',
            "queryVector": list(query_embedding),
            "path": text_embedding_field_name,
            "numCandidates": 150,
            "limit": 5,
            "minScore": 0.4,
            "similarity": "cosine"
        }
    }

    # Keep only the fields needed downstream and expose the similarity score.
    project_stage = {
        "$project": {
            "text": 1,
            "metadata": 1,
            "url": 1,
            "score": {"$meta": "vectorSearchScore"}
        }
    }

    pipeline = [vector_search_stage, project_stage]

    results = collection.aggregate(pipeline)

    processed_results = []
    for doc in results:
        if 'text' not in doc or 'metadata' not in doc or 'url' not in doc:
            logging.warning("Document missing fields: %s", doc)

        # Incomplete documents are still reported, using the same fallbacks as
        # the other fields, instead of failing on a missing `metadata` key.
        metadata = doc.get('metadata') or {}

        processed_results.append({
            "source": metadata.get('source', doc.get('url', 'N/A')),
            "text": doc.get('text', 'N/A'),
            "score": doc.get('score', 0)
        })

    return processed_results

In [97]:
# q = 'Was sind die rechtlichen Grundlagen meines Studiums?'
# search_res = vector_search(q, db, collection)



In [100]:
# context = ''

# for i in range(len(search_res)):
#     context += search_res[i]['text'] 
#     context += ' \n '

### Build a RAG system 

In [104]:
prompt_real_faqs = """
You are a friendly human-like chatbot. Use relevant provided context and chat history to answer the query at the end. 
Answer in full. If you don't know the answer, say that you don't know, don't try to make up an answer. 
Do not use words like context or training data when responding. You can say you do not have all the information
but do not indicate that you are not a reliable source.

Using the following information, answer the user query.

query: {question}
context: {text}
   
""".strip()


def build_prompt(prompt_template , query, context):
    """Fill a prompt template and trim surrounding whitespace.

    Args:
        prompt_template (str): Template with ``{question}`` and ``{text}``
            placeholders.
        query: Value substituted for ``{question}``.
        context: Value substituted for ``{text}``.

    Returns:
        str: The formatted prompt with leading/trailing whitespace removed.
    """
    substitutions = {"question": query, "text": context}
    return prompt_template.format(**substitutions).strip()

In [105]:
def generate_answer(prompt):
    """Send ``prompt`` as a single user message and return the model's reply.

    Uses the notebook-level ``client`` with model ``gpt-4o-mini`` and
    ``temperature=0``.

    Args:
        prompt (str): Full prompt text.

    Returns:
        The message content of the first choice in the response.
    """
    user_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=user_messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
# doc = faqs_data[0]

# prompt = build_prompt(prompt_real_faqs ,  doc['question'], doc['answer'])
# response = generate_answer(prompt)

# response

In [106]:
def remove_duplicates(sorted_list):
    """Return the items of ``sorted_list`` without duplicates, keeping first occurrences.

    The input does not have to be sorted: duplicates are removed wherever they
    occur and the order of first appearance is preserved. For sorted input the
    result is the same as collapsing adjacent equal items.

    Args:
        sorted_list: Sequence of items (e.g. source links). May be empty or None.

    Returns:
        list: The unique items in order of first appearance; ``[]`` for empty input.
    """
    if not sorted_list:
        return []

    try:
        # dict keys are unique and keep insertion order.
        return list(dict.fromkeys(sorted_list))
    except TypeError:
        # Unhashable items (e.g. lists): fall back to an equality scan.
        unique_items = []
        for item in sorted_list:
            if item not in unique_items:
                unique_items.append(item)
        return unique_items


def get_context(query, db, collection):
    """Retrieve the context text and source links for ``query``.

    Args:
        query (str): User question passed to ``vector_search``.
        db: Database handle, forwarded to ``vector_search``.
        collection: Collection handle, forwarded to ``vector_search``.

    Returns:
        tuple[str, list]: The retrieved texts, each followed by ``' \\n '``,
        concatenated into one string, and the de-duplicated list of sources.
        ``('', [])`` when the search could not be performed.
    """
    records = vector_search(query, db, collection)

    # vector_search reports failure with an error string instead of a list.
    # Iterating over that string would yield single characters and fail on
    # record['text'], so treat it as "nothing retrieved".
    if isinstance(records, str):
        print(f"Vector search failed: {records}")
        return '', []

    context = ''.join(record['text'] + ' \n ' for record in records)
    urls = [record['source'] for record in records]

    return context, remove_duplicates(urls)

#get_context( faqs_data[0]['question'], db, collection)

In [107]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves context via ``get_context`` (using the notebook-level ``db`` and
    ``collection``), fills ``prompt_real_faqs`` with it and asks the model.

    Args:
        query (str): User question.

    Returns:
        tuple: ``(answer, links)`` - the generated answer and the sources
        returned by ``get_context``.
    """
    retrieved_context, links = get_context(query, db, collection)
    filled_prompt = build_prompt(prompt_real_faqs, query, retrieved_context)
    return generate_answer(filled_prompt), links


In [108]:
# Worked example on the first FAQ record.
query = faqs_data[0]['question']
human_answer = faqs_data[0]['answer']

# Baseline: hand the human-written answer to the model as context, so the
# model only rephrases it (no retrieval involved).
prompt = build_prompt(prompt_real_faqs, query, human_answer)

almost_real_answer = generate_answer(prompt)

# Full pipeline: context comes from the vector search.
rag_answer, rag_links = rag(query)

In [109]:
print('------------------------------------------------------------------------------')
print("Q: ", query)
print('------------------------------------------------------------------------------')
print("Human answer: ", human_answer)
print('------------------------------------------------------------------------------')
print("LLM processed answer:", almost_real_answer)
print('------------------------------------------------------------------------------')
print("RAG answer:", rag_answer)

------------------------------------------------------------------------------
Q:  Was sind die rechtlichen Grundlagen meines Studiums?
------------------------------------------------------------------------------
Human answer:  Alle generellen Fragen rund um das Studium an der Universität Basel werden in den folgenden Ordnungen geregelt: Studierendenordnung (gilt für alle Studierenden der Uni Basel):   Bachelorstudium: Ordnung der Philosophisch Naturwissenschaftlichen Fakultät der Universität Basel für das Bachelorstudium-     Masterstudium: Ordnung der Philosophisch-Naturwissenschaftlichen Fakultät der Universität Basel für das MasterstudiumDiese Ordnungen gelten für alle Studierenden,  die entweder an der Phil.-Nat. Fakultät immatrikuliert sind, oder eine Lehrveranstaltung besuchen, die von der Phil.-Nat. Fakultät angeboten wird):  
------------------------------------------------------------------------------
LLM processed answer: Die rechtlichen Grundlagen deines Studiums an der 

In [110]:
# Links listed for the first FAQ, used as the reference for the retrieved sources.
relevant_links = faqs_data[0]['links']

print("Relevant links:" , relevant_links)
print("Retrieved links:" ,rag_links)


Relevant links: ['https://www.unibas.ch/de/Studium/Studierendenordnung.html', 'https://philnat.unibas.ch/de/studium/']
Retrieved links: ['../../data/446_520BMaSFi_01.pdf', '../../data/446_710_11.pdf', '../../data/441_800_08.pdf', '../../data/446_710_00.pdf', '../../data/446_520BMaSFk_03.pdf']


In [111]:
import numpy as np 

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def compare_strings(str1: str, str2: str):
    """Return the cosine similarity between the embeddings of two strings.

    Both strings are embedded with ``get_embedding`` (first ``str1``, then
    ``str2``) and the resulting vectors are compared with ``cosine_similarity``.
    """
    first_vector, second_vector = (get_embedding(s) for s in (str1, str2))
    return cosine_similarity(first_vector, second_vector)


# Pairwise embedding similarity between the three answers to the example question:
# human-written, LLM-rephrased human answer, and RAG answer.
similarity_score_1 = compare_strings(almost_real_answer, human_answer)
similarity_score_2 = compare_strings(almost_real_answer, rag_answer)
similarity_score_3 = compare_strings(human_answer, rag_answer)

# Report each score with four decimals.
print(f"Semantic similarity between Human and LLM processed answers: {similarity_score_1:.4f}\n")
print(f"Semantic similarity between LLM processed and RAG answers: {similarity_score_2:.4f}\n")
print(f"Semantic similarity between Human and RAG answers: {similarity_score_3:.4f}\n")

Semantic similarity between Human and LLM processed answers: 0.9647

Semantic similarity between LLM processed and RAG answers: 0.9207

Semantic similarity between Human and RAG answers: 0.8982



In [58]:
def rag_recall(links, relevant_links):

    """
    Calculate the recall metric for a Retrieval-Augmented Generation (RAG) system.

    Recall is the fraction of relevant links that appear among the retrieved
    links. Both inputs are compared as sets, so order and duplicates are
    ignored and links must match exactly.

    Parameters:
    links (list): A list of retrieved links from the RAG system.
    relevant_links (list): A list of links that are considered relevant for the given query.

    Returns:
    float: The recall score, ranging from 0.0 to 1.0.
        - 1.0 if all relevant links are retrieved or if there are no relevant links.
        - 0.0 if no relevant links are retrieved.
        - Otherwise, the fraction of relevant links that were retrieved.

    Examples:
    >>> rag_recall(['1', '2', '3'], ['1', '2'])
    1.0
    >>> rag_recall(['1', '2', '3'], [])
    1.0
    >>> rag_recall(['1', '2', '3'], ['5'])
    0.0
    >>> rag_recall(['1', '2', '3'], ['1', '2', '3', '4'])
    0.75

    Note:
    - Retrieving more links than are relevant does not change the score;
      the maximum recall is still 1.0.
    """

    relevant_set = set(relevant_links)

    # Nothing to find: defined as perfect recall (also avoids dividing by zero).
    if not relevant_set:
        return 1.0

    # The intersection is never larger than relevant_set, so the ratio is
    # already within [0.0, 1.0] and covers the "all found" and "none found" cases.
    num_correctly_retrieved = len(relevant_set.intersection(links))
    return num_correctly_retrieved / len(relevant_set)

In [59]:
relevant_links

['https://www.unibas.ch/de/Studium/Studierendenordnung.html',
 'https://philnat.unibas.ch/de/studium/']

In [60]:
rag_links


['../../data/446_520BMaSFi_01.pdf',
 '../../data/446_710_11.pdf',
 '../../data/441_800_08.pdf',
 '../../data/446_710_00.pdf',
 '../../data/446_520BMaSFk_03.pdf']

In [61]:
rag_recall(rag_links, relevant_links)

0.0

In [62]:
# Note: a recall of 0.0 is not always representative, because, for example,
# 446_720_00.pdf provides the relevant information even though it is not one of the listed relevant links.

In [63]:
# Save results for further evaluation 

def get_dataset(faqs_data):
    """Run the RAG system on every FAQ and collect the results.

    Args:
        faqs_data (list[dict]): FAQ records with the keys 'question',
            'answer' and 'links'.

    Returns:
        list[dict]: One record per FAQ containing the original 'question',
        'answer' and 'links' plus the 'rag_answer' and 'rag_links' returned
        by ``rag``. Progress is printed after each record.
    """
    test_dataset = []

    for position, faq in enumerate(faqs_data):
        question = faq['question']
        generated_answer, retrieved_links = rag(question)

        test_dataset.append({
            'question' : question,
            'answer' : faq['answer'],
            'links' : faq['links'],
            'rag_answer' : generated_answer,
            'rag_links' : retrieved_links,
        })
        print("Processed record #", position)

    return test_dataset



In [130]:
test_dataset = get_dataset(faqs_data)
#test_dataset

Processed record # 0
Processed record # 1
Processed record # 2
Processed record # 3
Processed record # 4
Processed record # 5
Processed record # 6
Processed record # 7
Processed record # 8
Processed record # 9
Processed record # 10
Processed record # 11
Processed record # 12
Processed record # 13
Processed record # 14
Processed record # 15
Processed record # 16
Processed record # 17
Processed record # 18
Processed record # 19
Processed record # 20
Processed record # 21
Processed record # 22
Processed record # 23
Processed record # 24
Processed record # 25
Processed record # 26
Processed record # 27
Processed record # 28
Processed record # 29
Processed record # 30
Processed record # 31
Processed record # 32
Processed record # 33


In [131]:

# Persist the evaluation records (question, answer, links, rag_answer, rag_links)
# as indented JSON for later analysis.
filename = 'test_dataset.json'

with open(filename, 'w') as f:
    json.dump(test_dataset, f, indent=4)

## Use fake FAQs

The second option is to generate questions based on the provided text, which gives a much larger dataset.

```python

prompt_evaluation = """
Question: {question}
Expected Response: {expected_response}
Actual Response: {actual_response}

(Answer with 'True' or 'False') Does the Actual Response provide the necessary information for the question as the Expected Response does?
""".strip()

```

In [112]:
prompt_fake_faqs = """
You emulate a student of the University of Basel.
Formulate a question this student might ask based on the provided text. 
The question should be complete and not too short. Provide only a text of the question 
in German without any additional characters or information.

Text: {text}

""".strip()

In [113]:
# Step 1 - retrieve a random chunk from the database
# Step 2 - build a prompt 
# Step 3 - save questions 
# Step 4 - get answers using the built RAG system

In [114]:
def get_random_record(db, collection):
    """
    Retrieve a random record from the MongoDB collection.

    Args:
    db (MongoClient.database): The database object. Not used by this function;
        kept so existing calls keep working.
    collection (MongoCollection): The MongoDB collection to sample from.

    Returns:
    dict: ``{"source", "text", "url"}`` for a randomly selected document, or
    None if the collection is empty. Missing fields fall back to 'N/A';
    "source" falls back to the document's url when the metadata has no source.
    """

    # Count the total number of documents in the collection
    total_documents = collection.count_documents({})

    if total_documents == 0:
        return None

    # Sample one document and keep only the fields used below
    pipeline = [
        {"$sample": {"size": 1}},
        {"$project": {
            "text": 1,
            "metadata": 1,
            "url": 1,
            "_id": 0
        }}
    ]

    # Execute the aggregation
    result = list(collection.aggregate(pipeline))

    if not result:
        return None

    doc = result[0]
    # A document without `metadata` gets the same fallbacks as the other
    # fields instead of raising KeyError.
    metadata = doc.get('metadata') or {}
    return {
        "source": metadata.get('source', doc.get('url', 'N/A')),
        "text": doc.get('text', 'N/A'),
        "url": doc.get('url', 'N/A')
    }

In [119]:
rec = get_random_record(db, collection)

In [126]:
def rag_test(db, collection):
    """Create one synthetic FAQ (question, reference answer, source) from a random chunk.

    A random record is drawn with ``get_random_record``, the model is asked to
    formulate a question about its text (``prompt_fake_faqs``), and the
    reference answer is generated from that same text without any search.

    Args:
        db: Database handle, forwarded to ``get_random_record``.
        collection: Collection to sample the chunk from.

    Returns:
        tuple: ``(question, answer, links)`` where ``links`` is a one-element
        list holding the source of the sampled chunk.

    Raises:
        ValueError: If no record could be sampled (e.g. the collection is empty).
    """
    prompt_template = prompt_fake_faqs
    random_result = get_random_record(db, collection)

    # get_random_record returns None for an empty collection; fail with a
    # clear message instead of an opaque error when subscripting None.
    if random_result is None:
        raise ValueError("Cannot generate a test question: no record could be sampled from the collection.")

    context = random_result['text']
    links = [random_result['source']]

    prompt = prompt_template.format(text = context).strip()
    question = generate_answer(prompt)

    # to get the answer without search
    prompt_helper = build_prompt(prompt_real_faqs, question, context)

    answer = generate_answer(prompt_helper)

    return question, answer, links

In [135]:

n = 45

# Build `n` synthetic FAQ records. The next cell saves `fake_records`, so it has
# to be created here; otherwise a fresh kernel fails with NameError on the dump.
# Each iteration calls the model and the vector search, so this cell is slow.
fake_records = []

for i in range(n):
    fake_question, fake_answer, fake_links = rag_test(db, collection)
    fake_rag_answer, fake_rag_links = rag(fake_question)
    fake_records.append({
        'question': fake_question,
        'answer': fake_answer,
        'links': fake_links,
        'rag_answer': fake_rag_answer,
        'rag_links': remove_duplicates(fake_rag_links),
    })
    print("Processed record #", i)


In [None]:
#fake_records

In [136]:

# Persist the synthetic FAQ records as indented JSON.
# NOTE(review): requires `fake_records` to exist in the kernel - confirm the
# generation cell above has been run.
filename = 'fake_dataset.json'

with open(filename, 'w') as f:
    json.dump(fake_records, f, indent=4)

---
* Author: Anastasiia Popova
* Email: anastasiia.popova@stud.unibas.ch

[Perplexity AI](https://www.perplexity.ai/) assisted in code writing, editing, and more effective information searches. The generated output underwent critical evaluation. The author is solely responsible for the content.