## Import original FAQs

In [1]:
import os
import json
import re
import getpass


In [3]:
filename = '../FAQs_cleaned.json'

with open(filename, 'r') as file:
    faqs_data = json.load(file)

print("JSON file imported successfully.")

JSON file imported successfully.


In [4]:
len(faqs_data)

34

## Use real FAQs

We already extracted text in format "question", "answer", "links". Here we will also add "rag_answer", "rag_links".



## Main functionality 

```python
def rag(query):
    search_results = search(query)  
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

```
    

### Connect OpenAI

In [5]:
openai_api_key = getpass.getpass("Enter your OpenAI API Key:\n\n")


Enter your OpenAI API Key:

 ········


In [7]:
# Checks connection to OpenAI 
import openai
from openai import OpenAI
client = OpenAI(api_key = openai_api_key)
try:
  #Make your OpenAI API request here
  response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Hello world"}],
    model="gpt-4o-mini"
  )
except openai.APIError as e:
  #Handle API error here, e.g. retry or log
  print(f"OpenAI API returned an API Error: {e}")
  pass
except openai.APIConnectionError as e:
  #Handle connection error here
  print(f"Failed to connect to OpenAI API: {e}")
  pass
except openai.RateLimitError as e:
  #Handle rate limit error (we recommend using exponential backoff)
  print(f"OpenAI API request exceeded rate limit: {e}")
  pass

In [8]:
# query = 'hi'
# generate_answer(query)

In [9]:
def get_embedding(text):
    """Generate an embedding for the given text using OpenAI's API."""

    # Check for valid input
    if not text or not isinstance(text, str):
        return None

    try:
        # Call OpenAI API to get the embedding
        openai.api_key = openai_api_key
        
        embedding = openai.embeddings.create(
            input=text,
            model="text-embedding-3-small", dimensions=1536).data[0].embedding
        return embedding
    except Exception as e:
        print(f"Error in get_embedding: {e}")
        return None

#q = 'Was sind die rechtlichen Grundlagen meines Studiums?'
#get_embedding(q)

### Vector Search

In [10]:
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

In [11]:
mongo_uri = getpass.getpass("Enter your MongoDB connection string:\n\n")

Enter your MongoDB connection string:

 ········


In [33]:
def get_mongo_client(mongo_uri):
    """Establish connection to the MongoDB."""

    # gateway to interacting with a MongoDB database cluster
    client = MongoClient(mongo_uri)
    print("Connection to MongoDB successful")
    return client

mongo_client = get_mongo_client(mongo_uri)

# Pymongo client of database and collection
db = mongo_client.get_database('chatter')
collection = db.get_collection('embedded_content')

Connection to MongoDB successful


In [13]:
text_embedding_field_name = "embedding"

def vector_search(user_query, db, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    db (MongoClient.database): The database object.
    collection (MongoCollection): The MongoDB collection to search.
    additional_stages (list): Additional aggregation stages to include in the pipeline.

    Returns:
    list: A list of matching documents.

    Notes:
    This function is from https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    # Define the vector search stage
    vector_search_stage = {
        "$vectorSearch": {
            "index": 'vector_index', # specifies the index to use for the search
            "queryVector": query_embedding, # the vector representing the query
            "path": text_embedding_field_name, # field in the documents containing the vectors to search against
            "numCandidates": 150, # number of candidate matches to consider
            "limit": 5 # return top 5 matches
        }
    }

    # Add a $project stage to include the URL field
    project_stage = {
        "$project": {
            "url": 1,  # Include the URL field
            "score": {"$meta": "vectorSearchScore"},  # Include the search score if needed
            # Add any other fields you want to include
        }
    }
    
    # Define the aggregate pipeline with the vector search stage and projection stage
    pipeline = [vector_search_stage, project_stage]


    # Execute the search
    results = collection.aggregate(pipeline)

    explain_query_execution = db.command( # sends a database command directly to the MongoDB server
        'explain', { # return information about how MongoDB executes a query or command without actually running it
            'aggregate': collection.name, # specifies the name of the collection on which the aggregation is performed
            'pipeline': pipeline, # the aggregation pipeline to analyze
            'cursor': {} # indicates that default cursor behavior should be used
        }, 
        verbosity='executionStats') # detailed statistics about the execution of each stage of the aggregation pipeline


    vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
    millis_elapsed = vector_search_explain['explain']['collectStats']['millisElapsed']

    print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")

    urls = [doc['url'] for doc in results]

    # print("Extracted URLs:", urls)

    return list(results), urls

In [21]:
# q = 'Was sind die rechtlichen Grundlagen meines Studiums?'
# vector_search(q, db, collection)[0]['text']

### Build a RAG system 

In [34]:
prompt_real_faqs = """
You a helpful chat-bot for the student administration office answering the students' questions.
Formulate an answer only based on a provided context. The record contains:

question: {question}
context: {text}

Give the answer in German. Don't use `\n\n`, `\n\n*` and other formating in your answer. 

""".strip()


def build_prompt(prompt_template , query, context):
    
    prompt = prompt_template.format(question = query, text = context).strip()
    
    return prompt

In [15]:
def generate_answer(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    response_content = response.choices[0].message.content
    return response_content


In [16]:
doc = faqs_data[0]

prompt = build_prompt(prompt_real_faqs ,  doc['question'], doc['answer'])
response = generate_answer(prompt)

response

'Die rechtlichen Grundlagen Ihres Studiums an der Universität Basel sind in verschiedenen Ordnungen festgelegt. Für alle Studierenden gilt die Studierendenordnung. Darüber hinaus gibt es spezifische Ordnungen für das Bachelor- und Masterstudium, die von der Philosophisch-Naturwissenschaftlichen Fakultät der Universität Basel herausgegeben werden. Diese Ordnungen regeln die Rahmenbedingungen für alle Studierenden, die an der Fakultät immatrikuliert sind oder Lehrveranstaltungen besuchen, die von dieser Fakultät angeboten werden.'

In [17]:
def get_context(query, db, collection):
    context = ''
    records, urls = vector_search(query, db, collection)
    for record in records:
        context += record['text']

    return context, urls

#get_context(q, db, collection)

In [18]:

def rag(query):
    prompt_template = prompt_real_faqs 
    
    search_results, links = get_context(query, db, collection)
    prompt = build_prompt(prompt_template, query, search_results)
    answer = generate_answer(prompt)
    return answer, links


    

In [35]:
query = faqs_data[0]['question']
human_answer = faqs_data[0]['answer']
 
prompt = build_prompt(prompt_real_faqs, query, human_answer)

almost_real_answer = generate_answer(prompt)

rag_answer, rag_links = rag(query) 

print('------------------------------------------------------------------------------')
print("Q: ", query)
print('------------------------------------------------------------------------------')
print("Human answer: ", human_answer)
print('------------------------------------------------------------------------------')
print("LLM processed answer:", almost_real_answer)
print('------------------------------------------------------------------------------')
print("RAG answer:", rag_answer)


Total time for the execution to complete on the database server: 0.106355 milliseconds
------------------------------------------------------------------------------
Q:  Was sind die rechtlichen Grundlagen meines Studiums?
------------------------------------------------------------------------------
Human answer:  Alle generellen Fragen rund um das Studium an der Universität Basel werden in den folgenden Ordnungen geregelt: Studierendenordnung (gilt für alle Studierenden der Uni Basel):   Bachelorstudium: Ordnung der Philosophisch Naturwissenschaftlichen Fakultät der Universität Basel für das Bachelorstudium-     Masterstudium: Ordnung der Philosophisch-Naturwissenschaftlichen Fakultät der Universität Basel für das MasterstudiumDiese Ordnungen gelten für alle Studierenden,  die entweder an der Phil.-Nat. Fakultät immatrikuliert sind, oder eine Lehrveranstaltung besuchen, die von der Phil.-Nat. Fakultät angeboten wird):  
----------------------------------------------------------------

In [36]:
import numpy as np 

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def compare_strings(str1: str, str2: str):
    # Get embeddings for both strings
    embedding1 = get_embedding(str1)
    embedding2 = get_embedding(str2)
    
    # Calculate cosine similarity
    similarity = cosine_similarity(embedding1, embedding2)
    
    return similarity


similarity_score_1 = compare_strings(almost_real_answer, human_answer)
similarity_score_2 = compare_strings(almost_real_answer, rag_answer)
similarity_score_3 = compare_strings(human_answer, rag_answer)

print(f"Semantic similarity between Human and LLM processed answers: {similarity_score_1:.4f}\n")
print(f"Semantic similarity between LLM processed and RAG answers: {similarity_score_2:.4f}\n")
print(f"Semantic similarity between Human and RAG answers: {similarity_score_3:.4f}\n")

Semantic similarity between Human and LLM processed answers: 0.8819

Semantic similarity between LLM processed and RAG answers: 0.7056

Semantic similarity between Human and RAG answers: 0.5962



In [23]:
def rag_recall(links, relevant_links):

    """
    Calculate the recall metric for a Retrieval-Augmented Generation (RAG) system.

    This function computes the recall by comparing the retrieved links to the relevant links.
    Recall is defined as the ratio of correctly retrieved relevant links to the total number
    of relevant links.

    Parameters:
    links (list): A list of retrieved links from the RAG system.
    relevant_links (list): A list of links that are considered relevant for the given query.

    Returns:
    float: The recall score, ranging from 0.0 to 1.0.
        - 1.0 if all relevant links are retrieved or if there are no relevant links.
        - 0.0 if no relevant links are retrieved.
        - Otherwise, the fraction of relevant links that were retrieved.

    Examples:
    >>> rag_recall(['1', '2', '3'], ['1', '2'])
    1.0
    >>> rag_recall(['1', '2', '3'], [])
    1.0
    >>> rag_recall(['1', '2', '3'], ['5'])
    0.0
    >>> rag_recall(['1', '2', '3'], ['1', '2', '3', '4'])
    0.75

    Note:
    - The function treats links as sets, so order and duplicates are ignored.
    - If more links are retrieved than are relevant, the maximum recall is still 1.0.
    """
    
    retrieved_links = set(links)
    relevant_set = set(relevant_links)
    
    num_correctly_retrieved = len(retrieved_links.intersection(relevant_set))
    num_relevant = len(relevant_set)

    if num_correctly_retrieved >= num_relevant:
        return 1.0
    elif num_relevant == 0:
        return 1.0
    elif num_correctly_retrieved == 0:
        return 0.0
    else:
        return num_correctly_retrieved/num_relevant

In [30]:
query = faqs_data[0]['question']
human_answer = faqs_data[0]['answer']
relevant_links = faqs_data[0]['links']

rag_context, rag_links =  get_context(query, db, collection)


Total time for the execution to complete on the database server: 0.14888 milliseconds


In [31]:
relevant_links

['https://www.unibas.ch/de/Studium/Studierendenordnung.html',
 'https://philnat.unibas.ch/de/studium/']

In [32]:
rag_links


['https://www.unibas.ch/de/Aktuell/Uni-Nova.html',
 'https://www.unibas.ch/de/Forschung/Uni-Nova.html',
 'https://www.unibas.ch/de/Studium/Studienangebot.html',
 'https://www.unibas.ch/de/Studium/Studienangebot.html',
 'https://nanoscience.unibas.ch/de/forschung/phd-programm/']

In [26]:
rag_recall(rag_links, relevant_links)

0.0

In [59]:
# which is not always representative, because, for example, 
# 'https://philnat.unibas.ch/de/studium/' and  'https://philnat.unibas.ch/de/studium/master/' 
# provide almost the same information

In [62]:
# Save results for further evaluation 

def get_dataset(faqs_data):
    
    test_dataset = []
    for i in range(len(faqs_data)):
    
        query = faqs_data[i]['question'] 
        
        rag_answer, rag_links = rag(query) 
        
        record = {
            'question' : query,
            'answer' : faqs_data[i]['answer'],
            'links' :  faqs_data[i]['links'],
            'rag_answer' : rag_answer, 
            'rag_links' : rag_links
        }
    
        test_dataset.append(record)
        print("Processed record #", i)

    return test_dataset



In [67]:
# test_dataset = get_dataset([faqs_data[0]])
# test_dataset

Total time for the execution to complete on the database server: 0.084718 milliseconds
Extracted URLs: ['https://philnat.unibas.ch/de/studium/master/', 'file:///../../data/446_520BMaSFh_01.pdf', 'file:///../../data/Merkblatt_Pruefungseinsicht_2021.pdf', 'https://philnat.unibas.ch/de/studium/ausserfakultaere-studienfaecher/', 'https://philnat.unibas.ch/de/studium/master/']
Processed record # 0


[{'question': 'Was sind die rechtlichen Grundlagen meines Studiums?',
  'answer': 'Alle generellen Fragen rund um das Studium an der Universität Basel werden in den folgenden Ordnungen geregelt: Studierendenordnung (gilt für alle Studierenden der Uni Basel):   Bachelorstudium: Ordnung der Philosophisch Naturwissenschaftlichen Fakultät der Universität Basel für das Bachelorstudium-\xa0\xa0 \xa0 Masterstudium: Ordnung der Philosophisch-Naturwissenschaftlichen Fakultät der Universität Basel für das MasterstudiumDiese Ordnungen gelten für alle Studierenden, \xa0die entweder an der Phil.-Nat. Fakultät immatrikuliert sind, oder eine Lehrveranstaltung besuchen, die von der Phil.-Nat. Fakultät angeboten wird):  ',
  'links': ['https://www.unibas.ch/de/Studium/Studierendenordnung.html',
   'https://philnat.unibas.ch/de/studium/'],
  'rag_answer': 'Die rechtlichen Grundlagen Ihres Studiums sind in verschiedenen Gesetzen und Vorschriften festgelegt. Dazu gehören in der Regel das Hochschulgesetz d

In [None]:

filename = 'test_dataset.json'

with open(filename, 'w') as f:
    json.dump(test_dataset, f, indent=4)

## Use fake FAQs

The second option is to generate questions based on the provided text, which gives a much larger dataset.

```python
prompt_fake_faqs = """
You emulate a student of the University of Basel.
Formulate 3 questions this student might ask based on the provided text. The record
should contain the answers to the questions, and the questions should be complete and not too short.
If possible, use as few words as possible from the text.

Text: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3"]
""".strip()

```


```python


prompt_evaluation = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response? 
"""
```

In [322]:
prompt_fake_faqs = """
You emulate a student of the University of Basel.
Formulate 3 questions this student might ask based on the provided text. The record
should contain the answers to the questions, and the questions should be complete and not too short.
If possible, use as few words as possible from the text.

Text: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3"]
""".strip()

In [70]:

# query = faqs_data[0]['question']
# human_answer = faqs_data[0]['answer']
# relevant_links = faqs_data[0]['links']

# rag_context, rag_links =  get_context(query, db, collection)

# d = cleaned_docs[0]

 
# prompt = build_prompt(prompt_fake_faqs , d)
# response = generate_new_dataset(prompt)


In [68]:
#prompt

In [69]:
#response