# Chatbot Testing with Random Sampling 

## Overview

This Jupyter Notebook is designed to facilitate the testing of the chatbot using random sampling from a predefined set of sample questions.

To use this notebook, specify the path to the file containing the sample questions, set the number of windows, and choose the number of samples to draw from each window. Optionally, name the experiment and change the testing configurations. Then run the remaining cells to perform the tests.

-----------------------------------

## Customize the Dataset Selection  


In [1]:
# Number of consecutive windows (folds) the question bank is split into before sampling
num_windows = 10
# Number of questions drawn at random from each window
sample_size = 10
# Path to the question-bank Excel file. A later cell loads it with pd.read_excel and
# requires it to contain 'question' and 'answer' columns.
# NOTE(review): machine-specific absolute path; replace it with your own location.
file_path = r"C:\Users\theak\Downloads\question_bank.xlsx"  # Enter the question bank Excel file path

# Name of the experiment to be run.
# NOTE(review): the "[Example] Creating a new config" cell assigns experiment_name again,
# so this value only applies until that cell is executed.
experiment_name = "config_GPTPineconeSolution_Volume_1_rules_metadata"


# Further configuration options can be set in the "Run the Automated Tests" section.
# NOTE(review): no section with that title is visible in this notebook; confirm the reference.

## Installing Python Packages

In [11]:
!pip install pandas openpyxl

Defaulting to user installation because normal site-packages is not writeable


## Imports

In [4]:
!pip install python-dotenv
!pip install -r ../requirements.txt

Collecting beautifulsoup4==4.12.3 (from -r ../requirements.txt (line 1))
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting boto3==1.34.144 (from -r ../requirements.txt (line 2))
  Downloading boto3-1.34.144-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore==1.34.144 (from -r ../requirements.txt (line 3))
  Downloading botocore-1.34.144-py3-none-any.whl.metadata (5.7 kB)
Collecting deepeval==0.21.60 (from -r ../requirements.txt (line 4))
  Downloading deepeval-0.21.60-py3-none-any.whl.metadata (986 bytes)
Collecting itemadapter==0.9.0 (from -r ../requirements.txt (line 5))
  Downloading itemadapter-0.9.0-py3-none-any.whl.metadata (17 kB)
Collecting langchain_aws==0.1.8 (from -r ../requirements.txt (line 6))
  Downloading langchain_aws-0.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_community==0.2.5 (from -r ../requirements.txt (line 7))
  Downloading langchain_community-0.2.5-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain_core=

## [Example] Creating a new config

In [2]:
import json
import os

# Define your configuration dictionary
# experiment_name doubles as the name of the JSON file written below.
experiment_name = "config_GPTPineconeSolution_Volume_1_rules_metadata_rules_filter_third_1"
dataset_name = "the-one-question"

# SECURITY: the Pinecone API key must never be hard-coded in the notebook. It is read
# from the PINECONE_API_KEY environment variable instead. Any key that was previously
# committed with this notebook should be treated as leaked and rotated.
pinecone_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Define it in the environment "
        "before running this cell."
    )

config = {
    "exe_simple_test": False,
    "exe_automated_test": True, # enable this for automated testing
    "exe_streamlit_app": False,
    "chain_config":{
        "solution_class":"GPTPineconeSolution",
        "args":{
            "pinecone_index_name":"vol1-rules-metadata",
            "embed_model":"text-embedding-3-small",
            "gen_model":"gpt-4o",
            "search_type": "mmr",
            "search_k": 8,
            "search_lambda": 0.25,
            "pinecone_key": pinecone_key
        }
    },
    "automated_test_config":{
        "evaluators":["de_contextual_recall","de_faithfulness","de_correctness"], # pick from available evaluators
        "dataset_name":dataset_name, # name of dataset in langsmith
        "per_q_repeat":1,
        "split_data":True, # runs different groups in the dataset if groups are available
        "splits": ["base"],# default split if no custom splits are created
        "evaluator_model":"gpt-4o",
        "experiment_name": experiment_name # custom experiment name to identify the run
    }
}

# Define your filename
json_file_path = f"config/automated_tests/{config['automated_test_config']['experiment_name']}.json"

# Write the configuration to a JSON file. The target directory is created on demand so
# the cell also works on a fresh checkout.
# NOTE: the written file contains the API key in plain text; keep it out of version control.
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=4)

print(f"Configuration saved to {json_file_path}")

Configuration saved to config/automated_tests/config_GPTPineconeSolution_Volume_1_rules_metadata_rules_filter_third_1.json


## [Example] Run a test

In [1]:
import pandas as pd
import dotenv
import os
from langsmith import Client
from datetime import datetime

# Point the process at the AWS credential/config files and pin the region. This runs
# before the project imports that follow in this cell.
# NOTE(review): both file paths contain "rag_mvp_current./" (with a dot) while the
# config path used later in this cell has "rag_mvp_current/"; confirm which is intended.
os.environ.update({
    'AWS_SHARED_CREDENTIALS_FILE': "C:/Users/EbrahimAlhaddad/ARRAY/rag_mvp_current./chat-bot/prototype/.aws/credentials",
    'AWS_CONFIG_FILE': "C:/Users/EbrahimAlhaddad/ARRAY/rag_mvp_current./chat-bot/prototype/.aws/config",
    'AWS_REGION': 'me-south-1',
})

from common.chain_generator import generate_chain
from evaluation.test import run_test, _store_test_result_csv 
from common.utils import load_config

# Config file that drives this run.
# NOTE(review): machine-specific absolute path. It also rebinds json_file_path, which the
# "[Example] Creating a new config" cell set to the file it had just written.
json_file_path = "C:/Users/EbrahimAlhaddad/ARRAY/rag_mvp_current/chat-bot/prototype/config/config_GPTPineconeSolution_Volume_1_rules_meta_curated_test_12_doc.json"


# load_config returns two values, unpacked here as the chain settings and the test settings.
chain_config, test_config = load_config(json_file_path)

# generate chain from config file
solution = generate_chain(chain_config)

# run automated test; only the "automated_test_config" entry of test_config is passed on
result = run_test(solution= solution, 
        automated_test_config=test_config["automated_test_config"]
    )

# Bare last expression so the notebook displays whatever run_test returned.
result

  from tqdm.autonotebook import tqdm


NameError: name 'config' is not defined

## [Example] Storing experiment results from the LangSmith page (if local storing fails)

In [None]:

from bs4 import BeautifulSoup
import os, re
import pandas as pd

def print_tree(element, level=0):
    """Print an indented outline of ``element`` and everything nested inside it.

    Each tag is printed as ``<name>`` with two spaces of indentation per nesting
    level; non-blank text children are printed stripped, one level deeper than
    their parent. ``<style>`` and ``<script>`` elements are skipped together
    with their contents.

    Args:
        element: Node exposing ``name`` and ``children``; text children are
            expected to have ``name`` set to ``None`` and carry ``string``.
        level: Nesting depth of ``element``; controls the indentation.
    """
    # Skip style/script subtrees entirely.
    if element.name in ('style', 'script'):
        return

    print('  ' * level + f"<{element.name}>")

    child_indent = '  ' * (level + 1)
    for node in element.children:
        if node.name is not None:
            # A nested tag: recurse one level deeper.
            print_tree(node, level + 1)
            continue
        # A text node: show it only when it is more than whitespace.
        text = node.string.strip()
        if text:
            print(child_indent + text)

def get_text(parent):
    """Return the stripped ``string`` of every descendant of ``parent`` that has one.

    Descendants come from ``parent.find_all(recursive=True)``. Entries whose
    ``string`` is missing, empty or whitespace-only are left out, and the order
    of the remaining texts follows the order returned by ``find_all``.
    """
    return [
        node.string.strip()
        for node in parent.find_all(recursive=True)
        if node.string and node.string.strip()
    ]

In [None]:
# Copy html source into a local file
FILE_PATH = "C:/Users/user/evals/experiment.html"

# Set output file path
OUTPUT_CSV = "C:/Users/user/evals/experiment.csv"


# Read with an explicit encoding: without it open() falls back to the platform default
# (e.g. cp1252 on Windows), which garbles or rejects non-ASCII text in the page.
# NOTE(review): assumes the saved page is UTF-8 - confirm for your browser/editor.
with open(FILE_PATH, 'r', encoding='utf-8') as file:
    html_content = file.read()
soup = BeautifulSoup(html_content, 'html.parser')

# Collect every table row (<tr>) of the page
all_list = soup.find_all('tr')

# Define a list to hold the row data
data = []
# Start at 1: the first <tr> is skipped (presumably the table header row).
for i in range(1, len(all_list)):
    # Fetch the td elements
    all_threads = all_list[i].find_all('td')

    # Rows without the three expected cells cannot be parsed; report and move on
    # instead of aborting the whole export with an IndexError.
    if len(all_threads) < 3:
        print(f"skipping row {i}: expected at least 3 cells, found {len(all_threads)}")
        continue

    # Extract the values using the get_text function
    input_question = get_text(all_threads[0])[0]
    reference_answer = get_text(all_threads[1])[0]
    # The third cell holds the answer followed by the evaluator scores, so its
    # texts are extracted once and reused below.
    output_texts = get_text(all_threads[2])
    output_answer = output_texts[0]

    # The scores sit at fixed offsets within the third cell's texts; a row where they
    # are missing or not numeric keeps its texts and gets empty scores.
    try:
        recall = float(output_texts[5])
        correctness = float(output_texts[9])
        faithfulness = float(output_texts[13])
    except Exception as e:
        recall = correctness = faithfulness = None
        print(f"failed on a row {i}: {e}")
    # Append the row as a list of values
    data.append([input_question, reference_answer, output_answer, recall, correctness, faithfulness])

# Create a pandas DataFrame from the collected data
df = pd.DataFrame(data, columns=['Input Question', 'Reference Answer', 'Output Answer', 'Recall', 'Correctness', 'Faithfulness'])
# store locally
df.to_csv(OUTPUT_CSV)

## [Example] Query a single question for a response

In [9]:
from common.chain_generator import generate_chain
# generate chain from config file
# `config` is the dictionary built in the "[Example] Creating a new config" cell, so that
# cell must have been executed in the current kernel before this one.
solution = generate_chain(config["chain_config"])

# ask chain a simple question
print("Invoking: ")
result = solution.invoke("What was clarified in Rule AU-A.1.3 in January 2011?")
# The result exposes the generated answer as `response` and an iterable `context`;
# each context entry is printed on its own line as a citation.
print(result.response)
for citation in result.context:
    print(citation)


Invoking: 


2024-08-20 13:18:46,427 [INFO] {_client::_send_single_request} HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-20 13:18:52,935 [INFO] {_client::_send_single_request} HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In January 2011, Rule AU-A.1.3 was clarified to categorize the Module as a Directive. Specifically, the rule states:

"This Module contains the CBB's Directive (as amended from time to time) relating to auditors and accounting standards used by conventional bank licensees, and is issued under the powers available to the CBB under Article 38 of the CBB Law. The Directive in this Module is applicable to all conventional bank licensees."

You can find more information [here](https://cbben.thomsonreuters.com/rulebook/au-a13-0).
page_content='##  AU-A.1 Purpose\n\n  * [ AU-A.1.1 ](/rulebook/au-a11-3)\n  * [ AU-A.1.2 ](/rulebook/au-a12-3)\n  * [ Legal Basis ](/rulebook/legal-basis-42)\n    * [ AU-A.1.3 ](/rulebook/au-a13-0)\n    * [ AU-A.1.4 ](/rulebook/au-a14-0)\n\n' metadata={'link': 'https://cbben.thomsonreuters.com/rulebook/au-a1-purpose-3', 'name': 'rulebook_au-a1-purpose-3.md', 'path': 'Central Bank of Bahrain Volume 1—Conventional Banks/Part A/High Level Standards/AU Auditors and Acco

## [Example] Creating a subset of the test data by random sampling

In [None]:
# Function to split and sample questions from windows
def split_and_sample_questions(df, num_windows, sample_size, random_state=None):
    """Sample ``sample_size`` rows from each of ``num_windows`` consecutive windows of ``df``.

    The rows of ``df`` are partitioned, in their existing order, into
    ``num_windows`` contiguous windows that together cover every row. When the
    row count is not divisible by ``num_windows`` the window sizes differ by at
    most one, so no trailing rows are left out of the sampling. From each window
    ``sample_size`` distinct rows are drawn at random.

    Args:
        df: DataFrame holding one question per row.
        num_windows: Number of windows to split the rows into; must be positive.
        sample_size: Number of rows to draw per window; must be positive.
        random_state: Optional seed for reproducible sampling. ``None`` (the
            default) uses pandas' default random source, as before.

    Returns:
        DataFrame with ``num_windows * sample_size`` rows of ``df``, grouped by
        window in window order.

    Raises:
        ValueError: If ``num_windows`` or ``sample_size`` is not positive, or if
            ``num_windows * sample_size`` exceeds the number of rows.
    """
    num_questions = len(df)
    if num_windows <= 0 or sample_size <= 0:
        raise ValueError("Number of windows and sample size must be positive integers.")
    if num_windows * sample_size > num_questions:
        raise ValueError("Number of windows times sample size exceeds the total number of questions available.")

    # One generator shared by all windows: a single seed reproduces the whole draw
    # without every window picking the same relative positions.
    rng = None
    if random_state is not None:
        import numpy as np
        rng = np.random.default_rng(random_state)

    # Work on row positions rather than index labels so that a non-unique index
    # cannot duplicate rows in the result.
    positional = df.reset_index(drop=True)
    sampled_positions = []

    # Split into windows and sample
    for i in range(num_windows):
        # Proportional boundaries spread the remainder rows over the windows. Each
        # window has at least num_questions // num_windows >= sample_size rows,
        # which the validation above guarantees.
        start_index = i * num_questions // num_windows
        end_index = (i + 1) * num_questions // num_windows
        window = positional.iloc[start_index:end_index]
        sampled_positions.extend(window.sample(n=sample_size, random_state=rng).index)

    return df.iloc[sampled_positions]


In [None]:
# Load the question bank from the Excel file configured at the top of the notebook
df = pd.read_excel(file_path)

# Refuse to continue when a required column is missing
required_columns = ['question', 'answer']
if any(column not in df.columns for column in required_columns):
    raise ValueError(f"Excel file must contain the following columns: {', '.join(required_columns)}")

# Build the test dataset name from the number of windows, the sample size and a
# date-time stamp
timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
dataset_name = f"test-dataset_vol1_{num_windows}_windows_{sample_size}_sample_{timestamp}"
print(f"Dataset name: {dataset_name}")

In [None]:
# Sample the question bank and upload the sample to LangSmith as a new dataset.
# Errors are reported with print() and do not stop the notebook.
try:
    # Raises ValueError when num_windows / sample_size do not fit the question bank.
    sampled_df = split_and_sample_questions(df, num_windows, sample_size)
    # Print sampled questions and answers
    print(sampled_df)
    
    # Load variables from a .env file (if any) before the API key is read.
    dotenv.load_dotenv()    
    # Post the testing dataset to LangSmith
    # os.getenv returns None when LANGCHAIN_API_KEY is not defined.
    api_key = os.getenv("LANGCHAIN_API_KEY")
    client = Client(api_key=api_key)

    # Convert DataFrame to dictionary format
    # NOTE(review): sampled_data is not used again in this cell; the upload below
    # takes the DataFrame itself.
    sampled_data = sampled_df.to_dict(orient='records')

    # Define input and output columns based on the Excel header row
    input_keys = ['question']  # This should match the column name for questions
    output_keys = ['answer']   # This should match the column name for answers

    # Upload the DataFrame as a dataset to LangSmith
    # dataset_name is the timestamped name built in the previous cell.
    dataset = client.upload_dataframe(
        df=sampled_df,
        input_keys=input_keys,
        output_keys=output_keys,
        name=dataset_name,
        description="A dataset of sampled questions and answers from Excel for volume 1",
        data_type="kv"  # The default type
    )

    # Print the dataset object to understand its structure
    print("Dataset uploaded to LangSmith:", dataset)

# Validation problems (e.g. from split_and_sample_questions) are reported separately
# from any other failure, such as a failed upload.
except ValueError as e:
    print(f"Error: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
