## (5) Case Study: Evaluate models on use cases and for safety
##### (GenAI Life Cycle Phase 5: Evaluation self-assessment)

---

In [3]:
import ipywidgets as widgets
from IPython.display import display

# Cloud-Based LLM and RAG Content
#
# Static HTML for the three parts (4a, 4b, 4c) of the solution panel shown by this cell.
# Each entry is a two-item list: [heading HTML, body HTML].
# The Python samples inside the <pre> elements are plain text for the reader;
# nothing inside these strings is executed by this cell.
cloud_llm_rag_data = [
    # (4a) Explanatory text, a Gemini setup sample, and a link to a notebook under case-files/.
    [
        "<b>(4a) Prompt Engineering in Google AI Studio:</b>",
        (
            # One triple-quoted string. The \"\"\" escapes inside it produce literal triple
            # quotes in the displayed sample without terminating this string early.
            # Sample lines start at column 0 because <pre> keeps whitespace as written.
            """
            <div>
                <p>You should have designed and refined your own prompt in Google AI Studio to ensure the Gemini model 
                accurately interprets user intents and generates contextually relevant responses.</p>
                <p>Below is a code segment you can use with a prompt usable for the given case:</p>
                <div style='border: 1px dashed #1e7e34; padding: 10px; margin-top: 10px;'>
                    <b>Code Segment:</b>
                    <pre style='background-color: #f8f9fa; border: 1px solid #ccc; padding: 10px; font-family: monospace;'>
# Import Google GenerativeAI Python module
import google.generativeai as genai

# Define Gemini API key
genai.configure(api_key="YOUR_GEMINI_API_KEY")

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

# Specify model name and define system instruction
model = genai.GenerativeModel(
  model_name="gemini-1.5-pro",
  generation_config=generation_config,
  system_instruction=\"\"\"You are a virtual restaurant and bar recommendation assistant. Your goal is to provide users with highly personalized recommendations based on their preferences and needs.

Here are some guidelines to follow:

- Understand the User's Intent: Carefully analyze the user's query.
- Leverage User Preferences: Utilize the user's past behavior.
- Consider Dietary Restrictions: Factor in dietary restrictions.
- Provide Relevant Information: Offer details like cuisine type, price, and ambiance.
- Handle Ambiguous Queries: Ask clarifying questions.
- Be Conversational and Engaging: Maintain a friendly tone.\"\"\"
)

# Acceptable past chat for reference
chat_session = model.start_chat(
  history=[
    {"role": "user", "parts": ["Hello"]},
    {"role": "model", "parts": ["Hello there! I am a virtual agent for Welp!"]},
  ]
)
                    </pre>
                </div>
                <div style='margin-top: 10px;'>
                    <a href='case-files/ailtk-running-code-case-3.ipynb' target='_blank' style='color: #1e7e34; text-decoration: underline;'>Click here to open Solution: Case Study 4 in Visual Studio Code</a>
                </div>
            </div>
            """
        )
    ],
    # (4b) Short note about reusing the API key. The adjacent string literals below are
    # concatenated by Python into a single body string.
    # NOTE(review): the link uses href='#', which looks like a placeholder target - confirm.
    [
        "<b>(4b) API Integration:</b>",
        (
            "You will be able to reuse your API key from Practice Learning Activity 4 for the sake of this case.<br>"
            "<div style='margin-top: 10px;'>"
            "<a href='#' target='_blank' style='color: #1e7e34; text-decoration: underline;'>Click here to review receiving and accessing your API key</a>"
            "</div>"
        )
    ],
    # (4c) RAG sample: intro text, an opening <pre> wrapper, the sample code, and the closing tags,
    # again joined by implicit string concatenation.
    [
        "<b>(4c) Data Utilization with RAG:</b>",
        (
            "Apply Retrieval-Augmented Generation (RAG) techniques to combine the fine-tuned Gemini model with curated datasets.<br>"
            "<div style='border: 1px dashed #1e7e34; padding: 10px; margin-top: 10px;'>"
            "<b>Code Segment:</b><br><pre style='background-color: #f8f9fa; border: 1px solid #ccc; padding: 10px; font-family: monospace;'>"
            # The triple-quoted literal below is NOT an f-string, so the {row[...]}, {user_prompt}
            # and {rag_result} placeholders in the sample are shown verbatim, not interpolated.
            """
# Example: Jaccard Similarity for Document Matching
import pandas as pd

df = pd.read_excel("solution-case-study-activity-3/ailtk-case-apache-hop-output.xls")

corpus = df.apply(lambda row: f"{row['input']}. {row['output']}", axis=1).tolist()

def jaccard_similarity(query, document):
    query = query.lower().split(" ")
    document = document.lower().split(" ")
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

def return_response(query, corpus, top_n=5):
    similarities = []
    
    # Calculate similarity for each document in the corpus
    for doc in corpus:
        similarity = jaccard_similarity(query, doc)
        similarities.append(similarity)
    
    # Get the indices of the top_n most similar documents
    top_n_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:top_n]
    
    # Return the top_n most similar documents
    top_n_documents = [corpus[i] for i in top_n_indices]
    
    return top_n_documents

# Define a function to find documents similar to the user's input, 
# Provide LLM with an injected prompt, and receive response

def generate_response_with_injected_prompt(user_prompt, corpus, model):
# Generates a response using a model with injected prompt from RAG results.

# Parameters:
# - user_prompt (str): The user's input prompt (e.g., preferences for coffee).
# - corpus (list): The corpus of documents to search for similarities.
# - model (object): The model used to generate content based on the injected prompt.
    
    # RAG result on the user's input
    rag_result = return_response(user_prompt, corpus)
    
    # View five most similar documents from corpus according to jaccard similarity
    print(rag_result)
    
    # Append input to create an injected prompt
    injected_prompt = f"{user_prompt} {rag_result}"
    
    # Call your model and input the injected prompt
    response = model.generate_content(injected_prompt)
    
    # Return the response text
    return response.text

            """
            "</pre></div>"
        )
    ]
]

# Render every [heading, body] pair as its own HTML widget and stack them vertically
cloud_llm_rag_content = widgets.VBox(
    [
        widgets.HTML(value=f"{heading}<br>{body}")
        for heading, body in cloud_llm_rag_data
    ]
)

# Parts of the framed solution panel: title, green divider line, frame layout
_panel_title = widgets.HTML(
    value="<h3 style='color: #1e7e34;'>Solution of \"(4) Case Study: Accessing cloud-based LLM models and implementing RAG\"</h3>"
)
_panel_divider = widgets.HTML(value="<hr style='border: 1px solid #1e7e34;'>")
_panel_layout = widgets.Layout(
    border="2px solid #1e7e34",
    padding="20px",
    width="90%",
    margin="20px 0px",
)

# Styled Box for Cloud-Based LLM and RAG: title, divider, then the stacked sections
styled_cloud_llm_box = widgets.Box(
    [_panel_title, _panel_divider, cloud_llm_rag_content],
    layout=_panel_layout,
)

# Show the assembled panel as the cell output
display(styled_cloud_llm_box)


Box(children=(HTML(value='<h3 style=\'color: #1e7e34;\'>Solution of "(4) Case Study: Accessing cloud-based LLM…

---

#### Case Scenario
> Welp's restaurant recommendation virtual assistant has now progressed to the evaluation phase, where its capabilities and safety must be thoroughly tested to ensure it meets real-world demands and adheres to ethical AI guidelines. With its Retrieval-Augmented Generation (RAG) system integrated, the assistant can provide personalized restaurant suggestions and contextual justifications. However, it is essential to confirm that these recommendations are accurate, inclusive, and aligned with user expectations.
>
> As the AI developer, your task is to evaluate the assistant’s performance across various use cases. This includes testing its ability to handle diverse user inputs—ranging from vague or incomplete queries to specific and detailed requests. Additionally, you must assess the assistant's adherence to ethical AI principles, such as avoiding biased or discriminatory suggestions, and ensuring that its responses remain neutral and user-focused.
>
> The virtual assistant must also maintain a professional and approachable tone, consistent with Welp's branding, to instill trust and encourage user engagement.
>
> Your Tasks:
> 
> (a) Performance Evaluation:
> Test the virtual assistant’s accuracy and relevance in delivering restaurant recommendations across multiple scenarios, including users with dietary restrictions, specific cuisine preferences, or location constraints. Analyze edge cases, such as conflicting user inputs or ambiguous requests.
>
> (b) Safety and Ethical Testing:
> Examine the assistant’s outputs for potential biases or safety concerns. For instance, verify that the assistant does not promote unhealthy eating habits or unfairly prioritize certain restaurant categories over others. Additionally, ensure that its recommendations remain respectful and appropriate for diverse cultural contexts.
> 
> By the end of this activity, you will have applied best practices for model evaluation, gaining hands-on experience in ensuring that AI systems are not only functional but also safe, ethical, and aligned with user needs.

---

##### Pre-requisite:
- Create a Jupyter Notebook and load your code for RAG (from the previous chapter).

### Perform the tasks as follows:


#### **(a) Performance Evaluation:** Test the virtual assistant’s accuracy and relevance in delivering restaurant recommendations across multiple scenarios, including users with dietary restrictions, specific cuisine preferences, or location constraints.

#### **(b) Safety and Ethical Testing:** Examine the assistant’s outputs for potential biases or safety concerns.

<a href='https://huggingface.co/datasets?task_categories=task_categories:question-answering&sort=trending' target='_blank'>Find question-answering datasets on Hugging Face to use as performance test cases</a>

<a href='https://huggingface.co/collections/harpreetsahota/red-teaming-prompts-656256235475849b82a91813' target='_blank'>Find red-teaming prompts on Hugging Face for safety and ethical testing</a>


##### Answer the following to proceed:

In [9]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Quiz content, kept as compact (prompt, choices, index of the correct choice) records.
# Referring to the correct choice by index means the answer text is never typed twice.
_QUIZ_ITEMS = [
    (
        "What is the primary goal of performance testing?",
        [
            "To identify edge cases and test model robustness",
            "To ensure the agent adheres to ethical AI guidelines",
            "To simulate scenarios and ensure accurate recommendations",
            "To train the model on additional industry-specific data",
        ],
        2,
    ),
    (
        "Which input should the model evaluate for possible ethical problems?",
        [
            "Common user preferences",
            "Beginner coffee drinker queries",
            "Advanced enthusiast requests",
            "\"Red team\" prompts and edge cases",
        ],
        3,
    ),
    (
        "What is a key task in the safety and ethical review of virtual agents?",
        [
            "Testing the agent for biases in recommendations",
            "Adding new features for advanced enthusiasts",
            "Optimizing the agent's response time",
            "Reducing the corpus size for faster processing",
        ],
        0,
    ),
    (
        "What should the virtual agent avoid to maintain brand guidelines?",
        [
            "Using an overly casual tone",
            "Suggesting new possible things to try",
            "Providing info from the corpus provided",
            "Recommending unique or less popular options",
        ],
        0,
    ),
    (
        "What tool is suggested for identifying potential vulnerabilities in the AI system?",
        [
            "Red teaming",
            "Data augmentation",
            "Performance tuning",
            "Corpus filtering",
        ],
        0,
    ),
    (
        "What is the expected behavior of the agent when handling ambiguous inputs?",
        [
            "It should ask clarifying questions.",
            "It should recommend the most popular coffee.",
            "It should generate a random coffee suggestion.",
            "It should refer the user to customer support.",
        ],
        0,
    ),
    (
        "What criteria should the model meet to pass safety evaluations?",
        [
            "Accurate, unbiased, and practical recommendations",
            "Speed of response under 0.5 seconds",
            "Support for all global languages",
            "Ability to generate a complete coffee brewing guide",
        ],
        0,
    ),
    (
        "What is the purpose of red team prompts in model evaluation?",
        [
            "To identify flaws in the model’s ethical or safety handling",
            "To provide additional training data for the model",
            "To improve the speed of the recommendation engine",
            "To add variety to user queries",
        ],
        0,
    ),
    (
        "How does the system handle user input that requests unethical actions?",
        [
            "It denies the request and provides ethical guidelines.",
            "It generates a placeholder response.",
            "It logs the input for future training.",
            "It refers the query to a human operator.",
        ],
        0,
    ),
    (
        "What is an example of an edge case a model must be evaluated on?",
        [
            "Handling contradictory or ambiguous user inputs",
            "Providing information on commonly known topics",
            "Responding to straightforward, factual queries",
            "Delivering recommendations within well-defined parameters",
        ],
        0,
    ),
]

# Expand each record into the {"question", "options", "answer"} dict the rest of the cell reads.
questions = [
    {"question": prompt, "options": choices, "answer": choices[correct_index]}
    for prompt, choices, correct_index in _QUIZ_ITEMS
]

# One (label, radio group) pair per question; numbering starts at Q1.
# value=None leaves every radio group unselected until the user picks an option.
quiz_widgets = [
    (
        widgets.Label(value=f"Q{number}: {item['question']}"),
        widgets.RadioButtons(
            options=item['options'],
            description='',
            disabled=False,
            value=None,
            # Full-width, auto-height layout so longer options are not cut off
            layout=widgets.Layout(width='90%', height='auto'),
        ),
    )
    for number, item in enumerate(questions, start=1)
]

# Submit control and the output area that receives the feedback
submit_button = widgets.Button(description="Submit Answers", button_style="primary")
output = widgets.Output()

# Module-level flag recording whether the "answer all questions" error is currently shown
error_displayed = False

# Define button click event
def on_submit_click(b):
    """Grade the quiz when the submit button is clicked.

    Reads the selected value of every radio group in ``quiz_widgets`` and compares
    it with the matching entry in ``questions``. All feedback is written into the
    ``output`` widget, replacing whatever that widget showed before.

    If any question is unanswered, a red error message is shown and the button is
    re-enabled. Otherwise a per-question summary and the total score are printed,
    followed by a "Continue" link (score of at least 80%) or a "Try Again" link.
    On a complete submission the button stays disabled.

    Parameters:
    - b: the clicked button, passed in by ``submit_button.on_click``; not used.
    """
    global error_displayed

    # Disable the submit button while this submission is being processed
    submit_button.disabled = True

    # True as soon as one radio group has no selection
    unanswered = any(options.value is None for _, options in quiz_widgets)

    with output:
        # Clear INSIDE the Output context: only then does clear_output target this
        # widget. Called outside, the widget was never cleared, so an earlier error
        # message stayed visible above the results of a later, complete submission.
        clear_output(wait=True)

        if unanswered:
            # Kept in sync for any code that still reads the module-level flag
            error_displayed = True
            # Display error message in red
            display(widgets.HTML(
                '<p style="color: red; font-weight: bold;">Please answer all the questions before submitting.</p>'
            ))
            submit_button.disabled = False  # Re-enable button so the user can retry
            return

        error_displayed = False
        submit_button.button_style = ""  # Reset button style to default after click

        # Calculate score and echo every question with the given and correct answer
        score = 0
        total = len(questions)
        for number, (item, (_, options)) in enumerate(zip(questions, quiz_widgets), start=1):
            user_answer = options.value
            correct_answer = item["answer"]
            if user_answer == correct_answer:
                score += 1
            print(f"Q{number}: {item['question']}")
            print(f"  - Your answer: {user_answer}")
            print(f"  - Correct answer: {correct_answer}")
            print()

        # Guard the percentage against an empty question list
        percentage = (score / total) * 100 if total else 0.0
        print(f"You scored {score}/{total}! ({percentage:.2f}%)")

        # Show Continue or Try Again link based on the 80% pass mark
        if score >= 0.8 * total:
            continue_button = widgets.HTML(
                '<a href="case-study-4.ipynb" style="display: inline-block; padding: 10px 15px; '
                'background-color: #28a745; color: white; text-decoration: none; border-radius: 5px;">'
                'Continue</a>'
            )
            display(continue_button)
        else:
            try_again_button = widgets.HTML(
                '<a href="case-study-3.ipynb" style="display: inline-block; padding: 10px 15px; '
                'background-color: #dc3545; color: white; text-decoration: none; border-radius: 5px;">'
                'Score at least 80% to continue. Try Again</a>'
            )
            display(try_again_button)

# Register the click handler before anything is rendered
submit_button.on_click(on_submit_click)

# Render the quiz: each question label followed by its choices,
# then the submit button and the area that receives the feedback
for prompt_widget, choices_widget in quiz_widgets:
    display(prompt_widget)
    display(choices_widget)
display(submit_button)
display(output)


Label(value='Q1: What is the primary goal of performance testing?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('To identify edge cases and test model robust…

Label(value='Q2: Which input should the model evaluate for possible ethical problems?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('Common user preferences', 'Beginner coffee d…

Label(value='Q3: What is a key task in the safety and ethical review of virtual agents?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('Testing the agent for biases in recommendati…

Label(value='Q4: What should the virtual agent avoid to maintain brand guidelines?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('Using an overly casual tone', 'Suggesting ne…

Label(value='Q5: What tool is suggested for identifying potential vulnerabilities in the AI system?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('Red teaming', 'Data augmentation', 'Performa…

Label(value='Q6: What is the expected behavior of the agent when handling ambiguous inputs?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('It should ask clarifying questions.', 'It sh…

Label(value='Q7: What criteria should the model meet to pass safety evaluations?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('Accurate, unbiased, and practical recommenda…

Label(value='Q8: What is the purpose of red team prompts in model evaluation?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('To identify flaws in the model’s ethical or …

Label(value='Q9: How does the system handle user input that requests unethical actions?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('It denies the request and provides ethical g…

Label(value='Q10: What is an example of an edge case a model must be evaluated on?')

RadioButtons(layout=Layout(height='auto', width='90%'), options=('Handling contradictory or ambiguous user inp…

Button(button_style='primary', description='Submit Answers', style=ButtonStyle())

Output()

***END CAP1*** 

[[***TODO CAP2 - Proceed to Case Study***] Next: Case Study 6](case-study-6.ipynb)
