<a href="https://colab.research.google.com/github/ffeldhaus/conversational-agents-intent-improver-agent/blob/main/Improve_Conversational_Agents_(Dialogflow_CX)_Intents_via_Reinforcement_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet google-cloud-dialogflow-cx google-genai tqdm langcodes

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import io
import re
import uuid

from google.colab import auth
from google.colab import files

from google.api_core import retry
from google.api_core.client_options import ClientOptions

from google.cloud import dialogflowcx_v3 as dialogflow

from google import genai
from google.genai import types

from pydantic import BaseModel, Field

import pandas as pd

from tqdm.auto import tqdm

from langcodes import Language

In [None]:
# @title Configuration { display-mode: "form" }

# Notebook-wide settings. Every assignment below creates a module-level global
# that the later cells read directly. The trailing "# @param" annotations must
# stay on the same line as the value they describe.

# @markdown # Dialogflow CX Agent Configuration
# @markdown The full name of the Dialogflow CX agent, including the project and location:
AGENT_NAME = "projects/ucds-testsystem/locations/europe-west3/agents/e444b62c-6b83-489b-b41c-373c62093972" # @param {type:"string"}
# @markdown The ID of the flow to be tested within the agent:
FLOW_ID = "135bf7a3-7481-4e69-923b-f9dcfc4bec6e" # @param {type:"string"}
# @markdown The ID of the page where the test will start:
PAGE_ID = "b9d110b1-d061-471a-a568-68f6198efcb3" # @param {type:"string"}
# @markdown  The language code to use for the test (e.g., "en", "de"). If left empty, the agent's default language will be used:
LANGUAGE_CODE = "" # @param {type:"string"}
# @markdown The NLU classification threshold for including alternative intents in the match results:
NLU_THRESHOLD_ALTERNATIVE_MATCHING_INTENTS = 0.0001 # @param {type:"number"}
# NOTE: the split cell multiplies this rate by each intent's sentence count to
# size the held-out evaluation set for that intent.
# @markdown The proportion of the test data to be used for the final evaluation (the rest will be used for reinforcement learning):
TEST_TRAIN_SPLIT_RATE = 0.6 # @param {type:"number"}
# @markdown The number of reinforcement learning iterations to perform:
REINFORCEMENT_ITERATIONS = 2 # @param {type:"number"}
# @markdown The minimum confidence difference between the top matched intent and the second-best alternative to be considered a "good" match:
REINFORCEMENT_MINIMUM_CONFIDENCE_DIFFERENCE = 0.2 # @param {type:"number"}

# @markdown # Gemini API Configuration
# @markdown The name of the Gemini model to use for analysis:
GEMINI_MODEL = "gemini-2.5-pro" # @param {type:"string"}
# @markdown The Google Cloud project ID where the Gemini API is enabled:
GEMINI_PROJECT_ID = "emea-ccai-demo" # @param {type:"string"}
# @markdown The Google Cloud location where the Gemini API is located:
GEMINI_LOCATION = "europe-west4" # @param {type:"string"}
# @markdown The maximum number of tokens to use in a single call to the Gemini API:
MAX_TOKEN_COUNT = 1048575 # @param {type:"integer"}

# @markdown # General Configuration
# @markdown The maximum number of parallel threads to use for tasks like evaluation and Gemini analysis:
MAX_WORKERS = 8 # @param {type:"integer"}

# Derive the project ID and the location from the agent resource name.
# The name must have the shape
# "projects/<project>/locations/<location>/agents/<agent>"; validating it here
# turns a typo into a clear error instead of an IndexError or, worse, a
# silently wrong project/location used for authentication and the endpoint.
_agent_name_parts = AGENT_NAME.split('/')
if (
    len(_agent_name_parts) != 6
    or _agent_name_parts[0::2] != ['projects', 'locations', 'agents']
    or not all(_agent_name_parts[1::2])
):
    raise ValueError(
        "AGENT_NAME must look like "
        "'projects/<project>/locations/<location>/agents/<agent>', "
        f"got: {AGENT_NAME!r}"
    )
# Extract the project ID from the agent name
PROJECT_ID = _agent_name_parts[1]
# Extract the location from the agent name
LOCATION = _agent_name_parts[3]

In [None]:
# @title Authenticate

# Run the Colab authentication flow for the project that owns the agent
# (PROJECT_ID is derived from AGENT_NAME in the configuration cell).
# Presumably this supplies the credentials used by the clients created in the
# following cells - confirm against the google.colab documentation.
auth.authenticate_user(project_id=PROJECT_ID)

In [None]:
# @title Initialize

# Retry policy for Gemini HTTP calls: up to 3 attempts with exponential
# back-off on the listed HTTP status codes.
gemini_retry_options = types.HttpRetryOptions(
    attempts=3,
    initial_delay=3,
    exp_base=2,
    max_delay=60,
    http_status_codes=[429, 500, 502, 503, 504],
)

# Gemini client (Vertex AI), bound to the configured project and location
genai_client = genai.Client(
    vertexai=True,
    project=GEMINI_PROJECT_ID,
    location=GEMINI_LOCATION,
    http_options=types.HttpOptions(retry_options=gemini_retry_options),
)

# Dialogflow CX uses a location-prefixed endpoint for every location except
# "global", which uses the plain hostname.
api_endpoint = (
    "dialogflow.googleapis.com"
    if LOCATION == "global"
    else f"{LOCATION}-dialogflow.googleapis.com"
)

# All Dialogflow CX clients share the same endpoint options
client_options = ClientOptions(api_endpoint=api_endpoint)
agents_client = dialogflow.AgentsClient(client_options=client_options)
flows_client = dialogflow.FlowsClient(client_options=client_options)
pages_client = dialogflow.PagesClient(client_options=client_options)
intents_client = dialogflow.IntentsClient(client_options=client_options)
entity_types_client = dialogflow.EntityTypesClient(client_options=client_options)
sessions_client = dialogflow.SessionsClient(client_options=client_options)

In [None]:
# @title Get Dialogflow CX resources

# Read the agent
agent = agents_client.get_agent(name=AGENT_NAME)
print(f"Agent read successfully: {agent.display_name}")

# Fall back to the agent's default language when none was configured
if not LANGUAGE_CODE:
    LANGUAGE_CODE = agent.default_language_code

# Human-readable (English) language name, used later in the Gemini prompt
language = Language.get(LANGUAGE_CODE).language_name("en")
print(f"Using Language: {language} ({LANGUAGE_CODE})")

# Get Flow.
# NOTE: the flow is deliberately fetched without a language code because a
# later cell writes the whole object back with update_flow.
flow = flows_client.get_flow(name=f"{AGENT_NAME}/flows/{FLOW_ID}")
print(f"Flow read successfully: {flow.display_name}")

# Get Page
page = pages_client.get_page(name=f"{AGENT_NAME}/flows/{FLOW_ID}/pages/{PAGE_ID}")
print(f"Page read successfully: {page.display_name}")

# Get Intents in the language under test. Without an explicit language code
# the API returns the training phrases of the agent's default language, which
# is the wrong data to analyse whenever LANGUAGE_CODE differs from it.
intents = list(intents_client.list_intents(
    request={"parent": AGENT_NAME, "language_code": LANGUAGE_CODE}
))
print(f"Intents read successfully: {len(intents)}")

# Get Entity Types in the same language, so that entity values and synonyms
# match the training phrases shown to Gemini.
entity_types = list(entity_types_client.list_entity_types(
    request={"parent": AGENT_NAME, "language_code": LANGUAGE_CODE}
))
print(f"Entity types read successfully: {len(entity_types)}")

In [None]:
# @title Check and update NLU threshold and training mode for the flow

import math

MANUAL_TRAINING_MODE = dialogflow.types.NluSettings.ModelTrainingMode.MODEL_TRAINING_MODE_MANUAL

# The classification threshold is stored by the API as a 32-bit float, so the
# value read back (e.g. 9.99999975e-05) is almost never exactly equal to the
# Python float configured above (0.0001). Compare with a tolerance; an exact
# "!=" would trigger a redundant update on every run.
threshold_needs_update = (
    NLU_THRESHOLD_ALTERNATIVE_MATCHING_INTENTS != 0
    and not math.isclose(
        flow.nlu_settings.classification_threshold,
        NLU_THRESHOLD_ALTERNATIVE_MATCHING_INTENTS,
        rel_tol=1e-6,
    )
)
training_mode_needs_update = flow.nlu_settings.model_training_mode != MANUAL_TRAINING_MODE

if threshold_needs_update:
    flow.nlu_settings.classification_threshold = NLU_THRESHOLD_ALTERNATIVE_MATCHING_INTENTS
if training_mode_needs_update:
    flow.nlu_settings.model_training_mode = MANUAL_TRAINING_MODE

# Write both changes back with a single API call
if threshold_needs_update or training_mode_needs_update:
    flows_client.update_flow(request={"flow": flow})

if threshold_needs_update:
    print(f"NLU threshold updated for flow {flow.display_name} to {NLU_THRESHOLD_ALTERNATIVE_MATCHING_INTENTS}")
else:
    print(f"NLU threshold for flow {flow.display_name} is already {flow.nlu_settings.classification_threshold} or NLU_THRESHOLD_ALTERNATIVE_MATCHING_INTENTS is 0. No update needed.")

if training_mode_needs_update:
    print(f"Training mode updated for flow {flow.display_name} to MANUAL")
else:
    print(f"Training mode for flow {flow.display_name} is already MANUAL. No update needed.")

In [None]:
# @title Upload XLSX or CSV file(s) with test sentences and matching intents

# Mapping of uploaded file name -> raw file bytes
uploaded_files = files.upload()

# Maps intent UUID -> list of test sentences expected to match that intent
test_sentences = {}

# Create a dictionary to map intent display names to UUIDs
intent_display_name_to_uuid = {intent.display_name: intent.name.split('/')[-1] for intent in intents}

# Compiled once; matches a lowercase UUID such as the last segment of an intent name
uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$')

for file_name, file_content in uploaded_files.items():
    print(f"Processing file: {file_name}")
    try:
        lower_file_name = file_name.lower()
        if lower_file_name.endswith('.csv'):
            df = pd.read_csv(io.BytesIO(file_content), on_bad_lines='warn')
        elif lower_file_name.endswith('.xlsx'):
            df = pd.read_excel(io.BytesIO(file_content))
        else:
            print(f"Skipping unsupported file type: {file_name}")
            continue

        if df.shape[1] < 2:
            print(f"Skipping {file_name}: expected at least two columns (intent, test sentence).")
            continue

        # The first row is the header. Only the first two columns are used:
        # column 1 is the intent, column 2 is the test sentence.
        df = df.iloc[:, :2]
        df.columns = ['Intent', 'Test']

        for index, row in df.iterrows():
            # Empty cells are read as NaN; str(NaN) would otherwise yield the
            # literal sentence "nan", so they are handled before converting.
            if pd.isna(row['Test']):
                continue
            sentence = str(row['Test']).strip()
            if not sentence:  # Skip if sentence is empty
                continue

            intent_from_file = '' if pd.isna(row['Intent']) else str(row['Intent']).strip()

            # Check if intent is a UUID
            if uuid_pattern.match(intent_from_file):
                intent_uuid = intent_from_file
            # Check if intent is a display name and exists in the agent's intents
            elif intent_from_file in intent_display_name_to_uuid:
                intent_uuid = intent_display_name_to_uuid[intent_from_file]
            else:
                print(f"Ignoring row {index} in {file_name}: Invalid intent '{intent_from_file}'. Not a valid UUID or a known display name.")
                continue

            test_sentences.setdefault(intent_uuid, []).append(sentence)

    except Exception as e:
        # Best effort per file: report the problem and carry on with the next upload
        print(f"Error processing file {file_name}: {e}")

print(f"\nProcessed {len(test_sentences)} intents with a total of {sum(len(sentences) for sentences in test_sentences.values())} test sentences.")

In [None]:
# @title Function to match intent for a given text

@retry.Retry(
    initial=60,
    maximum=300,
    timeout=600,
)
def match_intent(agent_name, text, session_id, flow_id, page_id, timeout=10, language_code=None):
    """Return the Dialogflow CX MatchIntent response for a text input.

    The request is evaluated as if the conversation were currently on the
    given page of the given flow.

    Args:
        agent_name: Full agent resource name
            ("projects/.../locations/.../agents/...").
        text: The utterance to match.
        session_id: ID of the session to run the request in.
        flow_id: ID of the flow that contains the starting page.
        page_id: ID of the page the match is evaluated on.
        timeout: Timeout passed to the underlying API call.
        language_code: Language of ``text``. Defaults to the notebook-wide
            ``LANGUAGE_CODE`` so the query language matches the language the
            intents and test sentences are written in.

    Returns:
        The response object returned by ``sessions_client.match_intent``.
    """
    session_path = f"{agent_name}/sessions/{session_id}"
    current_page_path = f"{agent_name}/flows/{flow_id}/pages/{page_id}"

    query_input = dialogflow.QueryInput(
        text=dialogflow.TextInput(text=text),
        # Previously hard-coded to "en-US", which ignored the configured language
        language_code=language_code or LANGUAGE_CODE,
    )
    # The starting page is selected through QueryParameters.current_page
    query_parameters = dialogflow.QueryParameters(current_page=current_page_path)

    # No try/except here: failures are left to the retry decorator and the caller
    return sessions_client.match_intent(
        request={
            "session": session_path,
            "query_input": query_input,
            "query_params": query_parameters,
        },
        timeout=timeout,
    )

# Task
Implement reinforcement learning using Gemini to analyze an intent, its description, its training phrases, and the entities mentioned in the intent, and have Gemini suggest improvements that increase the intent matching rate on the test data. Gemini should understand that intent matching is done by a BERT NLU model trained specifically on the intent training phrases and entities. Ensure that only a split of the test data is used and that the split is always the same (e.g. for a split of 0.3, use the first 3 sentences for verification only and the remaining 7 for reinforcement learning and verification). The recommendations should be applied and retested. Gemini should then analyze the results and improvements and make further suggestions, for up to REINFORCEMENT_ITERATIONS iterations. Finally, a report should be generated describing which improvements were achieved, with guidance on what else could or should be changed for further improvement. To call Gemini, use only the model name without a project or path (e.g. "gemini-2.5-pro"); the project and location were already specified during client initialization.

## Split test data

### Subtask:
Split the `test_sentences` data into training and testing sets based on the `TEST_TRAIN_SPLIT_RATE`. The training set will be used for reinforcement learning with Gemini, and the testing set will be used for final evaluation.


**Reasoning**:
Import the necessary function and split the data into training and testing sets, then convert them back to the required dictionary format.



In [None]:
from sklearn.model_selection import train_test_split
import math

# Per-intent sentence lists: the training part feeds the reinforcement loop,
# the evaluation part is held out for the final evaluation.
train_sentences = {}
test_sentences_eval = {}

# Total counts for final printout
total_train_count = 0
total_test_count = 0

for intent_uuid, sentences in test_sentences.items():
    sentence_count = len(sentences)

    if sentence_count < 2:
        # A single sentence cannot be split; use it for training only
        train_sentences.setdefault(intent_uuid, []).extend(sentences)
        total_train_count += sentence_count
        continue

    # Size of the evaluation part. It is clamped to [1, sentence_count - 1]:
    # rounding up can otherwise yield sentence_count (e.g. 2 sentences at a
    # rate of 0.6), which leaves no training sentence and makes
    # train_test_split raise a ValueError.
    test_size = math.ceil(sentence_count * TEST_TRAIN_SPLIT_RATE)
    test_size = min(max(test_size, 1), sentence_count - 1)

    # The fixed random_state keeps the split identical across runs
    train_list, test_list = train_test_split(sentences, test_size=test_size, random_state=42)

    # Add the results to the dictionaries
    if train_list:
        train_sentences.setdefault(intent_uuid, []).extend(train_list)
        total_train_count += len(train_list)

    if test_list:
        test_sentences_eval.setdefault(intent_uuid, []).extend(test_list)
        total_test_count += len(test_list)

print(f"Total sentences: {total_train_count + total_test_count}")
print(f"Training sentences: {total_train_count}")
print(f"Testing sentences for evaluation: {total_test_count}")

## Reinforcement Learning Loop

### Subtask: Select Training Data

Choose a subset of the training data from `train_sentences` for the current reinforcement learning iteration.

**Reasoning**:
Select a subset of the training data for the first reinforcement learning iteration. For simplicity in this first iteration, we will use all the training data.

In [None]:
# The first iteration works on the complete training split
current_train_sentences = train_sentences

# Count the sentences across all intents for the status message
selected_sentence_count = sum(map(len, current_train_sentences.values()))
print(f"Selected {selected_sentence_count} training sentences for the current iteration.")

### Subtask: Gemini Analysis

Use Gemini to analyze the selected training data, the corresponding intent definition (including training phrases and entities), and the NLU model's behavior. Gemini should identify areas for improvement in the intent definition to increase matching rates.

**Reasoning**:
Iterate through the training sentences, retrieve the corresponding intent details (display name, training phrases, and entities), and use Gemini to analyze this information along with the evaluation results to generate suggestions for improvement.

In [None]:
# Pydantic schema for the structured JSON answer requested from Gemini; it is
# passed as `response_schema` when the analysis prompt is sent.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - presumably a docstring would end up in the generated schema as its
# description; confirm before adding one.
class IntentSuggestions(BaseModel):
    # Free-text explanation of the findings and the proposed changes
    analysis: str = Field(description="Detailed analysis of the intent and suggestions for improvement.")
    # Indices refer to the "<index>: <phrase>" numbering of the training
    # phrases listed in the prompt
    phrases_to_remove: list[int] = Field(description="List of training phrase indices to remove.")
    # New phrases, with parameters annotated inline as [text](parameter_id)
    phrases_to_add: list[str] = Field(description="List of new training phrases to add, including parameter annotations in the form [parameter text](parameter_id) with 'parameter text' included in the list of entities or their synonym of the entity type corresponding to the parameter and parameter_id matching one of the existing parameters of the intent.")


# Function to get intent details
def get_intent_details(intent_uuid, intents, entity_types):
    """Collect the prompt-relevant details of one intent.

    Args:
        intent_uuid: Last path segment of the intent's resource name.
        intents: Iterable of intent objects to search.
        entity_types: Iterable of entity type objects used to resolve the
            entity types referenced by the intent's parameters.

    Returns:
        A 5-tuple ``(display_name, description, training_phrases, parameters,
        entity_type_details)``, or five ``None`` values when no intent with
        the given UUID exists. Training phrases are rendered as strings with
        annotated parts written as ``[text](parameter_id)``.
    """

    def _last_segment(resource_name):
        # Resource names are "/"-separated paths; the ID is the final segment
        return resource_name.split('/')[-1]

    found_intent = None
    for candidate in intents:
        if _last_segment(candidate.name) == intent_uuid:
            found_intent = candidate
            break
    if not found_intent:
        return None, None, None, None, None

    # Render every training phrase, marking annotated parts inline
    rendered_phrases = [
        "".join(
            f"[{part.text}]({part.parameter_id})" if part.parameter_id else part.text
            for part in training_phrase.parts
        )
        for training_phrase in found_intent.training_phrases
    ]

    parameter_summaries = [
        {"id": param.id, "entity_type": _last_segment(param.entity_type)}
        for param in found_intent.parameters
    ]

    # First entity type per ID, mirroring a first-match search
    entity_type_by_id = {}
    for entity_type in entity_types:
        entity_type_by_id.setdefault(_last_segment(entity_type.name), entity_type)

    # Details of the entity types referenced by the intent's parameters;
    # parameters whose entity type is not in `entity_types` are left out
    entity_type_details = []
    for param in found_intent.parameters:
        resolved_type = entity_type_by_id.get(_last_segment(param.entity_type))
        if not resolved_type:
            continue
        entity_type_details.append({
            "parameter_id": param.id,
            "display_name": resolved_type.display_name,
            "entities": [
                f"{entity.value}: {', '.join(entity.synonyms)}"
                for entity in resolved_type.entities
            ],
        })

    return (
        found_intent.display_name,
        found_intent.description,
        rendered_phrases,
        parameter_summaries,
        entity_type_details,
    )


def analyze_intent_with_gemini(intent_uuid, sentences, evaluation_results, intents, entity_types):
    """Ask Gemini for training-phrase improvements for a single intent.

    The prompt contains the intent definition, the evaluation results of the
    given sentences, the definitions of the alternative intents that showed up
    in those results, and the entity types used. Alternative intents and
    entity types are appended only while the running token count stays within
    MAX_TOKEN_COUNT.

    Args:
        intent_uuid: UUID of the intent to analyze.
        sentences: Test sentences that are expected to match this intent.
        evaluation_results: Mapping of sentence -> result dict with the keys
            'expected_intent_uuid', 'matched_intent_display_name',
            'matched_intent_confidence' and 'alternative_matches'.
        intents: All intents of the agent.
        entity_types: All entity types of the agent.

    Returns:
        A tuple ``(intent_uuid, suggestions)``. ``suggestions`` is the raw
        response text from Gemini, ``None`` when the intent is unknown, or an
        "Error generating suggestions: ..." string when the generation call
        failed.
    """
    display_name, description, training_phrases, parameters, entities_details = get_intent_details(intent_uuid, intents, entity_types)

    if not display_name:
        print(f"Could not find intent with UUID: {intent_uuid}. Skipping.")
        return intent_uuid, None

    # Get relevant evaluation results for this intent's training sentences
    sentence_set = set(sentences)
    relevant_eval_results = {
        sentence: result
        for sentence, result in evaluation_results.items()
        if result['expected_intent_uuid'] == intent_uuid and sentence in sentence_set
    }

    # Number the phrases so that Gemini can refer to them by index
    training_phrases_string = "".join(f"{i}: {phrase}\n" for i, phrase in enumerate(training_phrases))

    # The target intent's entity details come first; alternative intents' follow
    all_entities_details = entities_details[:]
    # Prepare the prompt for Gemini
    prompt = f"""You are an expert NLU analyst tasked with optimizing an intent within a BERT-based Natural Language Understanding (NLU) system.

Model Context: Because this is a BERT-based model, it relies on deep contextual understanding, sentence structure, and semantic relationships (embeddings), rather than simple keyword matching. Your goal is to refine the training data to sharpen the semantic boundaries of the target intent, improving its precision and recall (F1 score), and reducing confusion with other intents.

Some training phrases may contain intentional spelling or grammar errors to capture ASR (Automatic Speech Recognition) transcription errors.

Analyze the following intent and related data and provide concrete recommendations.

## Analysis Objectives

1.  **Identify Weaknesses & Coverage Gaps:** Determine why the target intent is not matching correctly. Is the training data too narrow (poor recall), too broad (poor precision), lacking variety, or failing to cover the scope defined in the description?
2.  **Analyze Overlaps (Confusion Analysis):** For each Alternative Matched Intent, explain the *root cause* of the confusion. Focus on semantic similarities, shared vocabulary, or ambiguous training phrases in the Target Intent that cause the BERT model to struggle with differentiation.
3.  **Entity Analysis:** Analyze if the existing entity types are being used effectively. You may suggest changes to the entity type definitions in the analysis, but recommendations (ADD/REMOVE) must use the entity types as they currently exist.

## Recommendation Guidelines

Your recommendations (ADD/REMOVE) must adhere strictly to these rules:

1.  **Scope Management:** Do **not** expand the scope of the Target Intent beyond its description. Recommendations should only sharpen the existing scope and reduce ambiguity.
2.  **Entity Constraints:** You must **not** add new entities or entity types in the training phrases. All new training phrases must only use the provided Mentioned Entity Types.
3.  **Language Requirements:** **Crucial:** All analysis, explanations, and suggested training phrases must be in **{language}**.
4.  **Natural Language and Stop Words (Crucial for BERT):**
    *   Prioritize natural, idiomatic, conversational **{language}**.
    *   **Include common {language} stop words** (articles, possessive pronouns, prepositions) where they are essential for a natural-sounding, grammatically correct utterance. BERT requires these for contextual understanding.
    *   Avoid *unnecessary* filler words (e.g., "umm," "please"), but do not strip necessary stop words.
5.  **Handling Existing Errors:** Training phrases in the existing dataset that contain only one word, incomplete words, transcription errors (ASR), or spelling/grammar mistakes must **not** be marked for removal (they provide robustness), unless they are the direct cause of severe, unresolvable cross-intent confusion. You may ADD the corrected or more complete versions of these phrases.
6.  **Quantity Guideline:** If the intent already has a large number of phrases (e.g., >100), be highly selective with ADD recommendations, focusing only on phrases that resolve specific overlaps or critical gaps.

## Recommendation Types

*   **ADD:**
    *   Suggest new phrases that specifically help disambiguate the Target Intent from the Alternative Matched Intents.
    *   Increase variety in sentence structure (questions, commands, statements) and coverage of real-world user expressions *within the scope*.
    *   Explain the rationale for the addition.
*   **REMOVE:**
    *   Identify phrases (by index) that are highly ambiguous, out of scope, or directly cause unresolvable confusion with Alternative Matched Intents.
    *   Explain the rationale for removal.
    *   *Note: To update a phrase, you must REMOVE the old index and ADD the new version.*

# Intent to Analyze

Intent Display Name: {display_name}
Description: {description}
Parameters: {parameters}
Training Phrases (with parameter annotations):
{training_phrases_string}

# Test Sentences and Matching Results (from evaluation):
"""
    if relevant_eval_results:
        for sentence, result in relevant_eval_results.items():
            prompt += f"""
Sentence: {sentence}
Expected Intent: {display_name}
Matched Intent: {result['matched_intent_display_name']}
Matched Intent Confidence: {result['matched_intent_confidence']}
"""
            if result['alternative_matches']:
                for i, alt_match in enumerate(result['alternative_matches']):
                    prompt += f"Alternative Match #{i+1}: {alt_match['display_name']} (Confidence: {alt_match['confidence']})\n"

    else:
        prompt += "No relevant evaluation results found for this intent in the training data.\n"

    # Token budget consumed by the mandatory part of the prompt
    current_tokens = genai_client.models.count_tokens(model=GEMINI_MODEL, contents=prompt).total_tokens

    # Collect and deduplicate alternative intents
    alternative_intents_info = {}
    if relevant_eval_results:
        for result in relevant_eval_results.values():
            for alt_match in result.get('alternative_matches', []):
                alt_intent_uuid = alt_match.get('intent_id')
                if alt_intent_uuid and alt_intent_uuid != intent_uuid and alt_intent_uuid not in alternative_intents_info:
                    alt_display_name, alt_description, alt_training_phrases, alt_parameters, alt_entities_details = get_intent_details(alt_intent_uuid, intents, entity_types)
                    if alt_display_name:
                        all_entities_details.extend(alt_entities_details)
                        alt_training_phrases_string = "".join(f"- {phrase}\n" for phrase in alt_training_phrases)
                        alternative_intents_info[alt_intent_uuid] = {
                            "display_name": alt_display_name,
                            "description": alt_description,
                            "parameters": alt_parameters,
                            "training_phrases": alt_training_phrases_string
                        }

    if alternative_intents_info:
        prompt += "\n# Overlapping Intents\n"
        for alt_info in alternative_intents_info.values():
            alternative_intent_prompt = f"\nAlternative Matched Intent Display Name: ({alt_info['display_name']})\n"
            alternative_intent_prompt += f"Alternative Matched Description: {alt_info['description']}\n"
            alternative_intent_prompt += f"Alternative Matched Parameters: {alt_info['parameters']}\n"
            alternative_intent_prompt += f"Alternative Matched Training Phrases:\n{alt_info['training_phrases']}"
            alternative_intent_token_count = genai_client.models.count_tokens(model=GEMINI_MODEL, contents=alternative_intent_prompt).total_tokens
            # Stop adding alternatives once the token budget would be exceeded
            if current_tokens + alternative_intent_token_count > MAX_TOKEN_COUNT:
                break
            current_tokens += alternative_intent_token_count
            prompt += alternative_intent_prompt

    # Deduplicate entity types by display name, keeping the FIRST occurrence.
    # The target intent's details are listed first, so its parameter IDs win
    # over those of alternative intents that use the same entity type; the
    # suggested annotations must use the target intent's parameter IDs.
    unique_entities = {}
    for entity_detail in all_entities_details:
        unique_entities.setdefault(entity_detail['display_name'], entity_detail)

    if unique_entities:
        prompt += "\n# Entity Types and Entities used in training phrases:\n"
        for entity_detail in unique_entities.values():
            entities_string = f"\nParameter ID: {entity_detail['parameter_id']}\nEntity Type: {entity_detail['display_name']}\nEntities:\n"
            entities_string += "".join(f"- {entity}\n" for entity in entity_detail['entities'])
            entities_token_count = genai_client.models.count_tokens(model=GEMINI_MODEL, contents=entities_string).total_tokens
            # Stop adding entity types once the token budget would be exceeded
            if current_tokens + entities_token_count > MAX_TOKEN_COUNT:
                break
            prompt += entities_string
            current_tokens += entities_token_count

    try:
        # Call Gemini API, requesting JSON that follows the IntentSuggestions schema
        response = genai_client.models.generate_content(
            model=GEMINI_MODEL,
            contents=prompt,
            config=types.GenerateContentConfig(
                response_mime_type='application/json',
                response_schema=IntentSuggestions,
            )
        )
        return intent_uuid, response.text
    except Exception as e:
        print(f"Error processing intent {display_name} ({intent_uuid}): {e}")
        return intent_uuid, f"Error generating suggestions: {e}"


def analyze_intents_with_gemini(current_train_sentences, evaluation_results, intents, entity_types):
    """Run the Gemini analysis for every intent in parallel.

    Args:
        current_train_sentences: Mapping of intent UUID -> training sentences.
        evaluation_results: Mapping of sentence -> evaluation result dict.
        intents: All intents of the agent.
        entity_types: All entity types of the agent.

    Returns:
        Mapping of intent UUID -> suggestions returned by
        ``analyze_intent_with_gemini``. Intents without suggestions, or whose
        analysis raised an exception, are left out.
    """
    gemini_suggestions = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_intent = {
            executor.submit(
                analyze_intent_with_gemini,
                intent_uuid,
                sentences,
                evaluation_results,
                intents,
                entity_types
            ): intent_uuid
            for intent_uuid, sentences in current_train_sentences.items()
        }

        for future in tqdm(as_completed(future_to_intent), total=len(future_to_intent), desc="Analyzing Intents with Gemini"):
            submitted_intent_uuid = future_to_intent[future]
            try:
                intent_uuid, suggestions = future.result()
            except Exception as e:
                # A failure for one intent (e.g. while counting tokens) must
                # not discard the results already collected for the others.
                print(f"Error analyzing intent {submitted_intent_uuid}: {e}")
                continue
            if suggestions:
                gemini_suggestions[intent_uuid] = suggestions

    return gemini_suggestions

## Reinforcement learning

In [None]:
# @title Main Reinforcement Learning Loop

def evaluate_test_data(test_sentences_to_evaluate, agent_name, session_id, flow_id, page_id):
    """Match every test sentence against the agent and score the outcome.

    ``test_sentences_to_evaluate`` maps an expected intent UUID to its test
    sentences. Each sentence is sent through ``match_intent`` on a thread pool.

    Returns:
        A ``(results, accuracy)`` tuple. ``results`` maps each sentence to a
        dict describing the expected intent, the top match (UUID, display name,
        confidence) and the remaining matches; ``accuracy`` is the percentage
        of sentences whose top match equals the expected intent.
    """
    outcomes = {}
    hits = 0
    attempts = 0

    def build_record(expected, matched_uuid, matched_name, confidence, alternatives):
        # Single place that defines the shape of one evaluation result.
        return {
            "expected_intent_uuid": expected,
            "matched_intent_uuid": matched_uuid,
            "matched_intent_display_name": matched_name,
            "matched_intent_confidence": confidence,
            "alternative_matches": alternatives,
        }

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Remember which sentence / expected intent each job belongs to.
        pending = {}
        for expected_uuid, phrases in test_sentences_to_evaluate.items():
            for phrase in phrases:
                job = pool.submit(match_intent, agent_name, phrase, session_id, flow_id, page_id)
                pending[job] = (phrase, expected_uuid)

        progress = tqdm(as_completed(pending), total=len(pending), desc="Evaluating Sentences")
        for job in progress:
            phrase, expected_uuid = pending[job]
            attempts += 1
            try:
                reply = job.result()
                if not (reply and reply.matches):
                    outcomes[phrase] = build_record(expected_uuid, "No match", "No match", 0.0, [])
                    continue

                best = reply.matches[0]
                best_uuid = best.intent.name.split('/')[-1]
                best_name = best.intent.display_name
                best_confidence = best.confidence
                runners_up = []
                for other in reply.matches[1:]:
                    runners_up.append({
                        "intent_id": other.intent.name.split('/')[-1],
                        "display_name": other.intent.display_name,
                        "confidence": other.confidence,
                    })
                outcomes[phrase] = build_record(
                    expected_uuid, best_uuid, best_name, best_confidence, runners_up
                )
                if best_uuid == expected_uuid:
                    hits += 1
            except Exception as exc:
                # A failed request still counts as an (incorrect) prediction.
                failure = f"Error: {exc}"
                outcomes[phrase] = build_record(expected_uuid, failure, failure, 0.0, [])
                print(f"Evaluation sentence '{phrase}' generated an exception: {exc}")

    if attempts > 0:
        accuracy = (hits / attempts) * 100
    else:
        accuracy = 0
    return outcomes, accuracy


# --- Baseline Evaluation ---
print("--- Running Baseline Evaluation ---")
baseline_session_id = f"baseline-eval-{uuid.uuid4()}"
baseline_evaluation_results, baseline_accuracy = evaluate_test_data(test_sentences_eval, agent.name, baseline_session_id, FLOW_ID, PAGE_ID)
print(f"\nBaseline Accuracy: {baseline_accuracy:.2f}%")

evaluation_results = baseline_evaluation_results.copy()
accuracies = [baseline_accuracy]

# All sentences of the evaluation set, collected once so the membership check
# inside the loop is a set lookup instead of a scan over every intent's list.
test_set_sentences = {
    sentence
    for sentences in test_sentences_eval.values()
    for sentence in sentences
}

# Suggested phrases may contain annotated spans written as "[text](parameter_id)".
# The first pattern splits a phrase around such spans (keeping them), the second
# extracts the text and the parameter ID from a single span.
annotation_split_pattern = re.compile(r'(\[[^\]]+\]\([^\)]+\))')
annotation_part_pattern = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')


# Defined once, outside the loop: the retry policy does not change per iteration.
@retry.Retry(
    initial=60,
    maximum=300,
    timeout=600,
)
def update_intent_with_retry(intent_to_update):
    """Send the modified intent to ``intents_client.update_intent`` under the retry policy above."""
    intents_client.update_intent(intent=intent_to_update)


for i in range(REINFORCEMENT_ITERATIONS):
    print(f"\n--- Starting Reinforcement Iteration {i+1}/{REINFORCEMENT_ITERATIONS} ---")

    # 1. Identify problematic intents in the test set: either the top match is
    #    wrong, or it is right but too close to the first alternative match.
    problematic_intent_uuids = set()
    for sentence, result in evaluation_results.items():
        if sentence not in test_set_sentences:
            continue

        if result['matched_intent_uuid'] != result['expected_intent_uuid']:
            problematic_intent_uuids.add(result['expected_intent_uuid'])
        elif result['alternative_matches']:
            confidence_difference = result['matched_intent_confidence'] - result['alternative_matches'][0]['confidence']
            if confidence_difference < REINFORCEMENT_MINIMUM_CONFIDENCE_DIFFERENCE:
                problematic_intent_uuids.add(result['expected_intent_uuid'])

    if not problematic_intent_uuids:
        print("No problematic intents found. Stopping reinforcement learning.")
        break

    print(f"Found {len(problematic_intent_uuids)} problematic intents to analyze.")

    # Create a dictionary of the problematic sentences to pass to Gemini
    problematic_sentences_for_gemini = {
        intent_uuid: test_sentences_eval[intent_uuid]
        for intent_uuid in problematic_intent_uuids
        if intent_uuid in test_sentences_eval
    }

    # 2. Analyze with Gemini
    gemini_suggestions = analyze_intents_with_gemini(
        problematic_sentences_for_gemini,
        evaluation_results,
        intents,
        entity_types
    )

    # 3. Apply Gemini's suggestions
    for intent_uuid, suggestions_json in tqdm(gemini_suggestions.items(), desc="Applying Gemini Suggestions"):
        if suggestions_json.startswith("Error generating suggestions:"):
            tqdm.write(f"Skipping intent {intent_uuid} due to generation error: {suggestions_json}")
            continue
        try:
            suggestions = IntentSuggestions.model_validate_json(suggestions_json)
            intent_to_update = next((intent for intent in intents if intent.name.split('/')[-1] == intent_uuid), None)

            if not intent_to_update:
                tqdm.write(f"Could not find intent with UUID: {intent_uuid} to apply suggestions. Skipping.")
                continue

            tqdm.write(f"\nApplying suggestions for intent: {intent_to_update.display_name}")

            # Remove phrases. Indices are de-duplicated first: deleting the same
            # index twice would remove the phrase that shifted into that slot.
            # Deleting from the highest index down keeps lower indices valid.
            phrases_to_remove_indices = sorted(set(suggestions.phrases_to_remove), reverse=True)
            if phrases_to_remove_indices:
                tqdm.write(f"Removing {len(phrases_to_remove_indices)} phrases...")
                for index in phrases_to_remove_indices:
                    if 0 <= index < len(intent_to_update.training_phrases):
                        del intent_to_update.training_phrases[index]

            # Add phrases
            if suggestions.phrases_to_add:
                tqdm.write(f"Adding {len(suggestions.phrases_to_add)} new phrases...")
                known_parameter_ids = {p.id for p in intent_to_update.parameters}
                # Plain text (annotations stripped) of every phrase currently on the intent.
                existing_phrase_texts = {
                    "".join(p.text for p in tp.parts)
                    for tp in intent_to_update.training_phrases
                }

                for phrase_text in suggestions.phrases_to_add:
                    # Parse once into (text, parameter_id or None) pairs.
                    parsed_parts = []
                    for part in annotation_split_pattern.split(phrase_text):
                        match = annotation_part_pattern.match(part)
                        if match:
                            parsed_parts.append((match.group(1), match.group(2)))
                        elif part:
                            parsed_parts.append((part, None))

                    if not parsed_parts:
                        tqdm.write(f"Skipping empty phrase: {phrase_text!r}")
                        continue

                    # Compare on plain text so an annotated suggestion is still
                    # recognised as a duplicate of an existing phrase.
                    plain_text = "".join(text for text, _ in parsed_parts)
                    if plain_text in existing_phrase_texts:
                        tqdm.write(f"Skipping duplicate phrase: {phrase_text}")
                        continue

                    if any(
                        parameter_id is not None and parameter_id not in known_parameter_ids
                        for _, parameter_id in parsed_parts
                    ):
                        tqdm.write(f"Skipping phrase with undefined parameter: {phrase_text}")
                        continue

                    new_training_phrase = dialogflow.Intent.TrainingPhrase(repeat_count=1)
                    for text, parameter_id in parsed_parts:
                        if parameter_id is None:
                            new_training_phrase.parts.append(dialogflow.Intent.TrainingPhrase.Part(text=text))
                        else:
                            new_training_phrase.parts.append(dialogflow.Intent.TrainingPhrase.Part(text=text, parameter_id=parameter_id))
                    intent_to_update.training_phrases.append(new_training_phrase)
                    existing_phrase_texts.add(plain_text)

            # Ensure all training phrases have repeat_count
            for tp in intent_to_update.training_phrases:
                if 'repeat_count' not in tp or not tp.repeat_count or tp.repeat_count < 1:
                    tp.repeat_count = 1

            # Update the intent
            update_intent_with_retry(intent_to_update)
            tqdm.write(f"Successfully updated intent: {intent_to_update.display_name}")

        except Exception as e:
            tqdm.write(f"Error applying suggestions for intent {intent_uuid}: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    # 4. Train the flow
    print("\nTraining the flow...")
    operation = flows_client.train_flow(name=flow.name)
    print("Waiting for training to complete...")
    operation.result(timeout=7200)
    print("Flow training completed successfully.")


    # 5. Re-evaluate all test sentences
    print("\nRe-evaluating all test sentences after applying suggestions...")
    session_id = f"re-eval-{i+1}-{uuid.uuid4()}"
    evaluation_results, accuracy = evaluate_test_data(test_sentences_eval, agent.name, session_id, FLOW_ID, PAGE_ID)
    accuracies.append(accuracy)


    # 6. Calculate and print improvement summary for the iteration
    print(f"\n--- Iteration {i+1} Summary ---")
    print(f"Baseline Accuracy: {accuracies[0]:.2f}%")
    for iteration_number, acc in enumerate(accuracies[1:], start=1):
        print(f"Accuracy after Iteration {iteration_number}: {acc:.2f}%")


print("\n--- Reinforcement Learning Finished ---")

## Final Report and Summary

This report summarizes the results of the reinforcement learning process. It includes an analysis of the initial and final evaluation results, a summary of the changes made by Gemini, and recommendations for further improvements.

In [None]:
# @title Generate Final Report

def generate_final_report(initial_results, final_results, all_gemini_suggestions):
    """Generates an HTML report comparing the initial and final evaluation results.

    Args:
        initial_results: Dict mapping each test sentence to its baseline result
            dict. Keys read here: ``expected_intent_uuid``,
            ``matched_intent_uuid``, ``matched_intent_display_name`` and
            ``matched_intent_confidence``.
        final_results: Dict of the same shape from the last evaluation. It must
            contain every sentence present in ``initial_results``.
        all_gemini_suggestions: Dict mapping an intent UUID to a list of JSON
            strings, one per recorded iteration, each validated as
            ``IntentSuggestions``.

    Returns:
        The report as one HTML string. All interpolated text (sentences, intent
        names, Gemini output, error messages) is HTML-escaped.

    Raises:
        KeyError: If a sentence of ``initial_results`` is missing from
            ``final_results``.
    """
    from html import escape  # local import keeps this cell self-contained

    def is_correct(result):
        return result['matched_intent_uuid'] == result['expected_intent_uuid']

    # Build the UUID -> display name lookup once from the module-level
    # ``intents``; ``setdefault`` keeps the first intent seen for a UUID.
    display_names = {}
    for intent in intents:
        display_names.setdefault(intent.name.split('/')[-1], intent.display_name)

    def display_name_for(intent_uuid):
        return escape(str(display_names.get(intent_uuid, "Unknown Intent")))

    parts = ["<h1>Reinforcement Learning Final Report</h1>"]

    # --- Overall Summary ---
    parts.append("<h2>Overall Summary</h2>")
    initial_correct = sum(1 for r in initial_results.values() if is_correct(r))
    final_correct = sum(1 for r in final_results.values() if is_correct(r))
    total_sentences = len(initial_results)
    # Guard against an empty evaluation set instead of dividing by zero.
    initial_accuracy = (initial_correct / total_sentences) * 100 if total_sentences else 0.0
    final_accuracy = (final_correct / total_sentences) * 100 if total_sentences else 0.0

    parts.append(f"<p><b>Initial Accuracy:</b> {initial_accuracy:.2f}% ({initial_correct}/{total_sentences})</p>")
    parts.append(f"<p><b>Final Accuracy:</b> {final_accuracy:.2f}% ({final_correct}/{total_sentences})</p>")
    parts.append(f"<p><b>Accuracy Improvement:</b> {final_accuracy - initial_accuracy:.2f}%</p>")

    # --- Detailed Comparison ---
    parts.append("<h2>Detailed Comparison of Problematic Sentences</h2>")
    parts.append("<table border='1'><tr><th>Sentence</th><th>Expected Intent</th><th>Initial Match (Confidence)</th><th>Final Match (Confidence)</th><th>Status</th></tr>")

    for sentence, initial_r in initial_results.items():
        final_r = final_results[sentence]
        initial_match_str = f"{initial_r['matched_intent_display_name']} ({initial_r['matched_intent_confidence']:.2f})"
        final_match_str = f"{final_r['matched_intent_display_name']} ({final_r['matched_intent_confidence']:.2f})"

        initially_correct = is_correct(initial_r)
        finally_correct = is_correct(final_r)
        if initially_correct and finally_correct:
            status = "✅ Correct"
        elif finally_correct:
            status = "✔️ Fixed"
        elif initially_correct:
            # Was right at baseline but is wrong now; must not be shown as correct.
            status = "⚠️ Regressed"
        else:
            status = "❌ Still Incorrect"

        # Only list sentences whose match changed or that are not simply correct.
        if initial_r['matched_intent_uuid'] != final_r['matched_intent_uuid'] or status != "✅ Correct":
            parts.append(
                f"<tr><td>{escape(str(sentence))}</td>"
                f"<td>{display_name_for(initial_r['expected_intent_uuid'])}</td>"
                f"<td>{escape(initial_match_str)}</td>"
                f"<td>{escape(final_match_str)}</td>"
                f"<td>{status}</td></tr>"
            )

    parts.append("</table>")

    # --- Gemini's Changes ---
    parts.append("<h2>Summary of Gemini's Changes</h2>")
    for intent_uuid, suggestions_list in all_gemini_suggestions.items():
        parts.append(f"<h3>Intent: {display_name_for(intent_uuid)} ({escape(str(intent_uuid))})</h3>")
        for i, suggestions_json in enumerate(suggestions_list):
            parts.append(f"<h4>Iteration {i+1}</h4>")
            try:
                suggestions = IntentSuggestions.model_validate_json(suggestions_json)
                parts.append("<b>Analysis:</b>")
                parts.append(f"<p>{escape(str(suggestions.analysis))}</p>")
                if suggestions.phrases_to_add:
                    parts.append("<b>Added Phrases:</b><ul>")
                    parts.extend(f"<li>{escape(str(phrase))}</li>" for phrase in suggestions.phrases_to_add)
                    parts.append("</ul>")
                if suggestions.phrases_to_remove:
                    parts.append("<b>Removed Phrase Indices:</b><ul>")
                    parts.extend(f"<li>{index}</li>" for index in suggestions.phrases_to_remove)
                    parts.append("</ul>")
            except Exception as e:
                # Stored entries may be error strings rather than valid JSON.
                parts.append(f"<p>Error parsing suggestions: {escape(str(e))}</p>")

    return "".join(parts)

from IPython.display import HTML, display

# Store the initial results before the loop
initial_evaluation_results = baseline_evaluation_results.copy()
all_gemini_suggestions = {}

# Only the suggestions of the most recent reinforcement iteration are still in
# scope at this point, so they are recorded exactly once. Looping over
# REINFORCEMENT_ITERATIONS here would append that same dict repeatedly and the
# report would show identical suggestions for every "iteration".
# NOTE: to report every iteration, this accumulation has to happen inside the
# main reinforcement loop, right after Gemini's suggestions are produced.
try:
    latest_gemini_suggestions = gemini_suggestions
except NameError:
    # The loop ended before Gemini was ever called (e.g. nothing was problematic).
    latest_gemini_suggestions = {}

# Store Gemini's suggestions for the report
for intent_uuid, suggestions in latest_gemini_suggestions.items():
    all_gemini_suggestions.setdefault(intent_uuid, []).append(suggestions)


final_report_html = generate_final_report(initial_evaluation_results, evaluation_results, all_gemini_suggestions)

display(HTML(final_report_html))