---
### Hi! My name is Lucas Pereira, and I'm the creator of this notebook. It's great to have you here!

You can find the original project on my GitHub and connect with me on LinkedIn.

* **Original notebook:** [https://github.com/dsandux/AB-Test-Toolkit](https://github.com/dsandux/AB-Test-Toolkit)
* **LinkedIn:** [https://www.linkedin.com/in/lucaspereira](https://www.linkedin.com/in/lucaspereira)
---

# A/B Test Analysis with Bayesian Statistics - Which option is the best?
This notebook performs an end-to-end analysis of A/B test results using Bayesian statistical methods. The objective is to go beyond the traditional "p-value" to calculate the probability of each variant being the best and to quantify the expected risk associated with choosing one variant over another.

#### 1. Setup

Here is where we install all the libraries needed to run this test.


In [None]:
# --- Install required packages ---
!pip install -q pandas numpy scipy matplotlib seaborn ipywidgets tabulate
!pip install -q google-generativeai python-pptx
!pip install langchain faiss-cpu sentence-transformers tiktoken
!pip install sentence-transformers
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


# --- Import Libraries for Data Analysis & Visualization ---
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter

# --- Import Libraries for Interactive Widgets & Display ---
import ipywidgets as widgets
from IPython.display import display, clear_output, Markdown, FileLink
import os
import io
from datetime import datetime

# --- Import New Libraries for GenAI and Presentations ---
import google.generativeai as genai
from pptx import Presentation
from pptx.util import Inches

print("✅ Libraries loaded successfully. You can now proceed to the Control Panel.")

#### 2. Upload files and set parameters

Run the cell below to display the file upload and settings form. The default settings work well for most cases (change them only if you have a specific need and know what the options mean). Once the form appears, upload your Excel file and click **Confirm Selections**.

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from datetime import datetime
import os
import io

# --- Shared layouts ---
# Declared once and reused so every control gets the same width and spacing.
wide_layout = widgets.Layout(width='98%', margin='5px 0')
button_layout = widgets.Layout(width='250px', margin='15px 0 0 0')

# --- Interactive controls ---

# Upload control, restricted to .xlsx files.
uploader = widgets.FileUpload(
    description='Upload Data File',
    accept='.xlsx',
    tooltip='Click to upload your A/B test data in .xlsx format',
    button_style='info',
    layout=wide_layout,
)

# Prior choice: each label maps to the (alpha, beta) pair that the posterior
# cell unpacks as PRIOR_ALPHA / PRIOR_BETA.
prior_selector = widgets.Dropdown(
    description='Bayesian Prior:',
    options=[
        ('Jeffreys (Recommended for most tests)', (0.5, 0.5)),
        ('Uniform (Neutral / Uninformative)', (1.0, 1.0)),
    ],
    value=(0.5, 0.5),
    tooltip='Choose the starting assumption for the model.',
    style={'description_width': 'initial'},
    layout=wide_layout,
)

# Risk tolerance: the expected-loss threshold read later as RISK_THRESHOLD.
# Ranges from 0.5% to 5% in 0.5% steps, starting at 1%.
risk_slider = widgets.FloatSlider(
    description='Risk Tolerance:',
    min=0.005,
    max=0.05,
    step=0.005,
    value=0.01,
    readout_format='.2%',
    tooltip='Lower = more cautious. Higher = more aggressive.',
    style={'description_width': 'initial'},
    layout=wide_layout,
)

# --- Confirmation button and feedback area ---

# Pressing this button confirms the uploaded file and the chosen settings.
confirm_button = widgets.Button(
    description="Confirm Selections",
    icon='check',
    button_style='success',
    tooltip='Click to confirm your file and settings',
    layout=button_layout,
)

# Success and error messages produced by the button handler are shown here.
output_area = widgets.Output()

# --- Define the logic for the button click ---
def on_confirm_button_clicked(b):
    """Validate the upload and publish it for the data-loading cell.

    Click handler for ``confirm_button``. Takes the first file held by
    ``uploader`` and, on success, sets two globals read by later cells:
    ``uploaded_data_content`` (an ``io.BytesIO`` over the file bytes) and
    ``confirmed_filename`` (the original name with today's date inserted
    before the extension). All feedback is printed inside ``output_area``.

    Args:
        b: The button instance passed in by ipywidgets; not used.
    """
    global uploaded_data_content, confirmed_filename

    with output_area:
        clear_output(wait=True)  # Clear previous messages

        # Any failure is reported in the output area instead of being lost
        # inside the widget callback.
        try:
            uploads = uploader.value

            # Check if a file has been uploaded
            if not uploads:
                print("❌ Error: Please upload a data file before confirming.")
                return

            # FileUpload.value differs between ipywidgets major versions:
            #   * 8.x: a tuple of dicts, each with 'name' and 'content' keys;
            #   * 7.x: a dict keyed by filename, each entry holding 'content'.
            # Indexing with [0] only works for the 8.x shape, so handle both.
            if isinstance(uploads, dict):
                original_name, uploaded_file_dict = next(iter(uploads.items()))
            else:
                uploaded_file_dict = uploads[0]
                original_name = uploaded_file_dict['name']

            # Create the new filename with the current date
            date_str = datetime.now().strftime("%Y-%m-%d")
            base_name, extension = os.path.splitext(original_name)
            new_filename_with_date = f"{base_name}_{date_str}{extension}"

            # bytes() accepts both a memoryview and a bytes object, so the
            # buffer is built the same way whichever version supplied it.
            uploaded_data_content = io.BytesIO(bytes(uploaded_file_dict['content']))
            confirmed_filename = new_filename_with_date

            # Provide success feedback to the user
            print(f"✅ Success! File '{original_name}' is ready for analysis.")
            print(f"   It will be referred to as '{confirmed_filename}' in this session.")
            print("\nYou may now proceed to the next cell.")

        except Exception as e:
            # If any error occurs, print it clearly for debugging.
            print("❌ An unexpected error occurred. Please check the details below:")
            print(f"   Error Type: {type(e).__name__}")
            print(f"   Error Details: {e}")


# --- Assemble and show the Control Panel ---

# The prior and risk controls sit in an accordion that starts collapsed, so the
# default view only shows the uploader and the confirm button.
advanced_settings = widgets.Accordion(
    children=[widgets.VBox(children=[prior_selector, risk_slider])],
    selected_index=None,
)
advanced_settings.layout = wide_layout
advanced_settings.set_title(0, 'Advanced Settings (Prior & Risk)')

# Bordered vertical container holding the heading, instructions and controls.
control_panel_layout = widgets.VBox(
    children=[
        widgets.HTML("<h2 style='font-family: Arial, sans-serif;'>Step 1: Configure Your Test</h2>"),
        widgets.HTML("<b style='font-family: Arial, sans-serif;'>Upload your data file and adjust the settings below, then click Confirm.</b>"),
        uploader,
        advanced_settings,
        confirm_button,
        output_area,
    ],
    layout=widgets.Layout(
        margin='10px 0',
        padding='15px',
        border='1px solid #ccc',
        border_radius='8px',
    ),
)

# Hook the click handler up before the panel becomes visible.
confirm_button.on_click(on_confirm_button_clicked)

display(control_panel_layout)

#### 3. Data Loading and Validation
This cell handles the critical first step of loading and validating the A/B test data. The code will read the specified Excel file, confirm its existence, and verify that it contains the necessary columns for the analysis: variant, reach, and conversion. This ensures the data is correctly structured before proceeding.

In [None]:
# --- Define Expected Data Structure ---
REQUIRED_COLUMNS = ['variant', 'reach', 'conversion']

# --- Validation and Loading Logic ---
# 'uploaded_data_content' is created only when "Confirm Selections" is clicked
# in the Control Panel, so its absence means nothing has been confirmed yet.
if 'uploaded_data_content' not in locals():
    print("❌ Error: No data file has been confirmed yet.")
    print("   Please go back to the Control Panel, upload a file, and click 'Confirm Selections'.")

else:
    # Drop any DataFrame left over from an earlier run. 'df' is bound again
    # only after the new file passes every check below, so the "'df' exists"
    # guards in later cells cannot pass on stale or invalid data.
    globals().pop('df', None)

    try:
        print(f"Attempting to load data from '{confirmed_filename}'...")

        # Rewind the in-memory buffer so the cell can be re-run on the same upload.
        uploaded_data_content.seek(0)
        candidate_df = pd.read_excel(uploaded_data_content)

        missing_cols = [col for col in REQUIRED_COLUMNS if col not in candidate_df.columns]

        if missing_cols:
            # Columns are missing: report which ones.
            print(f"---")
            print(f"❌ Error: Missing Columns.")
            print(f"The file '{confirmed_filename}' was loaded, but is missing required columns.")
            print(f"   - Missing column(s): {missing_cols}")
            print(f"   - Please ensure the file contains all of the following columns: {REQUIRED_COLUMNS}")
        else:
            # The Beta-Binomial update needs numeric counts with
            # 0 <= conversion <= reach; anything else yields invalid
            # (negative or NaN) posterior parameters further on.
            counts = candidate_df[['reach', 'conversion']].apply(pd.to_numeric, errors='coerce')
            invalid_rows = (
                counts.isna().any(axis=1)
                | (counts['reach'] < 0)
                | (counts['conversion'] < 0)
                | (counts['conversion'] > counts['reach'])
            )

            if invalid_rows.any():
                print(f"---")
                print(f"❌ Error: Invalid Values.")
                print(f"   - 'reach' and 'conversion' must be numbers with 0 <= conversion <= reach.")
                print(f"   - Offending variant(s): {candidate_df.loc[invalid_rows, 'variant'].tolist()}")
            else:
                # Validation passed: publish the DataFrame and show a preview.
                df = candidate_df
                print("✅ Success: File loaded and validated.")
                print("\n--- First 5 Rows of the Dataset ---")
                print(df.head().to_string(index=False))

    except Exception as e:
        # Catch any other potential errors during the file reading process.
        print(f"---")
        print(f"❌ An unexpected error occurred while reading the file: {e}")


#### 4. Calculation of Posterior Parameters
Here, we perform the core Bayesian update for our A/B test. 🧪

In [None]:
# First, check if the DataFrame 'df' exists from the previous step.
if 'df' not in locals():
    print("❌ Error: DataFrame 'df' not found.")
    print("   Please run the 'Data Loading and Validation' cell successfully before proceeding.")
else:
    try:
        # Get the selected prior values (alpha, beta) from the Control Panel widget
        PRIOR_ALPHA, PRIOR_BETA = prior_selector.value

        # --- Apply the Beta-Binomial Conjugate Update Rule ---
        # posterior_alpha = prior_alpha + number_of_successes (conversions)
        # posterior_beta = prior_beta + number_of_failures (reach - conversions)
        df['posterior_alpha'] = PRIOR_ALPHA + df['conversion']
        df['posterior_beta'] = PRIOR_BETA + (df['reach'] - df['conversion'])

        # Mean of Beta(alpha, beta) is alpha / (alpha + beta). It is stored here,
        # alongside the parameters it derives from, because the metrics cell
        # merges 'posterior_mean' into its results. Refreshing it on every run
        # keeps it in step with the parameters when the prior is changed.
        df['posterior_mean'] = df['posterior_alpha'] / (df['posterior_alpha'] + df['posterior_beta'])

        # --- Display the Updated DataFrame ---
        # Show the DataFrame with the newly calculated posterior parameters.
        print("✅ Success: Posterior parameters calculated.")
        print("\n--- DataFrame with Updated Posterior Parameters ---")
        display_cols = ['variant', 'reach', 'conversion', 'posterior_alpha', 'posterior_beta']
        print(df[display_cols].to_string(index=False))

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")



#### 5. Generation of the Posterior Plot

This cell generates the most important visualization for our analysis: the posterior probability distributions. The ridgeline plot is used for better readability when comparing multiple variants.

In [None]:
# First, check if the DataFrame 'df' with posterior parameters exists.
if 'df' in locals() and 'posterior_alpha' in df.columns:
    try:
        # --- 1. Setup for Ridgeline Plot ---
        # Variants are ordered by posterior mean, alpha / (alpha + beta).
        # The column is recomputed on every run (not only when absent) so the
        # ordering always reflects the current posterior parameters, e.g. after
        # the prior was changed and the posterior cell re-run.
        df['posterior_mean'] = df['posterior_alpha'] / (df['posterior_alpha'] + df['posterior_beta'])

        sorted_df = df.sort_values('posterior_mean', ascending=False)
        alphas = sorted_df['posterior_alpha'].to_numpy(dtype=float)
        betas = sorted_df['posterior_beta'].to_numpy(dtype=float)

        # Create a figure and axes for the plot; height grows with the variant count.
        fig, ax = plt.subplots(figsize=(12, 2 + len(sorted_df) * 0.7))

        # --- 1a. Create a color palette ---
        # Use a colormap to get a unique color for each variant.
        colors = plt.cm.viridis(np.linspace(0.1, 0.9, len(sorted_df)))

        # --- 2. Dynamically Determine X-axis Range ---
        # Extend the axis to just past the largest 99.9th percentile so every
        # distribution and its credible interval is fully visible.
        # `initial=0` keeps the original fallback of 0 when there are no rows.
        max_x = np.nanmax(stats.beta.ppf(0.999, alphas, betas), initial=0)
        x = np.linspace(0, max_x * 1.05, 1000)

        # --- 3. Evaluate and Scale the Densities ---
        # Raw Beta densities can peak far above the ridge spacing below, which
        # would make every ridge overlap the others. Dividing all curves by the
        # single tallest finite peak keeps their relative heights while making
        # the tallest ridge exactly one unit high. The y-axis ticks are hidden,
        # so the absolute scale carries no information.
        pdfs = [stats.beta.pdf(x, a, b) for a, b in zip(alphas, betas)]
        finite_peaks = [pdf[np.isfinite(pdf)].max(initial=0) for pdf in pdfs]
        tallest_peak = max(finite_peaks, default=0)
        height_scale = 1.0 / tallest_peak if tallest_peak > 0 else 1.0

        # --- 4. Plot Each Variant as a Ridge ---
        y_offset_step = 0.8  # Vertical spacing between ridges, in scaled units

        for i, (row, color, pdf) in enumerate(zip(sorted_df.itertuples(), colors, pdfs)):
            y_offset = i * y_offset_step
            ridge = pdf * height_scale + y_offset

            # Plot the main distribution curve with a label for the legend.
            ax.plot(x, ridge, color=color, lw=1.5, label=row.variant)

            # Add a light fill for the entire distribution
            ax.fill_between(x, y_offset, ridge, alpha=0.2, color=color)

            # Central 95% credible interval: the 2.5th to 97.5th percentile
            # of the posterior.
            ci_low, ci_high = stats.beta.ppf([0.025, 0.975], row.posterior_alpha, row.posterior_beta)

            # Create a mask for the x-values within the credible interval
            ci_mask = (x >= ci_low) & (x <= ci_high)

            # Add a darker shade on top for the 95% credible interval
            ax.fill_between(x[ci_mask], y_offset, ridge[ci_mask], alpha=0.4, color=color)

        # --- 5. Finalize and Display the Plot ---
        from matplotlib.patches import Patch

        # Clean up the plot aesthetics
        ax.set_title('Posterior Distributions of Conversion Rates', fontsize=16)
        ax.set_xlabel('Conversion Rate', fontsize=12)
        ax.set_yticks([])  # Hide y-axis ticks as they are not meaningful here
        ax.spines['left'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

        # Create and display a custom legend
        handles, labels = ax.get_legend_handles_labels()

        # Add "Variant" prefix to each label
        new_labels = [f'Variant {label}' for label in labels]

        # Create a proxy artist for the shaded area to include in the legend
        ci_patch = Patch(color='gray', alpha=0.4, label='95% Credible Interval')
        handles.append(ci_patch)
        new_labels.append('95% Credible Interval')

        # Position the legend above the plot in the top-right corner with a smaller font
        fig.legend(handles, new_labels, title="Legend", bbox_to_anchor=(0.98, 0.98), loc='upper right', fontsize='small')

        plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust layout to make space for the title and legend
        plt.show()

    except Exception as e:
        print(f"❌ An unexpected error occurred while generating the plot: {e}")
else:
    print("❌ Error: DataFrame 'df' with posterior parameters not found.")
    print("   Please run the previous cells successfully before proceeding.")


#### 6. Monte Carlo Simulation

Run this cell to start the Monte Carlo simulation, which draws 100,000 random samples for each variant.

In [None]:
# --- 1. Simulation Setup ---
# Number of random samples to draw from each variant's posterior distribution.
# A larger number of samples leads to more stable estimates of the metrics.
N_SAMPLES = 100000


# --- 2. Run Simulation ---
# Check if the dataframe 'df' with posterior parameters exists to avoid errors.
if 'df' in locals() and 'posterior_alpha' in df.columns:

    # Map each variant name to N_SAMPLES draws from its Beta posterior. Each
    # draw is one plausible "true" conversion rate for that variant under the
    # model. The dictionary is bound only on this success path so that the
    # next cell's "'posterior_samples' exists" check means data is present.
    posterior_samples = {
        variant_name: stats.beta.rvs(a=p_alpha, b=p_beta, size=N_SAMPLES)
        for variant_name, p_alpha, p_beta in zip(
            df['variant'], df['posterior_alpha'], df['posterior_beta']
        )
    }

    # --- 3. Output: Simulation Summary ---
    print("--- Monte Carlo Simulation Summary ---")
    print(f"✅ Simulation completed successfully.")
    print(f"   - Samples generated per variant: {N_SAMPLES:,}")
    print(f"   - Variants simulated: {list(posterior_samples.keys())}")

    print("\nData Preview (first 3 samples for each variant):")
    for variant, samples in posterior_samples.items():
        # Convert to built-in floats so the list prints as plain numbers
        # rather than NumPy scalar reprs.
        preview = [round(float(s), 6) for s in samples[:3]]
        print(f"  - {variant}: {preview}")

    print("\nThe 'posterior_samples' dictionary is now ready for metric calculation in the next cell.")

else:
    # Remove samples from an earlier run so later cells cannot silently use
    # results that no longer match the current data.
    globals().pop('posterior_samples', None)
    print("❌ Error: DataFrame 'df' with posterior parameters not found.")
    print("   Please ensure the 'Data Loading and Validation' and 'Calculation of Posterior Parameters' cells were executed successfully.")

#### 7. Calculation and Presentation of Metrics
This is the final calculation step where we translate our simulation results into actionable business metrics. 🏆

In [None]:
# First, check that simulation data from the previous step exists and is not
# empty. An empty dictionary would otherwise pass the existence check and then
# fail further down with an unrelated-looking column error.
if 'posterior_samples' in locals() and posterior_samples:
    try:
        # --- Get the Risk Threshold from the Control Panel ---
        RISK_THRESHOLD = risk_slider.value

        # --- 1. Combine Simulation Samples into a DataFrame ---
        # One column per variant, one row per simulated draw; the extra column
        # holds the best conversion rate seen in each draw.
        samples_df = pd.DataFrame(posterior_samples)
        samples_df['max_conversion_rate'] = samples_df.max(axis=1)
        variant_names = list(posterior_samples.keys())

        # --- 2. Calculate 'Probability to be Best' and 'Expected Loss' ---
        results = []
        for variant in variant_names:
            # Share of draws in which this variant achieved the row maximum.
            prob_best = (samples_df[variant] == samples_df['max_conversion_rate']).mean()
            # Average shortfall against the best variant of each draw.
            loss = samples_df['max_conversion_rate'] - samples_df[variant]
            expected_loss = loss.mean()
            results.append({
                "Variant": variant,
                "Probability to be Best": prob_best,
                "Expected Loss (Risk)": expected_loss
            })

        # --- 3. Format and Display Results as a Table ---
        results_df = pd.DataFrame(results)

        # Create the 'Decision Guide' column based on the threshold
        conditions = [
            results_df['Expected Loss (Risk)'] > RISK_THRESHOLD,
            results_df['Expected Loss (Risk)'] < RISK_THRESHOLD
        ]
        choices = ['Above Threshold', 'Below Threshold']
        results_df['Decision Guide'] = np.select(conditions, choices, default='Equals Threshold')

        # Add the posterior mean from the main 'df' for the final report.
        # It is derived here from the posterior parameters instead of read from
        # a 'posterior_mean' column, so this cell works whether or not the
        # plotting cell (which creates that column) has been run.
        if 'df' in locals():
            posterior_means = df[['variant']].assign(
                posterior_mean=df['posterior_alpha'] / (df['posterior_alpha'] + df['posterior_beta'])
            )
            results_df = pd.merge(
                results_df,
                posterior_means,
                left_on='Variant',
                right_on='variant',
                how='left'
            ).drop('variant', axis=1)

        # Sort by lowest risk: the next cell treats the first row as the best candidate.
        results_df = results_df.sort_values(by="Expected Loss (Risk)")

        # --- 4. Final Styling ---
        styled_df = results_df.style.format({
            "Probability to be Best": "{:.2%}",
            "Expected Loss (Risk)": "{:.4%}"
        }).set_properties(**{'text-align': 'center'}) \
        .set_caption(f"🏆 Bayesian A/B Test Results (Risk Threshold: {RISK_THRESHOLD:.1%})") \
        .hide(axis="index")

        # Display the final, styled table
        display(styled_df)
        print("\n(A variant with risk 'Below Threshold' is generally considered a safe choice.)")

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
else:
    print("❌ Error: The 'posterior_samples' dictionary was not found.")
    print("   Please ensure the Monte Carlo Simulation cell was executed successfully.")


#### 8. Automated Conclusion Logic
This final, automated cell translates our statistical results into a clear business recommendation with key metrics. 🎯

In [None]:
from IPython.display import display, Markdown
import pandas as pd

# First, check if the results_df DataFrame from the previous step exists.
if 'results_df' not in locals():
    print("❌ Error: The 'results_df' DataFrame was not found.")
    print("   Please run the 'Calculation and Presentation of Metrics' cell successfully before proceeding.")
else:
    try:
        # --- Get Risk Threshold from Control Panel ---
        RISK_THRESHOLD = risk_slider.value

        # --- Ensure all necessary data is in results_df for the report ---
        # Pull in from 'df' only the columns that results_df does not have yet,
        # so re-running this cell does not merge the same columns twice.
        if 'df' in locals():
            required_cols = ['posterior_mean', 'reach', 'conversion']
            missing_cols = [col for col in required_cols if col not in results_df.columns]

            if missing_cols:
                cols_to_merge = ['variant'] + missing_cols
                if all(col in df.columns for col in cols_to_merge):
                    results_df = pd.merge(
                        results_df,
                        df[cols_to_merge],
                        left_on='Variant',
                        right_on='variant',
                        how='left'
                    ).drop(columns='variant', errors='ignore')

        # --- Extract Key Information ---
        # results_df is sorted by expected loss, so the first row is the
        # lowest-risk candidate.
        best_candidate = results_df.iloc[0]
        is_winner = best_candidate['Expected Loss (Risk)'] < RISK_THRESHOLD

        # 'conversion' is present only if the merge above ran. Read it
        # defensively so a missing column degrades the uplift data instead of
        # aborting the whole report before anything is displayed.
        winner_conversions = best_candidate.get('conversion')
        has_winner_conversions = winner_conversions is not None and pd.notna(winner_conversions)

        # ------------------------------------------------------------------
        # Prepare list that will feed df_uplift (always created)
        uplift_rows = [{
            "Compared To": f"Variant '{best_candidate['Variant']}' (Winner)",
            "Expected Uplift": 0.0,
            "Conversion Gain": 0,
            "Total Expected": int(winner_conversions) if has_winner_conversions else None
        }]
        # ------------------------------------------------------------------

        # --- Build the Markdown Report String ---
        markdown_report = ""

        # 1. Verdict
        if is_winner:
            markdown_report += f"## ✅ Verdict: Test Concluded. Deploy Variant '{best_candidate['Variant']}'.\n"
        else:
            markdown_report += f"## ⚠️ Verdict: Test Inconclusive. Collect More Data.\n"
        markdown_report += "---\n"

        # 2. Stakeholder Summary
        markdown_report += "### Summary for Stakeholders\n"
        if is_winner:
            summary_text = (
                f"The analysis confidently recommends deploying **Variant "
                f"'{best_candidate['Variant']}'**. It has the highest chance of being the "
                f"best option, and the risk of choosing it is well below our safety limit "
                f"of {RISK_THRESHOLD:.1%}. The table in the final section shows the expected "
                f"performance increase against all other variants."
            )
        else:
            summary_text = (
                "The results are not yet clear enough to make a confident decision. "
                "Our best option still has a risk level higher than our limit. "
                "We recommend collecting more data to get a clearer winner."
            )
        markdown_report += summary_text + "\n\n---\n"

        # 3. Key Metrics Explained
        markdown_report += "### Key Metrics Explained\n"
        prob_best_str = f"{best_candidate['Probability to be Best']:.1%}"
        markdown_report += (
            f"**🔹 Probability to be Best: {prob_best_str}**\n"
            f"   - *What it means:* This is the probability that Variant "
            f"'{best_candidate['Variant']}' is truly the best option among all variants "
            f"tested. A higher percentage means more confidence in it being the winner.\n\n"
        )
        risk_str = f"{best_candidate['Expected Loss (Risk)']:.4%}"
        markdown_report += (
            f"**🔹 Risk (Expected Loss): {risk_str}**\n"
            f"   - *What it means:* This is the 'cost of being wrong.' It represents the "
            f"average potential drop in conversion rate you would risk by choosing this "
            f"variant if another one was secretly better. A lower risk is better.\n\n"
        )
        markdown_report += (
            "**🔹 Expected Uplift**\n"
            "   - *What it means:* This is the expected percentage increase in the "
            "conversion rate of the winning variant compared to another. The table below "
            "shows this uplift and the **potential gain in conversions** for the same "
            "number of visitors.\n"
        )

        # 4. Expected Uplift Table
        required_cols_for_table = ['posterior_mean', 'reach', 'conversion']
        if is_winner and len(results_df) > 1 and all(col in results_df.columns for col in required_cols_for_table):
            markdown_report += "\n---\n"
            markdown_report += "### Expected Uplift vs. Other Variants\n"
            markdown_report += "| Compared To | Expected Uplift | Potential Conversion Gain | Total Expected Conversions |\n"
            markdown_report += "|:---|:---|:---|:---|\n"

            markdown_report += (
                f"| **Variant '{best_candidate['Variant']}' (Winner)** | **-** | "
                f"**{int(winner_conversions)} conversions (actual)** | "
                f"**{int(winner_conversions)}** |\n"
            )

            for _, other_variant in results_df.iloc[1:].iterrows():
                # Difference in posterior mean conversion rate between the
                # winner and this variant.
                mean_gap = best_candidate['posterior_mean'] - other_variant['posterior_mean']
                # Relative uplift, expressed against the other variant's rate.
                uplift = mean_gap / other_variant['posterior_mean']
                # The same gap expressed as conversions at the winner's reach.
                conversion_gain = mean_gap * best_candidate['reach']
                total_expected = winner_conversions + conversion_gain

                # ---- add row to Markdown table ----
                # The '+' format flag always prints the sign. The lowest-risk
                # variant is not guaranteed to have the highest posterior mean,
                # so a negative gap must render as "-x", not "+-x".
                markdown_report += (
                    f"| Variant '{other_variant['Variant']}' | {uplift:+.2%} | "
                    f"**{int(round(conversion_gain, 0)):+d}** more conversions | "
                    f"{int(round(total_expected, 0))} |\n"
                )

                # ---- save same values for df_uplift ----
                uplift_rows.append({
                    "Compared To": f"Variant '{other_variant['Variant']}'",
                    "Expected Uplift": round(float(uplift), 6),
                    "Conversion Gain": int(round(conversion_gain, 0)),
                    "Total Expected": int(round(total_expected, 0))
                })
            markdown_report += "\n"

        # --- Display Markdown report ---
        display(Markdown(markdown_report))

        # --- Create df_uplift DataFrame and (optionally) persist -----------------
        df_uplift = pd.DataFrame(uplift_rows)
        # df_uplift.to_csv('expected_uplift.csv', index=False)   # optional
        # df_uplift.to_pickle('expected_uplift.pkl')             # optional
        print("✅ df_uplift criado com", len(df_uplift), "linhas.")
        display(df_uplift)

    except Exception as e:
        print(f"❌ An unexpected error occurred while generating the report: {e}")


# Creating the report


### Project Configuration Panel

This cell launches a compact three‑step wizard that

1. **Validates your Gemini API key**,
2. **Lists compatible Gemini models for you to pick**, and
3. **Accepts a Markdown project file** while recording your preferred language.

Once the steps are complete, the file’s content is loaded into a DataFrame called `project_description_df`, making it immediately available to the rest of the notebook.


In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
import google.generativeai as genai
import os
import re

# --- Step containers and shared output area ---
# Only the API-key step is visible at first; the other two start hidden and
# are revealed by the button handlers as the wizard advances.
api_step_box = widgets.VBox()
model_step_box = widgets.VBox(layout=dict(display='none'))
upload_step_box = widgets.VBox(layout=dict(display='none'))
final_output_area = widgets.Output()

# --- Step 1: API Key Validation ---

# Masked text field for the Gemini API key, plus the button that validates it.
api_key_input = widgets.Password(
    description='Chave de API Gemini:',
)
validate_api_button = widgets.Button(
    button_style='info',
    description="Validar Chave de API",
)

def on_validate_api_clicked(b):
    """Check the entered API key and, if usable, advance to model selection.

    Configures ``genai`` with the key from ``api_key_input``, lists the models
    that support ``generateContent`` and whose name contains
    ``'models/gemini'``, loads them into ``model_selector`` and swaps the
    visible wizard step. All messages are printed inside
    ``final_output_area``.

    Args:
        b: The button instance passed in by ipywidgets; not used.
    """
    with final_output_area:
        clear_output(wait=True)
        print("⏳ Validando Chave de API...")
        try:
            genai.configure(api_key=api_key_input.value)

            # Collect, in listing order, the Gemini models that can generate content.
            filtered_models = []
            for model in genai.list_models():
                if 'generateContent' not in model.supported_generation_methods:
                    continue
                if 'models/gemini' in model.name:
                    filtered_models.append(model.name)

            if filtered_models:
                print("✅ Sucesso: Chave de API validada.")
                print(f"   - Encontrados {len(filtered_models)} modelos compatíveis.")

                # Populate the dropdown, then hide step 1 and reveal step 2.
                model_selector.options = filtered_models
                api_step_box.layout.display = 'none'
                model_step_box.layout.display = 'flex'
            else:
                print("❌ Erro: A Chave de API é válida, mas não foram encontrados modelos Gemini compatíveis.")

        except Exception as e:
            print(f"❌ Erro: Não foi possível validar a Chave de API. Detalhes: {e}")

# Fill the step 1 container and connect its button to the validation handler.
api_step_box.children = (
    widgets.HTML("<h3>Passo 1: Configurar API</h3>"),
    api_key_input,
    validate_api_button,
)
validate_api_button.on_click(on_validate_api_clicked)

# --- Step 2: Model Selection ---

# The dropdown starts without options; the API validation handler fills it.
model_selector = widgets.Dropdown(
    description='Selecionar Modelo:',
)
confirm_model_button = widgets.Button(
    button_style='info',
    description="Confirmar Modelo",
)

def on_confirm_model_clicked(b):
    """Acknowledge the chosen model and move the wizard to the upload step.

    Prints the current ``model_selector`` value inside ``final_output_area``,
    then hides the model step and shows the upload step.

    Args:
        b: The button instance passed in by ipywidgets; not used.
    """
    with final_output_area:
        clear_output(wait=True)
        print(f"✅ Modelo selecionado: {model_selector.value}")

    # Swap the visible step: hide step 2 first, then reveal step 3.
    for step_box, display_mode in ((model_step_box, 'none'), (upload_step_box, 'flex')):
        step_box.layout.display = display_mode

# Fill the step 2 container and connect its button to the confirmation handler.
model_step_box.children = (
    widgets.HTML("<h3>Passo 2: Selecionar Modelo de IA</h3>"),
    model_selector,
    confirm_model_button,
)
confirm_model_button.on_click(on_confirm_model_clicked)

# --- Step 3: File Upload and DataFrame Creation ---

# Language selector; English is preselected.
language_selector = widgets.Dropdown(
    description='Idioma:',
    value='English',
    options=[
        'English',           # 1
        'Português',         # 2
        'Español',           # 3
        'Français',          # 4
        'Deutsch',           # 5
        'Italiano',          # 6
        '日本語',             # 7 – Japanese
        '中文',               # 8 – Chinese
        'हिन्दी',            # 9 – Hindi
        'العربية'            # 10 – Arabic
    ],
)

# Upload control for the Markdown project file.
project_uploader = widgets.FileUpload(
    description='Carregar Ficheiro do Projeto',
    accept='.md',
)

# Button that triggers reading the uploaded file into a DataFrame.
create_df_button = widgets.Button(
    button_style='success',
    description="Carregar e Criar DataFrame",
)

def on_create_df_clicked(b):
    """Read the uploaded project file and store its text in a one-row DataFrame.

    On success the decoded file content is published as the module-level
    ``project_description_df`` (single column ``content``) and the upload step
    is hidden. All feedback is printed into ``final_output_area``.

    Args:
        b: The button instance supplied by the click event (unused).
    """
    global project_description_df

    with final_output_area:
        clear_output(wait=True)
        print("⏳ Processando ficheiro carregado...")

        uploads = project_uploader.value
        if not uploads:
            print("❌ Erro: Por favor, carregue um ficheiro.")
            return

        try:
            # ipywidgets 8 exposes uploads as a tuple of dicts, while
            # ipywidgets 7 uses a dict keyed by file name; accept both so the
            # notebook works regardless of the installed widget version.
            if isinstance(uploads, dict):
                uploaded_file_dict = next(iter(uploads.values()))
            else:
                uploaded_file_dict = uploads[0]

            # `content` is a memoryview (v8) or bytes (v7); bytes() covers both.
            project_content = bytes(uploaded_file_dict['content']).decode('utf-8')

            project_description_df = pd.DataFrame([{'content': project_content}])

            print("✅ Sucesso! O conteúdo do ficheiro do projeto foi lido e armazenado em `project_description_df`.")
            display(project_description_df)
            print("\n🚀 Tudo pronto. Pode prosseguir para a próxima célula.")
            upload_step_box.layout.display = 'none'

        except Exception as e:
            # Notebook UI boundary: report the problem instead of raising into
            # the widget event loop.
            print(f"❌ Ocorreu um erro: {e}")

# Wire the upload callback, then fill the step-3 container with its heading,
# the language selector, the file uploader and the action button.
create_df_button.on_click(on_create_df_clicked)
upload_step_box.children = [
    widgets.HTML("<h3>Passo 3: Carregar Ficheiro do Projeto e Selecionar Idioma</h3>"),
    language_selector,
    project_uploader,
    create_df_button
]

# --- Display the Final Control Panel ---
# Stack the three step containers above a separator and the shared output
# area; each callback toggles `layout.display` to move between steps.
full_panel = widgets.VBox([
    widgets.HTML("<h2 style='font-family: Arial, sans-serif;'>Configuração do Projeto</h2>"),
    api_step_box,
    model_step_box,
    upload_step_box,
    widgets.HTML("<hr>"),
    final_output_area
], layout={'border': '1px solid #ccc', 'padding': '15px', 'border_radius': '8px'})

display(full_panel)


### AI‑Powered Narrative Analysis

This cell brings together your project description and A/B test results, asks Gemini to craft a SCQA‑based story, and stores the slide‑by‑slide output in a DataFrame called `presentation_df` for the presentation builder that follows.


In [None]:
import pandas as pd
import json
import google.generativeai as genai
from IPython.display import display
import re

# This cell uses the LLM to analyze the test results and project context,
# then structures the output into a DataFrame that will be used to build
# the presentation in the next cell.

def analyze_and_create_dataframe():
    """
    Ask the selected Gemini model for an SCQA slide plan and store it as a DataFrame.

    Reads the module-level ``results_df`` (rendered as a Markdown table),
    ``project_description_df`` (first row of its ``content`` column),
    ``model_selector`` and ``language_selector``; sends one prompt that
    requests a JSON array describing seven slides; parses the reply with
    ``json.loads`` and publishes the result as the module-level
    ``presentation_df``.

    Progress and errors are printed rather than raised. Returns ``None`` in
    every case; when a prerequisite is missing the function stops before
    contacting the API.
    """
    global presentation_df

    print("⏳ Starting AI-powered analysis...")

    # --- 1. Pre-flight Checks ---
    if 'results_df' not in globals():
        print("❌ Error: The 'results_df' DataFrame was not found. Please run the previous analysis cells first.")
        return
    if 'project_description_df' not in globals():
        print("❌ Error: 'project_description_df' not found. Please run the previous cell to upload the project file.")
        return

    # A dropdown whose options were never filled has value None, so check the
    # type before calling str methods on it.
    selector = globals().get('model_selector')
    selected_model_name = getattr(selector, 'value', None)
    model_ready = (
        selector is not None
        and not selector.disabled
        and isinstance(selected_model_name, str)
        and selected_model_name.startswith('models/')
    )
    if not model_ready:
        print("❌ Error: An AI model has not been selected. Please validate your API key in the panel above to select a model.")
        return

    print("✅ Pre-flight checks passed.")

    # Kept outside the try so the error handler can tell whether the API call
    # got far enough to produce a response object.
    response = None

    try:
        # --- 2. Gather All Information ---
        print("   - Reading project context from DataFrame...")
        project_context = project_description_df['content'].iloc[0]
        results_data_string = results_df.to_markdown(index=False)

        # --- 3. Re-engineered Prompt for Structured Analysis ---
        prompt_instructions = """
        **Primary Goal**: Analyze the provided A/B test data and project context to create a structured presentation plan.

        **Your Task**:
        1.  **Synthesize the Narrative**: Read the `Project Context` and `A/B Test Data Results`. Formulate a coherent story using the SCQA (Situation, Complication, Question, Answer) framework.
        2.  **Populate the JSON Structure**: Based on your SCQA analysis, fill out the JSON object below.
        3.  **Output ONLY the JSON object.** Do not include any other text, markdown, or explanations.

        **Rules for Analysis**:
        * **Grounding is Mandatory**: Every value in the JSON MUST be derived directly from the provided context and data.
        * **No Hallucination**: DO NOT invent data. If a metric (e.g., revenue) is not in the source files, use an available metric (like 'conversions') or state it's not available.
        * **Plain Text Only**: The text in the "title" and "body" fields must be clean, plain text ready for a presentation. **Do not include any markdown formatting like `**`, `*`, `-`, or `###`.**
        * **Visual Data**: For the `visual_data` field, provide the specific numbers needed to create the chart.

        **JSON Output Structure (Follow these content instructions strictly)**:
        ```json
        [
          {
            "slide": 1,
            "title": "Our Context and the Key Question",
            "body": "For this slide's body, create a cohesive paragraph of plain text. Start by describing the factual Situation (including the baseline). Follow this by explaining the Complication (the problem or opportunity). Conclude by formulating the Key Question that needs to be answered. Do not use markdown formatting like 'Situation:' or '**'.",
            "visual_type": "delta_chart",
            "visual_data": {"from": 0.03, "to": 0.05, "label": "Performance Gap"}
          },
          {
            "slide": 2,
            "title": "Our Recommendation to Answer the Question",
            "body": "Present the direct, high-level answer that resolves the 'Key Question' from the previous slide. This is your main recommendation, the hypothesis that will be detailed in the following slides. Use plain text.",
            "visual_type": "icon",
            "visual_data": {"icon": "🏆"}
          },
          {
            "slide": 3,
            "title": "Variants Tested",
            "body": "List each variant with a brief description as provided in the project context. Present them as a clean, multi-line string, each variant on a new line. Do not use markdown bullet points.",
            "visual_type": "list",
            "visual_data": {}
          },
          {
            "slide": 4,
            "title": "Key Evidence: Why This Recommendation Works",
            "body": "Provide the top 2-3 evidence points that support your recommendation. Present them as a clean, multi-line string, with each point on a new line. Do not use markdown bullet points like '*' or '-'.",
            "visual_type": "comparison",
            "visual_data": {}
          },
          {
            "slide": 5,
            "title": "Risk Assessment of the Recommendation",
            "body": "Analyze the risk (Expected Loss) and compare it with the defined tolerance. Mention risks from the context that support the decision. Use plain text.",
            "visual_type": "gauge_meter",
            "visual_data": {"value": 0.0003, "threshold": 0.01}
          },
          {
            "slide": 6,
            "title": "Expected Business Impact",
            "body": "Translate the results into a tangible business outcome (e.g., conversion lift) that justifies the recommendation. Use plain text.",
            "visual_type": "waterfall_chart",
            "visual_data": {}
          },
          {
            "slide": 7,
            "title": "Immediate Action Plan",
            "body": "Provide 2-3 clear, actionable next steps to implement the 'Answer'. Present them as a clean, multi-line string, with each point on a new line. Do not use markdown bullet points.",
            "visual_type": "timeline",
            "visual_data": {}
          }
        ]
        ```
        """

        selected_language = language_selector.value
        print(f"   - Using AI model: {selected_model_name} in {selected_language}")

        final_prompt = f"""
        **Source Material**

        Here is the context and data you must use for your analysis. You are forbidden from using any information not present here.

        **1. Project Context:**
        ---
        {project_context}
        ---

        **2. A/B Test Data Results:**
        ---
        {results_data_string}
        ---

        **Your Task**

        Now, using ONLY the source material provided above, perform the following task.

        **Language**: {selected_language}
        **Instructions**: {prompt_instructions}
        """

        # --- 4. Call the Gemini API ---
        print("   - Requesting analysis from Gemini API...")
        model = genai.GenerativeModel(selected_model_name)

        # JSON mime type asks the model for a bare JSON document, so the reply
        # can be passed straight to json.loads below.
        generation_config = genai.types.GenerationConfig(
            max_output_tokens=8192,
            response_mime_type="application/json"
        )

        response = model.generate_content(
            final_prompt,
            generation_config=generation_config
        )

        print("   - Received structured analysis from Gemini.")
        report_text = response.text

        # --- 5. Create and Display DataFrame ---
        slides_data = json.loads(report_text)
        presentation_df = pd.DataFrame(slides_data)

        print("\n✅ Success! AI analysis complete. The `presentation_df` is ready for the next cell.")
        print("--- Presentation Plan DataFrame (First 5 Rows) ---")
        display(presentation_df.head())

    except Exception as e:
        # Notebook boundary: report instead of raising so the cell finishes.
        print(f"❌ An unexpected error occurred during AI analysis: {e}")
        if response is not None:
            print("\n--- Raw AI Response for Debugging ---")
            # Reading `.text` may be the very call that failed; never let the
            # debug dump raise a second error that hides the first one.
            try:
                print(response.text)
            except Exception as text_error:
                print(f"<raw response text unavailable: {text_error}>")

# --- Automatically run the analysis ---
# Executes every time this cell runs; on success the plan is published as the
# global `presentation_df` consumed by the presentation builder.
analyze_and_create_dataframe()


### PowerPoint Slide Generation

This cell takes the structured plan in `presentation_df` and automatically builds a tidy 16:9 PowerPoint deck—adding titles, body text, and data‑driven visuals like delta charts and risk gauges—then saves the file to a `reports/` folder for easy download.


In [None]:
import pandas as pd
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_ALIGN
from datetime import datetime
import os
import matplotlib.pyplot as plt
import io

# This cell uses the 'presentation_df' as a blueprint to programmatically generate
# a PowerPoint presentation, including data‑driven visuals.

# --- Helper Functions for Chart Generation ---

def create_bar_chart(visual_data, output_buffer):
    """Render a single-bar percentage chart as a PNG into ``output_buffer``.

    Args:
        visual_data: Mapping read for ``'label'`` (bar name, default
            ``'Label'``) and ``'value'`` (default ``0``), which is multiplied
            by 100 before plotting.
        output_buffer: Target passed to ``savefig`` (PNG, 150 dpi,
            transparent background).
    """
    bar_name = visual_data.get('label', 'Label')
    percent = visual_data.get('value', 0) * 100  # Convert to percentage

    fig, ax = plt.subplots(figsize=(4, 3))
    rects = ax.bar([bar_name], [percent], color='#4A90E2', width=0.5)

    ax.set_ylabel('Conversion Rate (%)')
    ax.set_ylim(0, max(10, percent * 1.2))
    ax.set_yticks([])
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    # Print each bar's value just above it.
    for rect in rects:
        height = rect.get_height()
        x_center = rect.get_x() + rect.get_width() / 2.0
        ax.text(x_center, height + 0.5, f'{height:.1f}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig(output_buffer, format='png', dpi=150, transparent=True)
    plt.close(fig)


def create_delta_chart(visual_data, output_buffer):
    """Render a two-bar "From" / "To" chart as a PNG into ``output_buffer``.

    Args:
        visual_data: Mapping read for ``'from'`` and ``'to'`` (both default
            ``0``, multiplied by 100 before plotting) and ``'label'`` (chart
            title, default ``'Delta'``).
        output_buffer: Target passed to ``savefig`` (PNG, 150 dpi,
            transparent background).
    """
    frm = visual_data.get('from', 0) * 100
    to = visual_data.get('to', 0) * 100
    label = visual_data.get('label', 'Delta')

    fig, ax = plt.subplots(figsize=(4, 3))
    bars = ax.bar(['From', 'To'], [frm, to],
                  color=['#4A90E2', '#F5A623'], width=0.5)
    ax.set_ylabel('Conversion Rate (%)')
    # Scale to the taller of the two bars; sizing on `to` alone clipped the
    # "From" bar whenever the metric went down.
    ax.set_ylim(0, max(10, max(frm, to) * 1.2))
    ax.set_title(label)
    ax.set_yticks([])
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    # Print each bar's value just above it.
    for bar in bars:
        yval = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2.0,
                yval + 0.5,
                f'{yval:.1f}%',
                ha='center',
                va='bottom')

    # Address the figure explicitly rather than relying on pyplot's notion of
    # the "current" figure.
    fig.tight_layout()
    fig.savefig(output_buffer, format='png', dpi=150, transparent=True)
    plt.close(fig)


def create_gauge_meter(visual_data, output_buffer):
    """Render a horizontal risk gauge as a PNG into ``output_buffer``.

    The gauge is a green band from 0 to the threshold followed by a red band,
    with an arrow and a text label marking the current risk value.

    Args:
        visual_data: Mapping read for ``'value'`` (default ``0``) and
            ``'threshold'`` (default ``0.01``); both are multiplied by 100
            before plotting.
        output_buffer: Target passed to ``savefig`` (PNG, 150 dpi,
            transparent background).
    """
    value = visual_data.get('value', 0) * 100
    threshold = visual_data.get('threshold', 0.01) * 100

    # The scale normally ends at twice the threshold. Widen it when the risk
    # lies beyond that point, otherwise the pointer is drawn outside the axes
    # and clipped away; fall back to a unit scale for a non-positive range.
    x_max = threshold * 2
    if value > x_max:
        x_max = value * 1.1
    if x_max <= 0:
        x_max = 1.0

    fig, ax = plt.subplots(figsize=(4, 2.5))
    ax.set_xlim(0, x_max)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([])

    # Draw the gauge band: safe zone up to the threshold, risk zone after it.
    ax.add_patch(plt.Rectangle((0, 0), threshold, 0.2,
                               color='#4CAF50', alpha=0.7))  # Green (Safe)
    ax.add_patch(plt.Rectangle((threshold, 0), x_max - threshold, 0.2,
                               color='#F44336', alpha=0.7))  # Red (Risk)

    # Draw the pointer; the head width scales with the axis so it keeps the
    # same apparent size (0.025 * x_max equals the former 0.05 * threshold on
    # the default scale).
    ax.arrow(value, 0.4, 0, -0.15,
             head_width=0.025 * x_max,
             head_length=0.05,
             fc='black',
             ec='black')
    ax.text(value, 0.45, f'Risk: {value:.3f}%',
            ha='center', fontsize=12, weight='bold')
    ax.text(threshold, -0.1, f'Threshold: {threshold:.1f}%',
            ha='center', fontsize=10)

    ax.set_frame_on(False)
    fig.tight_layout()
    fig.savefig(output_buffer, format='png', dpi=150, transparent=True)
    plt.close(fig)

# --- Main Presentation Generation Function ---

def create_presentation_from_dataframe():
    """Build a 16:9 PowerPoint deck from ``presentation_df`` and save it.

    One blank-layout slide is created per DataFrame row, using the columns
    ``title``, ``body``, ``visual_type`` and ``visual_data``. Rows whose
    ``visual_type`` is ``bar_chart``, ``delta_chart`` or ``gauge_meter`` get a
    rendered chart image; ``icon`` gets a large text glyph; every other type
    is text only. The deck is written to ``reports/`` under a timestamped
    file name.

    Progress and errors are printed rather than raised; the function always
    returns ``None``. A visual that cannot be rendered is skipped with a
    warning so that a single malformed row does not discard the whole deck.
    """
    print("⏳ Starting PowerPoint generation...")

    # --- 1. Pre-flight Check ---
    if 'presentation_df' not in globals() or not isinstance(presentation_df, pd.DataFrame):
        print("❌ Error: The 'presentation_df' DataFrame was not found.")
        print("   Please run the 'AI-Powered Narrative Analysis' cell successfully before proceeding.")
        return

    print(f"✅ Found presentation plan with {len(presentation_df)} slides.")

    # Chart renderers keyed by the plan's `visual_type`; all share the
    # (visual_data, output_buffer) signature and the same slide placement.
    chart_renderers = {
        'bar_chart': create_bar_chart,
        'delta_chart': create_delta_chart,
        'gauge_meter': create_gauge_meter,
    }

    try:
        # --- 2. Initialize Presentation ---
        prs = Presentation()
        prs.slide_width = Inches(10)
        prs.slide_height = Inches(5.625)

        # --- 3. Loop Through DataFrame and Create Slides ---
        print("   - Building slides from the DataFrame...")
        for _, row in presentation_df.iterrows():
            # The plan is LLM output: a field may be absent (NaN after the
            # DataFrame conversion) or of the wrong type, so normalise each
            # one before handing it to python-pptx.
            raw_title = row.get('title')
            slide_title = raw_title if isinstance(raw_title, str) else 'No Title Provided'
            raw_body = row.get('body')
            slide_body = raw_body if isinstance(raw_body, str) else ''
            raw_type = row.get('visual_type')
            visual_type = raw_type if isinstance(raw_type, str) else 'none'
            raw_visual = row.get('visual_data')
            visual_data = raw_visual if isinstance(raw_visual, dict) else {}

            slide_layout = prs.slide_layouts[6]  # Blank layout
            slide = prs.slides.add_slide(slide_layout)

            # --- Add Title ---
            title_shape = slide.shapes.add_textbox(
                Inches(0.5), Inches(0.2), Inches(9), Inches(0.75))
            title_frame = title_shape.text_frame
            title_frame.word_wrap = True
            title_frame.text = slide_title
            for paragraph in title_frame.paragraphs:
                paragraph.font.bold = True
                paragraph.font.size = Pt(28)

            # --- Add Body Content ---
            body_shape = slide.shapes.add_textbox(
                Inches(0.5), Inches(1.0), Inches(5.5), Inches(4.0))
            body_frame = body_shape.text_frame
            body_frame.word_wrap = True
            body_frame.text = slide_body
            # Multi-line bodies become several paragraphs; size every one of
            # them, not just the first, so the text is uniform.
            for paragraph in body_frame.paragraphs:
                paragraph.font.size = Pt(16)

            # --- Add Visuals ---
            # Visual types 'comparison', 'waterfall_chart', 'timeline', 'list'
            # are handled with text only for now.
            try:
                renderer = chart_renderers.get(visual_type)
                if renderer is not None:
                    buffer = io.BytesIO()
                    renderer(visual_data, buffer)
                    buffer.seek(0)  # rewind so the image is read from the start
                    slide.shapes.add_picture(
                        buffer, Inches(6.0), Inches(1.5), width=Inches(3.5))
                elif visual_type == 'icon':
                    icon_shape = slide.shapes.add_textbox(
                        Inches(6.5), Inches(2.0), Inches(2.0), Inches(2.0))
                    icon_frame = icon_shape.text_frame
                    icon_frame.text = str(visual_data.get('icon', ' '))
                    p = icon_frame.paragraphs[0]
                    p.font.size = Pt(96)
                    p.alignment = PP_ALIGN.CENTER
            except Exception as visual_error:
                # Keep the slide's text and carry on with the rest of the deck.
                print(f"   ⚠️ Skipped '{visual_type}' visual on slide '{slide_title}': {visual_error}")

        # --- 4. Save Presentation to 'reports' Folder ---
        reports_dir = 'reports'
        os.makedirs(reports_dir, exist_ok=True)

        report_filename = f"AI_Generated_Report_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pptx"
        report_path = os.path.join(reports_dir, report_filename)

        prs.save(report_path)
        print(f"\n✅ Success! Presentation saved to: '{report_path}'")

    except Exception as e:
        # Notebook boundary: report instead of raising so the cell finishes.
        print(f"❌ An unexpected error occurred during presentation generation: {e}")

# --- Automatically run the presentation creation ---
# Executes every time this cell runs; a new timestamped .pptx file is written
# to the 'reports' folder on each successful run.
create_presentation_from_dataframe()


# Chat


### LangChain Vector Store Build
This cell disables TensorFlow/Keras for clean transformer imports, validates the required DataFrames, converts each to `Document`s, chunks them, embeds with `all-MiniLM-L6-v2`, and stores everything in a local FAISS index (`vector_store/`) exposed as `vector_store` for the chat interface.


In [None]:
# --- Disable TensorFlow/Keras before importing transformers -------------------
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["USE_TF"] = "0"

# Ensure the required packages are installed (run once per environment)
# !pip install -q -U langchain-huggingface sentence-transformers

# --- LangChain Vector DB: tokenization and indexing of DataFrames -------------
# Prerequisite: df, results_df, df_uplift, presentation_df, project_description_df
# must already exist in memory.

import pandas as pd
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# 1. Validation ----------------------------------------------------------------
# Maps each required global DataFrame name to a human-readable label; the label
# is embedded in the document text and metadata below.
required_dfs = {
    "df":                       "Raw data (XLSX)",
    "results_df":               "Overall results",
    "df_uplift":                "Expected Uplift vs. Other Variants",
    "presentation_df":          "Presentation data",
    "project_description_df":   "Project description"
}
# Fail fast with the complete list of missing names rather than a NameError
# on the first one encountered.
missing = [name for name in required_dfs if name not in globals()]
if missing:
    raise ValueError(f"The following DataFrames are missing in the session: {missing}")

# 2. Convert each DataFrame into a single Document -----------------------------
# Each frame is serialised as CSV text under a header line naming its source.
docs = []
for name, label in required_dfs.items():
    frame = globals()[name]
    df_as_text = frame.to_csv(index=False)
    full_text = f"### DataFrame: {name} – {label}\n{df_as_text}"
    docs.append(
        Document(
            page_content=full_text,
            metadata={"source_df": name, "description": label}
        )
    )

# 3. Chunking ------------------------------------------------------------------
# Split into chunks of up to 1000 characters with a 100-character overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_docs = splitter.split_documents(docs)

# 4. Embeddings ----------------------------------------------------------------
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 5. Vector Store --------------------------------------------------------------
vector_db = FAISS.from_documents(split_docs, embeddings)
vector_store = vector_db           # <-- alias required by the chat interface
vector_db.save_local("vector_store")

# The reload hint passes allow_dangerous_deserialization=True: recent LangChain
# releases refuse to load the pickled index without it. Only use that flag on
# an index you created yourself.
print("✅ Vector store created in 'vector_store/'.")
print("   To reload later:")
print("   from langchain.vectorstores import FAISS")
print("   vec = FAISS.load_local('vector_store', embeddings, allow_dangerous_deserialization=True)")
print("   vector_store = vec   # make it visible to the chat function")


### Gemini Chat Model Selector
This cell lists the Gemini models available to your API key, lets you pick one via a dropdown, and—on confirmation—stores the choice in `chat_model_name` for use by later chat cells.


In [None]:
# --- Select a Gemini model for chat ------------------------------------------
import ipywidgets as widgets
from IPython.display import display, clear_output
import google.generativeai as genai

# Fetch the models visible to the configured API key, keeping only those that
# list "generateContent" among their supported generation methods.
available_models = [
    m.name for m in genai.list_models()
    if "generateContent" in m.supported_generation_methods
]
# Narrow the list to names containing "models/gemini".
gemini_models = [m for m in available_models if "models/gemini" in m]

# Stop the cell here when nothing usable is available; the dropdown below
# needs at least one entry for its default value.
if not gemini_models:
    raise RuntimeError("❌ No compatible Gemini models found in your account.")

# Dropdown: choose the model (pre-selects the first entry)
chat_model_selector = widgets.Dropdown(
    options=gemini_models,
    value=gemini_models[0],
    description="Chat model:"
)

# Confirm button
confirm_chat_model_button = widgets.Button(
    description="Use this model",
    button_style="info"
)

# Output area for feedback printed by the confirmation callback
chat_selector_output = widgets.Output()

def on_confirm_chat_model_clicked(_):
    """Confirm the dropdown's current model on screen and publish it.

    The selection is stored in the module-level name ``chat_model_name`` so
    that later cells can read it.
    """
    global chat_model_name

    with chat_selector_output:
        clear_output(wait=True)
        print(f"✅ Chat model selected: {chat_model_selector.value}")

    # Publish the choice for the cells that follow.
    chat_model_name = chat_model_selector.value

# Run the confirmation callback whenever the button is clicked.
confirm_chat_model_button.on_click(on_confirm_chat_model_clicked)

# Display the UI: heading, dropdown, button and feedback area in one bordered box
display(
    widgets.VBox(
        [
            widgets.HTML("<h3>Select a Model for the Chat</h3>"),
            chat_model_selector,
            confirm_chat_model_button,
            chat_selector_output,
        ],
        layout={"border": "1px solid #ccc", "padding": "10px", "border_radius": "6px"},
    )
)


### Prompt & Basic Config Setup
This cell defines the system prompt that disambiguates “conversion” vs. “conversion rate” and configures Gemini (model name + generation settings). It ensures later chat calls use only notebook‑scoped context and report the correct metric explicitly.


In [None]:
# --- 1. Prompt --------------------------------------

import google.generativeai as genai

SYSTEM_PROMPT = """
### 1. Persona e Objetivo Principal
- Você é um analista de dados especialista, focado em um projeto de teste A/B.
- Seu objetivo principal é responder a perguntas usando os dados fornecidos sobre o projeto.
- Você pode e deve realizar cálculos matemáticos (somas, médias, percentuais, etc.) usando os números encontrados nos dados de contexto para responder às perguntas do usuário.

### 2. Contexto e Regras Fundamentais
- **Fonte da Verdade:** Baseie TODAS as suas respostas e cálculos estritamente nos números e informações encontradas nos chunks de texto da seção "Notebook Context".
- **Regra para Informação Ausente:** Se os dados necessários para uma resposta ou cálculo não estiverem no contexto, afirme claramente qual informação está faltando. Não invente números.

### 3. Lógica e Processo de Raciocínio
- **Definições de Termos:** Siga rigorosamente estas definições:
  - 'conversão' ou 'conversões': Refere-se ao número absoluto de conversões (geralmente da coluna 'conversion').
  - 'taxa de conversão', 'taxa' ou 'rate': Refere-se à métrica de proporção/probabilidade (geralmente de colunas como 'posterior_mean').
- **Regra para Lidar com Ambiguidade:** Se o usuário perguntar por "conversão" sem especificar "taxa", sua resposta deve, por padrão, ser sobre o número absoluto.
- **Regra de Suposição para Ambiguidade Persistente:** Se uma pergunta do usuário, mesmo depois de reescrita, permanecer ambígua (ex: "compare as variantes"), você deve primeiro declarar a suposição que está fazendo para poder respondê-la. Não responda diretamente sem antes validar sua premissa.

### 4. Formato de Saída
- Responda sempre em português brasileiro.
- Ao apresentar um dado numérico, sempre especifique a métrica que ele representa.
- **Mostre seu trabalho:** Quando realizar um cálculo, explique as etapas de forma simples para que o usuário possa entender seu raciocínio. Exemplo: "Para calcular o total, somei as conversões de todas as variantes (A+B+C+D), que resultou em X. O percentual da variante D é (587 / X) * 100, que é Y%."
- **Formato da Suposição:** Ao declarar uma suposição, sua resposta deve ser APENAS a pergunta de validação. Exemplo: "Sua pergunta é um pouco ambígua. Estou supondo que você quer uma comparação das taxas de conversão. É isso mesmo que você deseja?"
"""

# --- Guias de Estilo para os Modos de Conversa ---
STANDARD_STYLE_GUIDE = """
### Guia de Estilo de Resposta (Modo Padrão):
- Responda de forma direta e informativa, como se estivesse em uma apresentação de resultados para uma audiência geral.
- Foque na informação principal. NÃO mencione fontes de dados como "Chunk 3" ou nomes técnicos de colunas como 'posterior_mean'.
- A linguagem deve ser natural e limpa.
"""

ANALYST_STYLE_GUIDE = """
### Guia de Estilo de Resposta (Modo Analista):
- Responda como se estivesse conversando com um colega analista de dados.
- Seja detalhista. É encorajado e esperado que você cite as fontes dos dados (ex: "No DataFrame df_uplift...") e os nomes das colunas/métricas (ex: 'posterior_mean', 'uplift').
- A transparência sobre a metodologia e a origem dos dados é mais importante que a brevidade.
"""

REPHRASE_QUESTION_PROMPT = """Dada a seguinte conversa e uma pergunta de acompanhamento, reformule a pergunta de acompanhamento para ser uma pergunta autônoma que possa ser compreendida sem o histórico da conversa.

Histórico da Conversa:
{chat_history}

Pergunta de Acompanhamento:
{question}

Pergunta Autônoma:
"""

INTENT_CLASSIFICATION_PROMPT = """Sua tarefa é classificar a intenção da mensagem do usuário em uma de duas categorias. Responda com uma única palavra: QUERY ou CHITCHAT.

QUERY: O usuário está fazendo uma pergunta, pedindo dados, cálculos ou uma análise.
CHITCHAT: O usuário está fazendo um cumprimento, agradecimento, despedida ou uma conversa casual não relacionada a dados.

---
Exemplos:

Mensagem do Usuário: qual a taxa de conversão da variante C?
Sua Classificação: QUERY

Mensagem do Usuário: muito obrigado!
Sua Classificação: CHITCHAT

Mensagem do Usuário: compare os resultados
Sua Classificação: QUERY

Mensagem do Usuário: olá, tudo bem?
Sua Classificação: CHITCHAT

Mensagem do Usuário: gracias
Sua Classificação: CHITCHAT
---

Classifique a seguinte mensagem:

Mensagem do Usuário: {user_message}
Sua Classificação:
"""

# Model name and generation settings.
# `chat_model_name` is read once, when this cell runs; if it has not been set
# yet, the hard-coded default model name below is used instead.
model_name = globals().get("chat_model_name", "models/gemini-1.5-pro-latest")
generation_config = genai.GenerationConfig(max_output_tokens=2048)

### Chatbot Functions & Interface Setup
This cell imports the necessary libraries, instantiates the Gemini model using your previously selected configuration, and defines helper routines to:

1. Retrieve the top context chunks from your FAISS vector store,
2. Disambiguate whether the user is asking for conversion counts or rates,
3. Build the full system + context + user prompt, and
4. Call the LLM to generate a response.

It then wires everything up into a Gradio `ChatInterface` titled “Gemini Assistant — RAG (Vector Store)” and launches it for interactive querying.

In [None]:
# --- 2. Funções, chatbot e interface ---------------------------------------
import re
import gradio as gr
from IPython.display import HTML, display
import pandas as pd
from typing import List

# Create the Gemini model using the `model_name` chosen in the previous cell
chat_model = genai.GenerativeModel(model_name)

# Conversation mode switch ("STANDARD" or "FULL_VIEW"); the chat handler
# changes it when it receives the "#full_view" / "#standard_mode" commands.
chat_mode = "STANDARD"

# --- Helper functions used by the chat handler ---
def _classify_intent(user_message: str) -> str:
    prompt = INTENT_CLASSIFICATION_PROMPT.format(user_message=user_message)
    try:
        response = chat_model.generate_content(prompt)
        classification = response.text.strip().upper()
        if classification == "CHITCHAT":
            return "CHITCHAT"
        else:
            return "QUERY"
    except Exception as e:
        print(f"⚠️ Erro ao classificar intenção: {e}")
        return "QUERY"

def _generate_standalone_question(question: str, history: list) -> str:
    if not history:
        return question
    formatted_history = ""
    for user_msg, bot_msg in history:
        formatted_history += f"Usuário: {user_msg}\nAssistente: {bot_msg}\n"
    prompt = REPHRASE_QUESTION_PROMPT.format(chat_history=formatted_history, question=question)
    try:
        response = chat_model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        print(f"⚠️ Erro ao gerar pergunta autônoma: {e}")
        return question

def _format_chat_history(history: list, num_turns: int = 2) -> str:
    if not history:
        return ""
    recent_history = history[-num_turns:]
    formatted_history = "### Histórico da Conversa Recente:\n"
    for user_msg, bot_msg in recent_history:
        formatted_history += f"Usuário: {user_msg}\nAssistente: {bot_msg}\n"
    return formatted_history.strip()

def _docs_to_context(docs: List):
    return "\n\n".join(
        f"```chunk {i} | source_df={d.metadata.get('source_df')}\n{d.page_content}\n```"
        for i, d in enumerate(docs, 1)
    ) or "<no relevant context>"


def gemini_chat(message: str, history: list):
    """Handle one chat turn for the Gradio interface, yielding the reply text.

    Flow:
      1. ``#full_view`` / ``#standard_mode`` switch the module-level
         ``chat_mode`` and yield a confirmation; nothing else runs.
      2. Otherwise the message is classified with ``_classify_intent``.
         ``"CHITCHAT"`` gets a short reply straight from the model.
      3. Any other intent is answered as a query: the question is rewritten
         with ``_generate_standalone_question``, the global ``vector_store``
         is searched (k=8), and the final prompt is assembled from
         ``SYSTEM_PROMPT``, the style guide for the current mode, the recent
         history, the retrieved chunks and the rewritten question.

    Args:
        message: Text typed by the user.
        history: Earlier turns as ``(user_msg, bot_msg)`` pairs, as unpacked
            by the history helpers.

    Yields:
        Exactly one string per call: the answer, a mode confirmation, or a
        human-readable error message. Errors from the model or the vector
        store are reported this way instead of being raised.

    Note:
        ``chat_mode`` is module-level state, so a mode switch affects every
        later call to this function in the same process.
    """
    global chat_mode

    # --- Mode-switch commands -------------------------------------------
    command = message.strip()
    if command == "#full_view":
        chat_mode = "FULL_VIEW"
        yield "✅ Modo Analista (`full_view`) ativado. As próximas respostas incluirão fontes e detalhes técnicos."
        return
    if command == "#standard_mode":
        chat_mode = "STANDARD"
        yield "✅ Modo Padrão (`standard_mode`) ativado. As próximas respostas serão diretas e informativas."
        return

    intent = _classify_intent(message)
    print(f"--- DEBUG: Modo Atual: {chat_mode} | Intenção Detectada: {intent} ---")

    # --- Small talk: answer directly, no retrieval ------------------------
    if intent == "CHITCHAT":
        chitchat_prompt = f"Você é um assistente prestativo. O usuário disse: '{message}'. Responda de forma breve e educada."
        try:
            reply = chat_model.generate_content(chitchat_prompt).text.strip()
        except Exception as e:
            reply = f"⚠️ Erro ao gerar resposta de chitchat: {e}"
        yield reply
        return

    # --- Query: retrieve context, then ask the model ----------------------
    if "vector_store" not in globals():
        yield "❌ vector_store not found."
        return

    standalone_question = _generate_standalone_question(message, history)
    print(f"--- DEBUG: Pergunta Reescreita: '{standalone_question}' ---")

    try:
        docs = [
            doc
            for doc, _score in vector_store.similarity_search_with_score(standalone_question, k=8)
        ]
    except Exception:
        # Deliberate best-effort: if the scored search fails, retry with the
        # plain search. The retry needs its own handler -- a second `except`
        # clause on the outer `try` would never run, and a failure here would
        # otherwise escape the generator.
        try:
            docs = vector_store.similarity_search(standalone_question, k=8)
        except Exception as e:
            yield f"<error querying vector_store: {e}>"
            return

    chat_history_for_prompt = _format_chat_history(history)
    notebook_context = _docs_to_context(docs)
    metric_hint = "auto"

    # Pick the style guide for the current mode (anything but FULL_VIEW is
    # treated as STANDARD).
    style_guide = ANALYST_STYLE_GUIDE if chat_mode == "FULL_VIEW" else STANDARD_STYLE_GUIDE

    prompt = (
        f"{SYSTEM_PROMPT}\n\n"
        f"{style_guide}\n\n"
        f"{chat_history_for_prompt}\n\n"
        f"### Documentos de Contexto (Resultados da Busca):\n{notebook_context}\n\n"
        f"Metric hint: {metric_hint}\n\n"
        f"### Pergunta a ser Respondida:\n{standalone_question}"
    )

    try:
        resp = chat_model.generate_content(prompt, generation_config=generation_config, stream=False)
        candidates = resp.candidates
        if candidates and candidates[0].content.parts:
            # Only the first part of the first candidate is used.
            answer = candidates[0].content.parts[0].text.strip()
        else:
            # No text came back: surface the finish reason when there is one.
            fr_code = candidates[0].finish_reason.name if candidates else "N/A"
            answer = f"⚠️ O modelo não produziu texto (finish_reason={fr_code})."
    except Exception as e:
        answer = f"⚠️ Erro ao chamar o Gemini: {e}"
    yield answer

# --- 3. Interface block ------------------------------------------------------
# Builds the conversational interface around gemini_chat and enables its queue.
# NOTE(review): retry_btn / undo_btn / clear_btn look like Gradio 4.x
# ChatInterface arguments and may not be accepted by newer releases --
# confirm against the installed Gradio version.

chat_ui = gr.ChatInterface(
    fn=gemini_chat,
    title="Gemini Assistant com Modos de Análise",
    description="Faça perguntas sobre o contexto. Use #full_view ou #standard_mode para alterar o estilo da resposta.",
    examples=[
        ["Qual foi a taxa de conversão do último trimestre?"],
        ["#full_view"],
        ["#standard_mode"]
    ],
    chatbot=gr.Chatbot(height=500),
    retry_btn=None,
    undo_btn="Desfazer",
    clear_btn="Limpar"
).queue()

# Start the interface from the notebook or script.
# debug=True is useful for seeing the logs (such as the print output) in the terminal.
chat_ui.launch(debug=True, share=False)