In [None]:
from typing import Dict, Any
import json
import time
import random 

def read_json(file_path: str) -> Dict[str, Any]:
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, encoding="utf8") as handle:
        parsed = json.loads(handle.read())
    return parsed

def sleep_random(min, range):
    """Block for ``min`` seconds plus a random fraction of ``range`` seconds.

    The parameter names shadow the builtins of the same name inside this
    function; they are kept because callers may pass them by keyword.
    """
    jitter = random.random() * range
    time.sleep(min + jitter)


In [None]:
from google import genai

# Read the Gemini settings from a JSON file located one directory above the notebook.
config = read_json('../gemini_config10.json')
# The key comes from the file's "api_key" entry rather than being written in the notebook.
API_KEY = config["api_key"]
# Client object used by the embedding cells further down.
client = genai.Client(api_key=API_KEY)

# result = client.models.embed_content(
#         model="gemini-embedding-exp-03-07",
#         contents="What is the meaning of life?")

# print(result.embeddings[0].values)

In [None]:
import logging
import os
import pandas as pd

# The except-branch below reports failures through `logger`. No cell defined it, so the
# first failing file raised NameError inside the handler and stopped the whole run.
logger = logging.getLogger(__name__)

# Recap agenda
source_dir = "../dataset/recap_agenda_title"
transcript_dir = "../dataset/AMI_MS_Cleaned"
dirs = os.listdir(source_dir)

# Prepare data storage: two parallel lists that become the CSV columns
data = {
    "Item": [],
    "Embedding_Vector": []
}

# The checkpoint directory is loop-invariant, so create it once up front
output_dir = "example_outputs/gemini"
os.makedirs(output_dir, exist_ok=True)

# Defined before the loop so the summary at the bottom works even if no file succeeds
df = pd.DataFrame(data)

# Newest checkpoint written so far. Tracking it explicitly (instead of assuming
# iteration idx-1 succeeded) keeps old checkpoints from piling up after a failure.
last_saved_path = None

for idx, item in enumerate(dirs):  # `idx` rather than `id`, which shadowed the builtin
    try:
        path = os.path.join(os.getcwd(), source_dir, item)
        jsondict = read_json(path)
        agenda = jsondict["agenda"]

        result = client.models.embed_content(
            model="gemini-embedding-exp-03-07",
            contents=agenda)
        print(result.embeddings[0].values)

        # Store results
        data["Item"].append(item)
        data["Embedding_Vector"].append(result.embeddings[0].values)

        # Checkpoint everything embedded so far before waiting
        df = pd.DataFrame(data)
        new_output_path = os.path.join(output_dir, f"embeddings_{idx}.csv")
        df.to_csv(new_output_path, index=False)
        print(f"Embeddings saved to {new_output_path}")

        # The new file supersedes the previous checkpoint
        stale_path, last_saved_path = last_saved_path, new_output_path
        if stale_path and os.path.exists(stale_path):
            os.remove(stale_path)
            print(f"Removed old file: {stale_path}")

        # Wait before the next request, now that the checkpoint is on disk
        sleep_random(15, 20)

    except Exception:
        # Log with traceback and continue with the next file
        logger.exception("Error occurred with file: %s", item)

# Display first few rows
print("\nFirst few rows of the saved data:")
print(df.head())

In [None]:
import logging
import os
import pandas as pd

# The except-branch below reports failures through `logger`. No cell defined it, so the
# first failing file raised NameError inside the handler and stopped the whole run.
logger = logging.getLogger(__name__)

# Recap agenda
source_dir = "../dataset/recap_agenda_title"
transcript_dir = "../dataset/AMI_MS_Cleaned"
# File names are listed from source_dir and then looked up under related_docs_dir below
dirs = os.listdir(source_dir)

# Prepare data storage: two parallel lists that become the CSV columns
data = {
    "Item": [],
    "Embedding_Vector": []
}

related_docs_dir = '../dataset/truncated_single_input_agenda'

# The checkpoint directory is loop-invariant, so create it once up front
output_dir = "example_outputs/gemini"
os.makedirs(output_dir, exist_ok=True)

# Defined before the loop so the summary at the bottom works even if no file succeeds
df = pd.DataFrame(data)

# Newest checkpoint written so far. Tracking it explicitly (instead of assuming
# iteration idx-1 succeeded) keeps old checkpoints from piling up after a failure.
last_saved_path = None

for idx, item in enumerate(dirs):  # `idx` rather than `id`, which shadowed the builtin
    try:
        related_docs_path = os.path.join(os.getcwd(), related_docs_dir, item)
        related_docs_dict = read_json(related_docs_path)
        related_docs = related_docs_dict["truncate_shared_docs"]

        result = client.models.embed_content(
            model="gemini-embedding-exp-03-07",
            contents=related_docs)
        print(result.embeddings[0].values)

        # Store results
        data["Item"].append(item)
        data["Embedding_Vector"].append(result.embeddings[0].values)

        # Checkpoint everything embedded so far before waiting
        df = pd.DataFrame(data)
        new_output_path = os.path.join(output_dir, f"shared_docs_embeddings_{idx}.csv")
        df.to_csv(new_output_path, index=False)
        print(f"Embeddings saved to {new_output_path}")

        # The new file supersedes the previous checkpoint
        stale_path, last_saved_path = last_saved_path, new_output_path
        if stale_path and os.path.exists(stale_path):
            os.remove(stale_path)
            print(f"Removed old file: {stale_path}")

        # Wait before the next request, now that the checkpoint is on disk
        sleep_random(15, 20)

    except Exception:
        # Log with traceback and continue with the next file
        logger.exception("Error occurred with file: %s", item)

# Display first few rows
print("\nFirst few rows of the saved data:")
print(df.head())

In [None]:
# NOTE(review): `transcript` is not assigned by any cell visible in this notebook, so on a
# fresh kernel this raises NameError — confirm where it should come from, or drop the cell.
transcript


In [None]:
# Shows `agenda` as left by the recap-agenda embedding loop (the last file it read successfully).
agenda

In [None]:
# Shows `related_docs` as left by the shared-docs embedding loop (the last file it read successfully).
related_docs

In [None]:
# NOTE(review): `recap_agenda_embeddings` is first loaded by the PCA cell that follows,
# so this cell only works after that cell has been run.
# Selects the "Item" column: despite its name, this variable holds item names, not vectors.
recap_embeddings = recap_agenda_embeddings["Item"]
recap_embeddings

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# Load the three embedding tables. Each is read below through its "Embedding_Vector"
# column, which holds one stringified vector per row.
# NOTE(review): presumably renamed copies of the checkpoints written by the embedding cells — confirm.
recap_agenda_embeddings = pd.read_csv("example_outputs/gemini/recap_agenda_embeddings.csv")
shared_docs_embeddings = pd.read_csv("example_outputs/gemini/shared_docs_embeddings.csv")
transcript_embeddings = pd.read_csv("example_outputs/gemini/transcript_embeddings.csv")

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse each table's stringified vectors into one float matrix per source.
# `transcript_embeddings` is rebound here from the DataFrame to its matrix.
recap_embeddings = np.stack([convert_embedding(text) for text in recap_agenda_embeddings["Embedding_Vector"]])
shared_embeddings = np.stack([convert_embedding(text) for text in shared_docs_embeddings["Embedding_Vector"]])
transcript_embeddings = np.stack([convert_embedding(text) for text in transcript_embeddings["Embedding_Vector"]])

# Fit a single 2-component PCA on all rows so the three groups share the same axes
all_embeddings = np.vstack([recap_embeddings, shared_embeddings, transcript_embeddings])
pca = PCA(n_components=2)
pca_result = pca.fit_transform(all_embeddings)

# Rows were stacked recap -> shared -> transcript, so cut at the two group boundaries
n_recap = len(recap_embeddings)
n_shared = len(shared_embeddings)
recap_pca, shared_pca, transcript_pca = np.split(pca_result, [n_recap, n_recap + n_shared])

# One scatter layer per group
plt.figure(figsize=(10, 8))
for group_pca, group_color, group_label in (
    (recap_pca, 'blue', 'Recap Agenda'),
    (shared_pca, 'red', 'Shared Docs'),
    (transcript_pca, 'green', 'Transcript'),
):
    plt.scatter(group_pca[:, 0], group_pca[:, 1], c=group_color, label=group_label, alpha=0.6)

# Axis labels, title, legend and grid
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA Visualization of Embeddings')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

plt.show()

# Share of variance captured by the two plotted components
variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {variance_ratio}")
print(f"Total explained variance: {sum(variance_ratio):.4f}")

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# Load the three embedding tables; the cell below filters them on "Item" and reads
# their "Embedding_Vector" column.
recap_agenda_embeddings = pd.read_csv("example_outputs/gemini/recap_agenda_embeddings.csv")
shared_docs_embeddings = pd.read_csv("example_outputs/gemini/shared_docs_embeddings.csv")
transcript_embeddings = pd.read_csv("example_outputs/gemini/transcript_embeddings.csv")

# Function to filter items ending with 'a' (before .json)
def filter_type_a(df):
    """Return the rows of ``df`` whose ``Item`` name ends in ``a.json``.

    Args:
        df: DataFrame with a string column ``Item`` holding file names.

    Returns:
        A new DataFrame containing only the matching rows, with an added
        ``Type`` column holding the character found before ``.json``.
        The input frame is left unchanged.
    """
    # Character immediately before '.json'; NaN when the name does not end in '.json'
    item_type = df['Item'].str.extract(r'(\w)\.json$', expand=False)
    # assign() works on a copy, so the caller's frame no longer gains a 'Type' column as a side effect
    return df.assign(Type=item_type)[item_type == 'a']

# Keep only the rows whose item name ends in 'a.json' in each of the three tables
recap_agenda_filtered = filter_type_a(recap_agenda_embeddings)
shared_docs_filtered = filter_type_a(shared_docs_embeddings)
transcript_filtered = filter_type_a(transcript_embeddings)

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse the stringified vectors of each filtered table into a matrix.
# A table with no remaining rows becomes an empty array (size 0) instead of being stacked,
# because np.stack cannot stack an empty sequence.
recap_embeddings = np.stack(recap_agenda_filtered["Embedding_Vector"].apply(convert_embedding).values) if not recap_agenda_filtered.empty else np.array([])
shared_embeddings = np.stack(shared_docs_filtered["Embedding_Vector"].apply(convert_embedding).values) if not shared_docs_filtered.empty else np.array([])
transcript_embeddings = np.stack(transcript_filtered["Embedding_Vector"].apply(convert_embedding).values) if not transcript_filtered.empty else np.array([])

# Combine all embeddings for PCA (only include non-empty arrays); order stays recap, shared, transcript
all_embeddings_list = [emb for emb in [recap_embeddings, shared_embeddings, transcript_embeddings] if emb.size > 0]
if not all_embeddings_list:
    print("No embeddings found after filtering for type 'a'.")
else:
    all_embeddings = np.vstack(all_embeddings_list)

    # Fit one 2-component PCA on the combined rows
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(all_embeddings)

    # Split the results back into their respective groups.
    # A group that was skipped above contributes 0 rows, so the slice offsets still line up.
    n_recap = len(recap_embeddings) if recap_embeddings.size > 0 else 0
    n_shared = len(shared_embeddings) if shared_embeddings.size > 0 else 0

    recap_pca = pca_result[:n_recap] if n_recap > 0 else np.array([])
    shared_pca = pca_result[n_recap:n_recap + n_shared] if n_shared > 0 else np.array([])
    # Whatever remains after the recap and shared rows belongs to the transcript group
    transcript_pca = pca_result[n_recap + n_shared:] if (len(all_embeddings) - n_recap - n_shared) > 0 else np.array([])

    # Create scatter plot; groups without points are not drawn
    plt.figure(figsize=(10, 8))
    if recap_pca.size > 0:
        plt.scatter(recap_pca[:, 0], recap_pca[:, 1], c='blue', label='Recap Agenda (Type a)', alpha=0.6)
    if shared_pca.size > 0:
        plt.scatter(shared_pca[:, 0], shared_pca[:, 1], c='red', label='Shared Docs (Type a)', alpha=0.6)
    if transcript_pca.size > 0:
        plt.scatter(transcript_pca[:, 0], transcript_pca[:, 1], c='green', label='Transcript (Type a)', alpha=0.6)

    # Add labels and title
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title('PCA Visualization of Embeddings (Type a Only)')
    plt.legend()

    # Add grid
    plt.grid(True, linestyle='--', alpha=0.7)

    # Show plot
    plt.show()

    # Print the share of variance captured by each plotted component, and their total
    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
    print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.4f}")

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# Load the three embedding tables; their "Embedding_Vector" column is parsed below
recap_agenda_embeddings = pd.read_csv("example_outputs/gemini/recap_agenda_embeddings.csv")
shared_docs_embeddings = pd.read_csv("example_outputs/gemini/shared_docs_embeddings.csv")
transcript_embeddings = pd.read_csv("example_outputs/gemini/transcript_embeddings.csv")

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse each table's stringified vectors into one float matrix per source.
# `transcript_embeddings` is rebound here from the DataFrame to its matrix.
recap_embeddings = np.stack([convert_embedding(text) for text in recap_agenda_embeddings["Embedding_Vector"]])
shared_embeddings = np.stack([convert_embedding(text) for text in shared_docs_embeddings["Embedding_Vector"]])
transcript_embeddings = np.stack([convert_embedding(text) for text in transcript_embeddings["Embedding_Vector"]])

# Fit a single 2-component PCA on all rows so the three groups share the same axes
all_embeddings = np.vstack([recap_embeddings, shared_embeddings, transcript_embeddings])
pca = PCA(n_components=2)
pca_result = pca.fit_transform(all_embeddings)

# Rows were stacked recap -> shared -> transcript, so cut at the two group boundaries
n_recap = len(recap_embeddings)
n_shared = len(shared_embeddings)
recap_pca, shared_pca, transcript_pca = np.split(pca_result, [n_recap, n_recap + n_shared])

# (points, colour, legend label) for every group, in drawing order
plot_groups = (
    (recap_pca, 'blue', 'Recap Agenda'),
    (shared_pca, 'red', 'Shared Docs'),
    (transcript_pca, 'green', 'Transcript'),
)

# Scatter layers first ...
plt.figure(figsize=(12, 10))
for group_pca, group_color, group_label in plot_groups:
    plt.scatter(group_pca[:, 0], group_pca[:, 1], c=group_color, label=group_label, alpha=0.6)

# ... then tag every point with its row index inside its own group
for group_pca, group_color, _ in plot_groups:
    for i, (x, y) in enumerate(group_pca):
        plt.annotate(str(i), (x, y), xytext=(5, 5), textcoords='offset points', fontsize=8, alpha=0.7, color=group_color)

# Axis labels, title, legend and grid
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA Visualization of Embeddings with Array Index Labels')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

plt.show()

# Share of variance captured by the two plotted components
variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {variance_ratio}")
print(f"Total explained variance: {sum(variance_ratio):.4f}")

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# Load the embedding tables. The first variable keeps the `recap_` name used by the
# other cells but is read from base_agenda_embeddings.csv here.
recap_agenda_embeddings = pd.read_csv("example_outputs/gemini/base_agenda_embeddings.csv")
shared_docs_embeddings = pd.read_csv("example_outputs/gemini/shared_docs_embeddings.csv")
transcript_embeddings = pd.read_csv("example_outputs/gemini/transcript_embeddings.csv")

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # Parse the text as a literal list and convert it to a numpy array.
    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse each table's stringified vectors into one float matrix per source.
# The `recap_*` names hold whatever table was loaded into `recap_agenda_embeddings`;
# `transcript_embeddings` is rebound here from the DataFrame to its matrix.
recap_embeddings = np.stack([convert_embedding(text) for text in recap_agenda_embeddings["Embedding_Vector"]])
shared_embeddings = np.stack([convert_embedding(text) for text in shared_docs_embeddings["Embedding_Vector"]])
transcript_embeddings = np.stack([convert_embedding(text) for text in transcript_embeddings["Embedding_Vector"]])

# Fit a single 2-component PCA on all rows so the three groups share the same axes
all_embeddings = np.vstack([recap_embeddings, shared_embeddings, transcript_embeddings])
pca = PCA(n_components=2)
pca_result = pca.fit_transform(all_embeddings)

# Rows were stacked in the order above, so cut at the two group boundaries
n_recap = len(recap_embeddings)
n_shared = len(shared_embeddings)
recap_pca, shared_pca, transcript_pca = np.split(pca_result, [n_recap, n_recap + n_shared])

# One scatter layer per group
plt.figure(figsize=(10, 8))
for group_pca, group_color, group_label in (
    (recap_pca, 'blue', 'Base Agenda'),
    (shared_pca, 'red', 'Shared Docs'),
    (transcript_pca, 'green', 'Transcript'),
):
    plt.scatter(group_pca[:, 0], group_pca[:, 1], c=group_color, label=group_label, alpha=0.6)

# Axis labels, title, legend and grid
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA Visualization of Embeddings')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

plt.show()

# Share of variance captured by the two plotted components
variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {variance_ratio}")
print(f"Total explained variance: {sum(variance_ratio):.4f}")

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# Embedding tables to compare; this order is also the plotting/colour order
file_names = [
    "base_agenda_embeddings.csv",
    "cat_agenda_embeddings.csv",
    "only_cat_agenda_embeddings.csv",
    "rag_agenda_embeddings.csv",
    "rag_cat_agenda_embeddings.csv",
    "recap_agenda_embeddings.csv",
    "shared_docs_embeddings.csv",
    "template_agenda_embeddings.csv",
    "transcript_embeddings.csv"
]

# Read every table once, keyed by its file name
dataframes = {file: pd.read_csv(f"example_outputs/gemini/{file}") for file in file_names}

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse every table's stringified vectors into one float matrix per file
embeddings_dict = {
    file: np.stack([convert_embedding(text) for text in df["Embedding_Vector"]])
    for file, df in dataframes.items()
}

# Fit a single 2-component PCA on all rows so every file is projected onto the same axes
all_embeddings = np.vstack(list(embeddings_dict.values()))
pca = PCA(n_components=2)
pca_result = pca.fit_transform(all_embeddings)

# Rows were stacked in dict order, so cutting at the cumulative row counts
# gives each file its own block of projected points back
row_counts = [len(matrix) for matrix in embeddings_dict.values()]
boundaries = np.cumsum(row_counts)[:-1]
pca_results = dict(zip(embeddings_dict, np.split(pca_result, boundaries)))

# One scatter layer per file, each with its own colour
plt.figure(figsize=(12, 10))
colors = ['blue', 'red', 'green', 'purple', 'orange', 'cyan', 'magenta', 'yellow', 'black']
for color, (file, pca_data) in zip(colors, pca_results.items()):
    label = file.replace('.csv', '').replace('_embeddings', '')
    plt.scatter(pca_data[:, 0], pca_data[:, 1], c=color, label=label, alpha=0.6)

# Axis labels, title, legend (placed outside the axes) and grid
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA Visualization of All Embeddings')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.7)

# Make room for the outside legend before drawing
plt.tight_layout()
plt.show()

# Share of variance captured by the two plotted components
variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {variance_ratio}")
print(f"Total explained variance: {sum(variance_ratio):.4f}")

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

# Embedding tables to compare; this order is also the plotting/colour order
file_names = [
    "base_agenda_embeddings.csv",
    "cat_agenda_embeddings.csv",
    "only_cat_agenda_embeddings.csv",
    "rag_agenda_embeddings.csv",
    "rag_cat_agenda_embeddings.csv",
    "recap_agenda_embeddings.csv",
    "shared_docs_embeddings.csv",
    "template_agenda_embeddings.csv",
    "transcript_embeddings.csv"
]

# Read every table once, keyed by its file name
dataframes = {file: pd.read_csv(f"example_outputs/gemini/{file}") for file in file_names}

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse every table's stringified vectors into one float matrix per file
embeddings_dict = {}
for file, df in dataframes.items():
    embeddings_dict[file] = np.stack(df["Embedding_Vector"].apply(convert_embedding).values)

# Combine all embeddings for t-SNE so every file is embedded in the same 2-D map
all_embeddings = np.vstack(list(embeddings_dict.values()))

# Apply t-SNE. `max_iter` matches the other t-SNE cells in this notebook; the older
# `n_iter` spelling used here before is deprecated in recent scikit-learn releases.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=300)
tsne_result = tsne.fit_transform(all_embeddings)

# Split the results back into their respective groups (rows are in dict order)
tsne_results = {}
start_idx = 0
for file, embeddings in embeddings_dict.items():
    n_samples = len(embeddings)
    tsne_results[file] = tsne_result[start_idx:start_idx + n_samples]
    start_idx += n_samples

# Create scatter plot: one layer and one colour per file
plt.figure(figsize=(12, 10))
colors = ['blue', 'red', 'green', 'purple', 'orange', 'cyan', 'magenta', 'yellow', 'black']
for (file, tsne_data), color in zip(tsne_results.items(), colors):
    label = file.replace('.csv', '').replace('_embeddings', '')
    plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=color, label=label, alpha=0.6)

# Add labels and title
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE Visualization of All Embeddings')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Add grid
plt.grid(True, linestyle='--', alpha=0.7)

# Adjust layout to prevent legend overlap
plt.tight_layout()

# Show plot
plt.show()

# Print KL divergence (a measure of how well t-SNE preserves the structure)
print(f"KL Divergence: {tsne.kl_divergence_:.4f}")

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd

# Embedding tables to compare; this order is also the plotting/colour order
file_names = [
    "base_agenda_embeddings.csv",
    "cat_agenda_embeddings.csv",
    "only_cat_agenda_embeddings.csv",
    "rag_agenda_embeddings.csv",
    "rag_cat_agenda_embeddings.csv",
    "recap_agenda_embeddings.csv",
    "shared_docs_embeddings.csv",
    "template_agenda_embeddings.csv",
    "transcript_embeddings.csv"
]

# Load each table and keep only its type 'a' rows, keyed by file name
dataframes = {}
for file in file_names:
    df = pd.read_csv(f"example_outputs/gemini/{file}")
    # 'Type' is the single word character right before '.json' (NaN when the name does not match)
    df['Type'] = df['Item'].str.extract(r'(\w)\.json$')  # Extract the character before '.json'
    dataframes[file] = df[df['Type'] == 'a']

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse the stringified vectors of each filtered table into a matrix.
# A table with no rows left after filtering is stored as an empty array (size 0).
embeddings_dict = {}
for file, df in dataframes.items():
    if not df.empty:  # Only process if the filtered dataframe has data
        embeddings_dict[file] = np.stack(df["Embedding_Vector"].apply(convert_embedding).values)
    else:
        embeddings_dict[file] = np.array([])  # Empty array for empty dataframes

# Combine all embeddings for t-SNE (only include non-empty arrays)
all_embeddings_list = [emb for emb in embeddings_dict.values() if emb.size > 0]
if not all_embeddings_list:
    print("No embeddings found after filtering for type 'a'.")
else:
    all_embeddings = np.vstack(all_embeddings_list)

    # Apply t-SNE to the combined rows (fixed random_state for repeatable layouts)
    tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=300)
    tsne_result = tsne.fit_transform(all_embeddings)

    # Split the results back into their respective groups.
    # start_idx only advances for non-empty groups, matching the rows that were stacked above.
    tsne_results = {}
    start_idx = 0
    for file, embeddings in embeddings_dict.items():
        if embeddings.size > 0:  # Only include non-empty embeddings
            n_samples = len(embeddings)
            tsne_results[file] = tsne_result[start_idx:start_idx + n_samples]
            start_idx += n_samples
        else:
            tsne_results[file] = np.array([])

    # Create scatter plot; every file keeps its colour slot even when it has no points
    plt.figure(figsize=(12, 10))
    colors = ['blue', 'red', 'green', 'purple', 'orange', 'cyan', 'magenta', 'yellow', 'black']
    for (file, tsne_data), color in zip(tsne_results.items(), colors):
        if tsne_data.size > 0:  # Only plot if there are points to show
            label = file.replace('.csv', '').replace('_embeddings', '') + " (Type a)"
            plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=color, label=label, alpha=0.6)

    # Add labels and title
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.title('t-SNE Visualization of Type a Embeddings')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Add grid
    plt.grid(True, linestyle='--', alpha=0.7)

    # Adjust layout to prevent legend overlap
    plt.tight_layout()

    # Show plot
    plt.show()

    # Print KL divergence (a measure of how well t-SNE preserves the structure)
    print(f"KL Divergence: {tsne.kl_divergence_:.4f}")

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Embedding tables to compare, in processing order
file_names = [
    "base_agenda_embeddings.csv",
    "cat_agenda_embeddings.csv",
    "only_cat_agenda_embeddings.csv",
    "rag_agenda_embeddings.csv",
    "rag_cat_agenda_embeddings.csv",
    "recap_agenda_embeddings.csv",
    "shared_docs_embeddings.csv",
    "template_agenda_embeddings.csv",
    "transcript_embeddings.csv"
]

# Load each table and keep only its type 'a' rows, keyed by file name
dataframes = {}
for file in file_names:
    df = pd.read_csv(f"example_outputs/gemini/{file}")
    # 'Type' is the single word character right before '.json' (NaN when the name does not match)
    df['Type'] = df['Item'].str.extract(r'(\w)\.json$')
    dataframes[file] = df[df['Type'] == 'a']

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse the stringified vectors of each filtered table into a matrix.
# A table with no rows left after filtering is stored as an empty array (size 0).
embeddings_dict = {}
for file, df in dataframes.items():
    if not df.empty:
        embeddings_dict[file] = np.stack(df["Embedding_Vector"].apply(convert_embedding).values)
    else:
        embeddings_dict[file] = np.array([])

# Combine all embeddings for t-SNE (empty groups are left out)
all_embeddings_list = [emb for emb in embeddings_dict.values() if emb.size > 0]
if not all_embeddings_list:
    print("No embeddings found after filtering for type 'a'.")
else:
    all_embeddings = np.vstack(all_embeddings_list)

    # Apply t-SNE with 3 components
    tsne = TSNE(n_components=3, random_state=42, perplexity=30, max_iter=300)
    tsne_result = tsne.fit_transform(all_embeddings)

    # Split the results back into their respective groups.
    # `labels` and `points` grow in lockstep: one label per projected row.
    # `tsne_results` is filled here but not read again within this cell.
    tsne_results = {}
    start_idx = 0
    labels = []
    points = []

    for file, embeddings in embeddings_dict.items():
        if embeddings.size > 0:
            n_samples = len(embeddings)
            tsne_results[file] = tsne_result[start_idx:start_idx + n_samples]
            labels.extend([file.replace('.csv', '').replace('_embeddings', '')] * n_samples)
            points.extend(tsne_result[start_idx:start_idx + n_samples])
            start_idx += n_samples

    # Convert to dataframe for Plotly: one row per point with its x/y/z coordinates and group label
    tsne_df = pd.DataFrame(points, columns=["x", "y", "z"])
    tsne_df["label"] = labels

    # Create 3D scatter plot, coloured by group label
    fig = px.scatter_3d(tsne_df, x="x", y="y", z="z", color="label", title="3D t-SNE Visualization of Type a Embeddings")
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=40))

    # Show interactive plot
    fig.show()

    # Print KL divergence
    print(f"KL Divergence: {tsne.kl_divergence_:.4f}")


In [None]:
%pip show | grep "nbformat"

In [None]:
# Installs nbformat into the kernel's environment (no version is pinned).
# NOTE(review): presumably needed so the Plotly `fig.show()` cells can render — confirm.
%pip install nbformat

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Embedding tables to compare, in processing order
file_names = [
    "base_agenda_embeddings.csv",
    "cat_agenda_embeddings.csv",
    "only_cat_agenda_embeddings.csv",
    "rag_agenda_embeddings.csv",
    "rag_cat_agenda_embeddings.csv",
    "recap_agenda_embeddings.csv",
    "shared_docs_embeddings.csv",
    "template_agenda_embeddings.csv",
    "transcript_embeddings.csv"
]

# Load each table and keep only its type 'a' rows, keyed by file name
dataframes = {}
for file in file_names:
    df = pd.read_csv(f"example_outputs/gemini/{file}")
    # 'Type' is the single word character right before '.json' (NaN when the name does not match)
    df['Type'] = df['Item'].str.extract(r'(\w)\.json$')
    dataframes[file] = df[df['Type'] == 'a']

# Convert string representations of embeddings to numpy arrays
def convert_embedding(embedding_str):
    """Parse a stringified embedding such as ``"[0.1, -0.2]"`` into a float array.

    Args:
        embedding_str: Text of a Python list literal of numbers.

    Returns:
        ``numpy.ndarray`` with dtype float.

    Raises:
        ValueError, SyntaxError: If the text is not a plain literal.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # literal_eval only accepts literals, so CSV content cannot execute code as it could with eval().
    return np.array(ast.literal_eval(embedding_str), dtype=float)

# Parse the stringified vectors of each filtered table into a matrix.
# A table with no rows left after filtering is stored as an empty array (size 0).
embeddings_dict = {}
for file, df in dataframes.items():
    if not df.empty:
        embeddings_dict[file] = np.stack(df["Embedding_Vector"].apply(convert_embedding).values)
    else:
        embeddings_dict[file] = np.array([])

# Combine all embeddings for t-SNE (empty groups are left out)
all_embeddings_list = [emb for emb in embeddings_dict.values() if emb.size > 0]
if not all_embeddings_list:
    print("No embeddings found after filtering for type 'a'.")
else:
    all_embeddings = np.vstack(all_embeddings_list)

    # Apply t-SNE with 3 components
    tsne = TSNE(n_components=3, random_state=42, perplexity=30, max_iter=300)
    tsne_result = tsne.fit_transform(all_embeddings)

    # Split the results back into their respective groups.
    # `labels` and `points` grow in lockstep: one label per projected row.
    # `tsne_results` is filled here but not read again within this cell.
    tsne_results = {}
    start_idx = 0
    labels = []
    points = []

    for file, embeddings in embeddings_dict.items():
        if embeddings.size > 0:
            n_samples = len(embeddings)
            tsne_results[file] = tsne_result[start_idx:start_idx + n_samples]
            labels.extend([file.replace('.csv', '').replace('_embeddings', '')] * n_samples)
            points.extend(tsne_result[start_idx:start_idx + n_samples])
            start_idx += n_samples

    # Convert to dataframe for Plotly: one row per point with its x/y/z coordinates and group label
    tsne_df = pd.DataFrame(points, columns=["x", "y", "z"])
    tsne_df["label"] = labels

    # Create 3D scatter plot, coloured by group label
    fig = px.scatter_3d(tsne_df, x="x", y="y", z="z", color="label", 
                         title="3D t-SNE Visualization of Type a Embeddings", 
                         opacity=0.9)

    # Marker appearance, axis titles and initial camera position
    fig.update_traces(marker=dict(size=5, opacity=0.9))  # Marker size 5 at 0.9 opacity
    fig.update_layout(
        scene=dict(
            xaxis_title='t-SNE Component 1',
            yaxis_title='t-SNE Component 2',
            zaxis_title='t-SNE Component 3',
            camera=dict(eye=dict(x=-1, y=-0.5, z=-1))  # Custom camera eye position; NOTE(review): an earlier comment called this a 5x zoom, which this code does not establish
        ),
        margin=dict(l=0, r=0, b=0, t=40)
    )

    # Show interactive plot
    fig.show()

    # Print KL divergence
    print(f"KL Divergence: {tsne.kl_divergence_:.4f}")
