# In [None]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from bertopic import BERTopic
from umap import UMAP
import plotly.express as px
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
from IPython.display import Markdown
import concurrent.futures

# Load environment variables (e.g. the OpenAI API key) from a local .env file
load_dotenv()

# Shared OpenAI client referenced by get_embedding(). Constructed without
# arguments, the client takes its API key from the OPENAI_API_KEY environment
# variable, which is why it must be created after load_dotenv().
client = OpenAI()



def get_embedding(text, model="text-embedding-3-large"):
    """Generate an embedding for a given text using the OpenAI API.

    Args:
        text: Value passed as ``input`` to the embeddings endpoint.
        model: Name of the embedding model to request. Defaults to
            "text-embedding-3-large", the model previously hard-coded here.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.

    Note:
        Uses the module-level ``client`` object. Any exception raised by the
        request propagates to the caller.
    """
    response = client.embeddings.create(
        input=text,
        model=model,
    )
    return response.data[0].embedding

def safe_get_embedding(text):
    """Return the embedding for ``text``, or ``None`` if embedding it fails.

    Any exception raised by ``get_embedding`` is caught and reported with
    ``print`` instead of being propagated, so one bad document does not abort
    a batch.
    """
    try:
        vector = get_embedding(text)
    except Exception as e:
        # Best-effort by design: report the failure and signal it with None.
        print(f"Error embedding document: {e}")
        vector = None
    return vector

def get_embeddings_multithreaded(documents, max_workers=25):
    """Get embeddings for a list of documents using a thread pool.

    Args:
        documents: Sequence of documents; each one is passed to
            ``safe_get_embedding``. Must support ``len()``, which is used as
            the progress-bar total.
        max_workers: Maximum number of worker threads. Defaults to 25, the
            value previously hard-coded here.

    Returns:
        A list of the embeddings that were obtained, in input order.
        Documents for which ``safe_get_embedding`` returned ``None`` are
        omitted, so the result can be shorter than ``documents`` and its
        positions may not line up with the input. A warning with the number
        of skipped documents is printed when that happens.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order; tqdm only wraps the
        # iterator to show progress.
        results = list(tqdm(executor.map(safe_get_embedding, documents), total=len(documents)))

    embeddings = [emb for emb in results if emb is not None]

    skipped = len(results) - len(embeddings)
    if skipped:
        # Surface the otherwise silent length mismatch with the input list.
        print(f"Warning: {skipped} of {len(results)} documents could not be embedded and were skipped.")
    return embeddings

# Read the extraction output into a DataFrame and preview the first rows
df = pd.read_parquet('../extraction/outputs/dwarkesh_patel__leopold_aschenbrenner.parquet')
display(df.head())


# De-duplicated (belief, type, confidence) rows
belief_columns = ['belief', 'type', 'confidence']
unique_beliefs_df = df[belief_columns].drop_duplicates().reset_index(drop=True)

# Expand each potential_sources entry into its own row, then de-duplicate the
# resulting (hypothesis, explanation, potential_sources) rows
hypothesis_columns = ['hypothesis', 'explanation', 'potential_sources']
unique_hypotheses_df = (
    df[hypothesis_columns]
    .explode('potential_sources')
    .drop_duplicates()
    .reset_index(drop=True)
)

# Documents for BERTopic analysis: the exploded potential_sources values.
# explode() produces NaN for rows whose source list was empty or missing;
# drop those so only real entries are sent to the embedding API.
documents = unique_hypotheses_df['potential_sources'].dropna().tolist()
# No sub-sampling is applied: documents_sample is a copy of the full list.
documents_sample = list(documents)

# Embed the documents using multithreading
embeddings = get_embeddings_multithreaded(documents_sample)
# Alternative: load previously saved embeddings instead of calling the API
# embeddings = pd.read_parquet('outputs/embeddings.parquet')