In [3]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from bertopic import BERTopic
from umap import UMAP
import plotly.express as px
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
from IPython.display import Markdown
import concurrent.futures

# Load environment variables
load_dotenv()


def get_embedding(text):
    """Generate embedding for a given text using OpenAI API."""
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-large"
    )
    return response.data[0].embedding

def safe_get_embedding(text):
    """Safely get embedding with error handling."""
    try:
        return get_embedding(text)
    except Exception as e:
        print(f"Error embedding document: {e}")
        return None

def get_embeddings_multithreaded(documents):
    """Get embeddings for a list of documents using multithreading."""
    with ThreadPoolExecutor(max_workers=25) as executor:
        embeddings = list(tqdm(executor.map(safe_get_embedding, documents), total=len(documents)))
    return [emb for emb in embeddings if emb is not None]

# Load dataset
df = pd.read_parquet('../extraction/outputs/dwarkesh_patel__leopold_aschenbrenner.parquet')
display(df.head())


Unnamed: 0,hypothesis_id,belief_id,chunk_id,chunk,meta_chunk,belief,type,context,justification,confidence,verify,verify_explanation,verification_focus,hypothesis,explanation,potential_sources
0,0,1,1,Dwarkesh Patel\nIn the Sholto and Trenton epis...,Dwarkesh Patel\nToday I’m chatting with my fri...,AI development has transitioned from a softwar...,positive,Leopold Aschenbrenner describes the evolution ...,The development of AI models now requires the ...,high,True,"While the belief reflects a current trend, the...",Investigate the historical data on AI training...,The rate of growth in AI training compute has ...,This hypothesis directly tests Aschenbrenner's...,[Published research papers on AI training comp...
1,1,1,1,Dwarkesh Patel\nIn the Sholto and Trenton epis...,Dwarkesh Patel\nToday I’m chatting with my fri...,AI development has transitioned from a softwar...,positive,Leopold Aschenbrenner describes the evolution ...,The development of AI models now requires the ...,high,True,"While the belief reflects a current trend, the...",Investigate the historical data on AI training...,The cost of building and operating large-scale...,If the cost of building and operating AI clust...,[Financial reports from companies investing in...
2,2,1,1,Dwarkesh Patel\nIn the Sholto and Trenton epis...,Dwarkesh Patel\nToday I’m chatting with my fri...,AI development has transitioned from a softwar...,positive,Leopold Aschenbrenner describes the evolution ...,The development of AI models now requires the ...,high,True,"While the belief reflects a current trend, the...",Investigate the historical data on AI training...,The availability of renewable energy resources...,Aschenbrenner's prediction of a trillion-dolla...,[Reports on renewable energy deployment and ca...
3,3,1,1,Dwarkesh Patel\nIn the Sholto and Trenton epis...,Dwarkesh Patel\nToday I’m chatting with my fri...,AI development has transitioned from a softwar...,positive,Leopold Aschenbrenner describes the evolution ...,The development of AI models now requires the ...,high,True,"While the belief reflects a current trend, the...",Investigate the historical data on AI training...,The development of more efficient AI algorithm...,If advancements in AI technology lead to more ...,[Research papers on AI algorithm efficiency im...
4,4,2,1,Dwarkesh Patel\nIn the Sholto and Trenton epis...,Dwarkesh Patel\nToday I’m chatting with my fri...,The computing power required for training larg...,positive,Leopold Aschenbrenner discusses the trend of i...,He mentions the doubling of training compute e...,high,True,This belief presents a significant claim about...,Examine the validity of the projected energy c...,If the rate of growth of AI training compute p...,The belief is based on the assumption that the...,"[Reports on AI hardware advancements, Data on ..."


In [4]:

# Extract unique beliefs and hypotheses
unique_beliefs_df = df[['belief', 'type', 'confidence']].drop_duplicates().reset_index(drop=True)
unique_hypotheses_df = df[['hypothesis', 'explanation', 'potential_sources']].explode('potential_sources').drop_duplicates().reset_index(drop=True)

# Sample documents for BERTopic analysis
documents = unique_hypotheses_df['potential_sources'].tolist()
documents_sample = pd.Series(documents).tolist()

# Embed the documents using multithreading
# embeddings = get_embeddings_multithreaded(documents_sample)
embeddings = pd.read_parquet('outputs/embeddings.parquet')


In [5]:
# Convert embeddings to numpy array
embeddings_np = np.array(embeddings)

# Define the expanded grid for UMAP parameters
umap_n_neighbors_range = [50, 200]
umap_n_components_range = [2, 4]

best_config = None
min_outliers = float('inf')
results = []


In [6]:

# Function to count outliers based on topic size
def count_outliers(topics):
    return sum(1 for t in topics if t == -1)

# Function to run the analysis with different parameters
def run_topic_modeling(documents_sample, embeddings_np, config):
    # Reduce dimensionality of the embeddings using UMAP for clustering
    umap_embeddings = UMAP(n_neighbors=config['UMAP_N_NEIGHBORS_CLUSTER'], 
                           n_components=config['UMAP_N_COMPONENTS_CLUSTER'], 
                           metric=config['UMAP_METRIC_CLUSTER']).fit_transform(embeddings_np)

    # Initialize BERTopic model
    topic_model = BERTopic(nr_topics=config['BERTOPIC_NR_TOPICS'], 
                           calculate_probabilities=config['BERTOPIC_CALCULATE_PROBABILITIES'],
                           min_topic_size=config['BERTOPIC_MIN_TOPIC_SIZE'],
                           verbose=config['BERTOPIC_VERBOSE'])

    # Fit the model on the reduced embeddings and get topics
    topics, probabilities = topic_model.fit_transform(documents_sample, umap_embeddings)

    # Count outliers
    num_outliers = count_outliers(topics)
    return topics, probabilities, umap_embeddings, num_outliers, topic_model, config


In [7]:
# Perform grid search with progress bar
total_iterations = len(umap_n_neighbors_range) * len(umap_n_components_range)
with tqdm(total=total_iterations, desc="Grid Search") as pbar:
    for n_neighbors in umap_n_neighbors_range:
        for n_components in umap_n_components_range:
            config = {
                'UMAP_N_NEIGHBORS_CLUSTER': n_neighbors,
                'UMAP_N_COMPONENTS_CLUSTER': n_components,
                'UMAP_METRIC_CLUSTER': 'cosine',
                'UMAP_N_NEIGHBORS_VIS': 50,
                'UMAP_N_COMPONENTS_VIS': 2,
                'UMAP_METRIC_VIS': 'cosine',
                'BERTOPIC_NR_TOPICS': 4,
                'BERTOPIC_MIN_TOPIC_SIZE': 20,
                'BERTOPIC_CALCULATE_PROBABILITIES': True,
                'BERTOPIC_VERBOSE': True
            }

            topics, probabilities, umap_embeddings, num_outliers, topic_model, config = run_topic_modeling(documents_sample, embeddings_np, config)
            results.append((config['UMAP_N_NEIGHBORS_CLUSTER'], config['UMAP_N_COMPONENTS_CLUSTER'], num_outliers))

            if num_outliers < min_outliers:
                min_outliers = num_outliers
                best_config = config
                best_topics = topics
                best_probabilities = probabilities
                best_umap_embeddings = umap_embeddings
                best_topic_model = topic_model

            pbar.update(1)

print(f"Best configuration: {best_config}")
print(f"Minimum outliers: {min_outliers}")

Grid Search:   0%|          | 0/4 [00:00<?, ?it/s]

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-06-10 02:36:56,117 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-10 02:36:57,766 - BERTopic - Dimensionality - Completed ✓
2024-06-10 02:36:57,767 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-10 02:36:57,803 - BERTopic - Cluster - Completed ✓
2024-06-10 02:36:57,804 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-10 02:36:57,822 - BERTopic - Representation - Completed ✓
2024-06-10 02:36:57,823 - BERTopic - Topic reduction - Reducing number of topics
2024-06-10 02:36:57,844 - BERTopic - Topic reduction - Reduced number of topics from 16 to 4
2024-06-10 02:37:00,884 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-10 02:37:02,336 - BERTopic - Dimensionality - Completed ✓
2024-06-10 02:37:02,337 - BERTopic - Cluster - Start clustering the 

Best configuration: {'UMAP_N_NEIGHBORS_CLUSTER': 50, 'UMAP_N_COMPONENTS_CLUSTER': 2, 'UMAP_METRIC_CLUSTER': 'cosine', 'UMAP_N_NEIGHBORS_VIS': 50, 'UMAP_N_COMPONENTS_VIS': 2, 'UMAP_METRIC_VIS': 'cosine', 'BERTOPIC_NR_TOPICS': 4, 'BERTOPIC_MIN_TOPIC_SIZE': 20, 'BERTOPIC_CALCULATE_PROBABILITIES': True, 'BERTOPIC_VERBOSE': True}
Minimum outliers: 0


In [8]:

# Further reduce dimensionality for visualization and plot all configurations
for n_neighbors, n_components, _ in results:
    config = {
        'UMAP_N_NEIGHBORS_CLUSTER': n_neighbors,
        'UMAP_N_COMPONENTS_CLUSTER': n_components,
        'UMAP_METRIC_CLUSTER': 'cosine',
        'UMAP_N_NEIGHBORS_VIS': 50,
        'UMAP_N_COMPONENTS_VIS': 2,
        'UMAP_METRIC_VIS': 'cosine',
        'BERTOPIC_NR_TOPICS': 4,
        'BERTOPIC_MIN_TOPIC_SIZE': 20,
        'BERTOPIC_CALCULATE_PROBABILITIES': True,
        'BERTOPIC_VERBOSE': True
    }
    
    umap_embeddings_2d = UMAP(n_neighbors=config['UMAP_N_NEIGHBORS_VIS'], 
                              n_components=config['UMAP_N_COMPONENTS_VIS'], 
                              metric=config['UMAP_METRIC_VIS']).fit_transform(best_umap_embeddings)

    # Create a DataFrame for visualization
    visualization_df = pd.DataFrame({
        'UMAP1': umap_embeddings_2d[:, 0],
        'UMAP2': umap_embeddings_2d[:, 1],
        'Topic': best_topics,
        'Document': documents_sample
    })

    # Plot the clusters using Plotly
    fig = px.scatter(visualization_df, x='UMAP1', y='UMAP2', color='Topic',
                     title=f'UMAP Visualization of BERTopic Clusters (n_neighbors={n_neighbors}, n_components={n_components})',
                     labels={'UMAP1': 'UMAP Dimension 1', 'UMAP2': 'UMAP Dimension 2'},
                     hover_data=['Topic', 'Document'])

    fig.show()


In [9]:

# Display topics in markdown format
topic_info = best_topic_model.get_topic_info()
topic_md = ""
for idx, topic in topic_info.iterrows():
    topic_md += f"### Topic {topic['Topic']}\n"
    topic_md += f"**Count:** {topic['Count']}\n"
    topic_md += f"**Name:** {topic['Name']}\n"
    topic_md += f"**Representation:** {', '.join(topic['Representation'])}\n"
    topic_md += f"**Representative Docs:** {', '.join(topic['Representative_Docs'])}\n\n"

display(Markdown(topic_md))


### Topic 0
**Count:** 469
**Name:** 0_ai_on_and_research
**Representation:** ai, on, and, research, of, reports, in, papers, the, industry
**Representative Docs:** Research papers on chain-of-thought reasoning in AI, Analysis of trends in AI research and development, Research papers and industry reports on the development and progress of RL and self-play techniques.

### Topic 1
**Count:** 200
**Name:** 1_expert_opinions_the_on
**Representation:** expert, opinions, the, on, of, ai, and, future, researchers, impact
**Representative Docs:** Expert opinions on the future of AI development, Expert opinions on the future of AI development, Expert opinions on the future of AI research

### Topic 2
**Count:** 110
**Name:** 2_of_case_studies_energy
**Representation:** of, case, studies, energy, ai, data, in, on, tasks, models
**Representative Docs:** Studies on the impact of renewable energy on the AI industry, Case studies on the integration of AI models in various sectors, Analysis of case studies on successful AI implementations in various industries

### Topic 3
**Count:** 106
**Name:** 3_and_on_ai_reports
**Representation:** and, on, ai, reports, publications, national, research, government, security, in
**Representative Docs:** Peer-reviewed research papers on robotics and AI, Publications on AI research and development, News articles and research publications on AI in national security



In [10]:

# Extract values for plotting
neighbor_values = [result[0] for result in results]
component_values = [result[1] for result in results]
outlier_counts = [result[2] for result in results]

# Create a DataFrame for plotting
plot_df = pd.DataFrame({
    'UMAP_n_neighbors': neighbor_values,
    'UMAP_n_components': component_values,
    'Outliers': outlier_counts
})


In [11]:

# Plotting the number of outliers for different parameter combinations using Plotly Express
fig = px.scatter(
    plot_df,
    x='UMAP_n_neighbors',
    y='UMAP_n_components',
    color='Outliers',
    size='Outliers',
    color_continuous_scale='Viridis',
    title='Outliers for Different UMAP Parameter Combinations',
    labels={'UMAP_n_neighbors': 'UMAP n_neighbors', 'UMAP_n_components': 'UMAP n_components', 'Outliers': 'Number of Outliers'},
    template='plotly_white'
)

fig.update_layout(coloraxis_colorbar=dict(title='Number of Outliers'))
fig.show()
